#!/bin/bash
#
# Script to create test dataset 5: Deep nesting
# Duplicate-finder testing methodology v3.4
#
# Creates 1000 files with nesting depth up to 15 levels
# Includes hidden folders at different levels
#
# Expected result: 100 groups of 10 files = 1000 files
#

# Abort on the first command that fails.
set -e

# ============================================================================
# SETTINGS
# ============================================================================

# Dataset root; the creation log lives in its "logs" subfolder and carries a
# per-run timestamp in its file name.
BASE_DIR="${HOME}/DuplicateTest/Dataset_05_DeepNesting"
LOG_DIR="${BASE_DIR}/logs"
LOG_FILE="${LOG_DIR}/dataset_creation_$(date +%Y%m%d_%H%M%S).log"

# Generation parameters
UNIQUE_FILES=100      # distinct source files (one duplicate group each)
COPIES_PER_FILE=10    # group size: the original plus its copies
MAX_DEPTH=15          # deepest folder level a copy may be placed at
FILE_SIZE_KB=50       # size of every generated file, in KB

# File extensions randomly assigned to the originals
EXTENSIONS=(txt pdf jpg png docx mp4 zip json xml html)

# ============================================================================
# FUNCTIONS
# ============================================================================

# Prints a timestamped line to stdout and appends the same line to LOG_FILE.
#   $1 - message text
log() {
    printf '[%s] %s\n' "$(date "+%Y-%m-%d %H:%M:%S")" "$1" | tee -a "$LOG_FILE"
}

# Prints one randomly chosen entry of the EXTENSIONS array.
get_random_extension() {
    local -r count=${#EXTENSIONS[@]}
    echo "${EXTENSIONS[RANDOM % count]}"
}

# Builds a random directory path with the requested number of levels.
#
# Arguments:
#   $1 - depth: number of path components to generate (0 yields an empty path)
#   $2 - include_hidden: "true" to use a dot-prefixed folder at levels 3, 7
#        and 12; any other value uses regular folders at every level
# Outputs:
#   Writes the path to stdout. Every component is preceded by "/", e.g.
#   "/docs_level1/data_level2/.cache_3", so it can be appended to a base dir.
create_nested_path() {
    local depth="$1"
    local include_hidden="$2"
    local path=""
    # Loop state is local so that calling the function directly (outside a
    # command substitution) cannot overwrite caller variables with these names.
    local d idx folder

    local folders=("docs" "files" "data" "archive" "backup" "temp" "cache" "old" "new" "misc")
    local hidden_folders=(".hidden" ".cache" ".temp" ".backup" ".archive")

    for (( d=1; d<=depth; d++ )); do
        # Hidden folders are only placed at levels 3, 7 and 12
        if [[ "$include_hidden" == "true" ]] && (( d == 3 || d == 7 || d == 12 )); then
            idx=$(( RANDOM % ${#hidden_folders[@]} ))
            folder="${hidden_folders[$idx]}_$d"
        else
            idx=$(( RANDOM % ${#folders[@]} ))
            folder="${folders[$idx]}_level$d"
        fi
        path="$path/$folder"
    done

    echo "$path"
}

# ============================================================================
# MAIN SCRIPT
# ============================================================================

echo "============================================================"
echo "  Creating test dataset 5: Deep nesting"
echo "  Methodology v3.4"
echo "============================================================"
echo ""

# Refuse to build on top of leftovers from an earlier run. Originals get a
# random extension each time, so a re-run would leave e.g. nested_0001.txt
# next to a new nested_0001.pdf, and the copy stage (which processes every
# file in originals/) would then produce more groups than expected.
# The entry count is taken in a subshell so the glob options do not leak.
existing_entries=$(shopt -s nullglob dotglob; set -- "$BASE_DIR"/*; echo "$#")
if (( existing_entries > 0 )); then
    echo "ERROR: $BASE_DIR already contains data from a previous run." >&2
    echo "Delete the folder and run the script again." >&2
    exit 1
fi

# Create directories
mkdir -p "$BASE_DIR/originals"
mkdir -p "$LOG_DIR"

log "Starting creation of test dataset 5"
log "Unique files: $UNIQUE_FILES"
log "Copies per file: $COPIES_PER_FILE"
log "Maximum depth: $MAX_DEPTH"

# Counters (updated by the creation stages below)
total_files=0
total_groups=0

# ============================================================================
# CREATE ORIGINALS
# ============================================================================

# Stage 1: write UNIQUE_FILES files of FILE_SIZE_KB random kilobytes each into
# "$BASE_DIR/originals", named nested_NNNN.<random extension>.
echo ""
log "Stage 1/2: Creating $UNIQUE_FILES unique files..."

for (( i = 1; i <= UNIQUE_FILES; i++ )); do
    ext=$(get_random_extension)
    printf -v filename "nested_%04d.%s" "$i" "$ext"
    filepath="$BASE_DIR/originals/$filename"

    # Create file. dd's stderr (transfer statistics) is discarded, so report
    # a failure explicitly instead of letting set -e abort without a message.
    if ! dd if=/dev/urandom of="$filepath" bs=1024 count="$FILE_SIZE_KB" 2>/dev/null; then
        log "ERROR: failed to create $filepath"
        exit 1
    fi

    # Plain assignment instead of ((total_files++)): the post-increment
    # evaluates to 0 on the first pass, which makes (( )) return status 1 and
    # aborts the script under set -e (bash 4.1 and newer).
    total_files=$((total_files + 1))

    if (( i % 20 == 0 )); then
        echo "  Originals: $i / $UNIQUE_FILES..."
    fi
done

log "Originals created: $UNIQUE_FILES"

# ============================================================================
# CREATE COPIES AT DIFFERENT NESTING DEPTHS
# ============================================================================

# Stage 2: for every file in "$BASE_DIR/originals", place COPIES_PER_FILE - 1
# copies (same file name) in randomly generated folder chains under BASE_DIR.
echo ""
log "Stage 2/2: Creating copies at different nesting depths..."

file_num=0
copies_created=0
for original in "$BASE_DIR/originals"/*; do
    # Skip anything that is not a regular file (this also covers the
    # unexpanded glob pattern when the folder is empty).
    [[ -f "$original" ]] || continue

    filename=${original##*/}
    # Plain assignments instead of ((var++)): a post-increment from 0
    # evaluates to 0, so (( )) returns status 1 and set -e would abort the
    # script (bash 4.1 and newer).
    file_num=$((file_num + 1))
    total_groups=$((total_groups + 1))

    if (( file_num % 20 == 0 )); then
        echo "  Copying file $file_num / $UNIQUE_FILES..."
    fi

    # Create copies at different depths
    for (( copy_num = 1; copy_num < COPIES_PER_FILE; copy_num++ )); do
        # Every third copy goes through hidden folders
        include_hidden="false"
        if (( copy_num % 3 == 0 )); then
            include_hidden="true"
        fi

        # Draw a random depth (1..MAX_DEPTH) and path. Shallow paths have few
        # possible names, so two copies of the same file can land in the same
        # folder; redraw in that case, otherwise cp would overwrite the
        # earlier copy while the counters still count both.
        while :; do
            depth=$(( (RANDOM % MAX_DEPTH) + 1 ))
            nested_path=$(create_nested_path "$depth" "$include_hidden")
            full_path="$BASE_DIR$nested_path"
            [[ -e "$full_path/$filename" ]] || break
        done

        # Create directory and copy file
        mkdir -p "$full_path"
        cp "$original" "$full_path/$filename"
        total_files=$((total_files + 1))
        copies_created=$((copies_created + 1))
    done
done

log "Copies created: $copies_created"

# ============================================================================
# FINAL REPORT
# ============================================================================

# Final stage: gather statistics from the created tree, log and print them,
# and write a human-readable summary to "$BASE_DIR/dataset_info.txt".
echo ""
echo "============================================================"
echo "  FINAL REPORT"
echo "============================================================"

# Calculate actual maximum depth, measured relative to BASE_DIR: find prints
# paths that start with BASE_DIR, so the number of "/"-separated fields in
# BASE_DIR itself is subtracted. Counting fields of the full path alone would
# inflate the result by the depth of BASE_DIR within the file system.
base_slashes=${BASE_DIR//[^\/]/}
base_fields=$(( ${#base_slashes} + 1 ))
max_actual_depth=$(find "$BASE_DIR" -type d \
    | awk -F'/' -v base_fields="$base_fields" \
        '{ depth = NF - base_fields; if (depth > max) max = depth } END { print max + 0 }')
# Directories whose name starts with a dot; tr strips the padding some wc
# implementations put around the count.
hidden_dirs=$(find "$BASE_DIR" -type d -name ".*" | wc -l | tr -d ' ')

log ""
log "============================================================"
log "CREATION COMPLETE"
log "============================================================"
log ""
log "Total files created: $total_files"
log "Duplicate groups: $total_groups"
log "Maximum depth reached: ~$max_actual_depth levels"
log "Hidden directories: $hidden_dirs"
log ""
log "EXPECTED RESULT:"
log "  Duplicate groups: $total_groups"
log "  Files per group: ~$COPIES_PER_FILE"
log "  Total files: $total_files"
log ""

echo ""
echo "Statistics:"
echo "  Total files: $total_files"
echo "  Duplicate groups: $total_groups"
echo "  Hidden directories: $hidden_dirs"
echo "  Maximum depth: ~$max_actual_depth"
echo ""

# Create metadata file (unquoted EOF: variables and $(date) are expanded)
cat > "$BASE_DIR/dataset_info.txt" << EOF
Test dataset 5: Deep nesting
============================
Creation date: $(date "+%Y-%m-%d %H:%M:%S")
Methodology: v3.4

Parameters:
- Unique files: $UNIQUE_FILES
- Copies per file: $COPIES_PER_FILE
- File size: ~${FILE_SIZE_KB} KB
- Maximum depth: up to $MAX_DEPTH levels
- Formats: ${EXTENSIONS[*]}

Statistics:
- Total files: $total_files
- Hidden directories: $hidden_dirs

Expected result:
- Duplicate groups: $total_groups
- Files per group: ~$COPIES_PER_FILE

Test goal:
- Verify handling of deep nesting (15+ levels)
- Verify detection of files inside hidden folders
- Verify path correctness in results

For scanning: $BASE_DIR
After the test is finished, delete the folder: "$BASE_DIR"
EOF

log "Metadata file created: $BASE_DIR/dataset_info.txt"

echo ""
echo "For scanning, select the folder:"
echo "  $BASE_DIR"
echo ""
echo "Done!"
