#!/bin/bash
#
# Script to create test dataset 3: Photos
# Duplicate-finder testing methodology v3.4
#
# Creates 5000 unique "photo" files, each with one duplicate (10000 files on disk):
#   - Folder A: 2500 files + their APFS clones (Cmd+D equivalent)
#   - Folder B: 2500 files + copies in subfolder C
#
# Expected result: 5000 groups of 2 files
#

set -e

# ============================================================================
# SETTINGS
# ============================================================================

# Root of the generated dataset; the log directory is a subfolder of it.
BASE_DIR="${HOME}/DuplicateTest/Dataset_03_Photos"
LOG_DIR="${BASE_DIR}/logs"
# The log file name carries the date and time at which the script was started.
LOG_FILE="${LOG_DIR}/dataset_creation_$(date '+%Y%m%d_%H%M%S').log"

# How many original files are generated in each of the two folders
FILES_PER_FOLDER=2500

# Size of every generated file, in KB (imitates a JPEG)
FILE_SIZE_KB=500

# ============================================================================
# FUNCTIONS
# ============================================================================

# Print a timestamped message to stdout and append the same line to the log.
# Globals:   LOG_FILE (read) - file the line is appended to
# Arguments: $1 - message text
# Outputs:   "[YYYY-MM-DD HH:MM:SS] message" on stdout and in LOG_FILE
# Returns:   status of tee (non-zero if LOG_FILE cannot be written)
log() {
    local message="$1"
    # Declared separately from the assignment so that a failing `date`
    # is not hidden behind the exit status of `local`.
    local timestamp
    timestamp=$(date "+%Y-%m-%d %H:%M:%S")
    # printf '%s' prints the message verbatim, whatever characters it contains.
    printf '[%s] %s\n' "$timestamp" "$message" | tee -a "$LOG_FILE"
}

# Create a file filled with pseudo-random bytes.
# Arguments: $1 - path of the file to create
#            $2 - size in KB (number of 1024-byte blocks read from /dev/urandom)
# Outputs:   nothing on success; on failure, the captured dd diagnostics on stderr
# Returns:   0 on success, otherwise the exit status of dd
create_photo_file() {
    local filepath="$1"
    local size_kb="$2"
    local dd_output
    local dd_status=0

    # dd prints its transfer statistics on stderr even when it succeeds, so the
    # stream is captured rather than shown; it is replayed only if dd fails,
    # otherwise the caller would abort under `set -e` without any explanation.
    dd_output=$(dd if=/dev/urandom of="$filepath" bs=1024 count="$size_kb" 2>&1) || dd_status=$?

    if (( dd_status != 0 )); then
        printf 'create_photo_file: cannot create %s (dd exit status %d): %s\n' \
            "$filepath" "$dd_status" "$dd_output" >&2
        return "$dd_status"
    fi
}

# ============================================================================
# MAIN SCRIPT
# ============================================================================

# Banner
printf '%s\n' \
    "============================================================" \
    "  Creating test dataset 3: Photos" \
    "  Methodology v3.4" \
    "============================================================" \
    ""

# The dataset folders and the log folder must exist before the first log call
for dir in "$BASE_DIR/A" "$BASE_DIR/B/C" "$LOG_DIR"; do
    mkdir -p "$dir"
done

log "Starting creation of test dataset 3"
log "Base directory: $BASE_DIR"
log "Files in folder A: $FILES_PER_FOLDER (+ clones)"
log "Files in folder B: $FILES_PER_FOLDER (+ copies in C)"

# Running totals: files written so far and duplicate groups they form
total_files=0
total_groups=0

# ============================================================================
# FOLDER A: Originals + APFS clones
# ============================================================================

echo ""
log "Creating files in folder A (with APFS clones)..."
echo ""

# How many duplicates had to be made with a plain `cp` because `cp -c` failed
clone_fallbacks=0

# C-style loop instead of `seq`: no external process, no word splitting, and
# no iterations at all when FILES_PER_FOLDER is 0.
for ((i = 1; i <= FILES_PER_FOLDER; i++)); do
    printf -v filename 'photo_A_%05d.jpg' "$i"
    original="$BASE_DIR/A/$filename"

    # Show progress every 100 files
    if (( i % 100 == 0 )); then
        echo "  Folder A: $i / $FILES_PER_FOLDER..."
    fi

    # Create original.
    # Counters use var=$((var + 1)): `((var++))` returns status 1 when the old
    # value is 0, which would abort the script under `set -e`.
    create_photo_file "$original" "$FILE_SIZE_KB"
    total_files=$((total_files + 1))

    # Create the duplicate next to the original, named like a Finder duplicate.
    # `cp -c` asks for a clone; if it fails, fall back to a regular copy and
    # count the fallback so it can be reported after the loop.
    clone="${original%.jpg} copy.jpg"
    if ! cp -c "$original" "$clone" 2>/dev/null; then
        cp "$original" "$clone"
        clone_fallbacks=$((clone_fallbacks + 1))
    fi
    total_files=$((total_files + 1))
    total_groups=$((total_groups + 1))
done

log "Folder A: created $FILES_PER_FOLDER originals + $FILES_PER_FOLDER clones"

# Report fallbacks: regular copies occupy their own space, unlike clones.
if (( clone_fallbacks > 0 )); then
    log "WARNING: $clone_fallbacks of $FILES_PER_FOLDER duplicates in folder A are regular copies (cp -c failed; is the volume APFS?)"
fi

# ============================================================================
# FOLDER B: Originals + full copies in subfolder C
# ============================================================================

echo ""
log "Creating files in folder B (with copies in C)..."
echo ""

# C-style loop instead of `seq`: no external process, no word splitting, and
# no iterations at all when FILES_PER_FOLDER is 0.
for ((i = 1; i <= FILES_PER_FOLDER; i++)); do
    printf -v filename 'photo_B_%05d.jpg' "$i"
    original="$BASE_DIR/B/$filename"

    # Show progress every 100 files
    if (( i % 100 == 0 )); then
        echo "  Folder B: $i / $FILES_PER_FOLDER..."
    fi

    # Create original.
    # Counters use var=$((var + 1)): `((var++))` returns status 1 when the old
    # value is 0, which would abort the script under `set -e`.
    create_photo_file "$original" "$FILE_SIZE_KB"
    total_files=$((total_files + 1))

    # Create a full copy with the same file name in subfolder C
    copy="$BASE_DIR/B/C/$filename"
    cp "$original" "$copy"
    total_files=$((total_files + 1))
    total_groups=$((total_groups + 1))
done

log "Folder B: created $FILES_PER_FOLDER originals + $FILES_PER_FOLDER copies in C"

# ============================================================================
# FINAL REPORT
# ============================================================================

# Console-only heading (not written to the log file)
echo ""
echo "============================================================"
echo "  FINAL REPORT"
echo "============================================================"

# Summary written to both the console and the log file
log ""
log "============================================================"
log "CREATION COMPLETE"
log "============================================================"
log ""
log "Total files created: $total_files"
log "Expected duplicate groups: $total_groups"
log ""
log "Structure:"
log "  $BASE_DIR/"
log "  ├── A/                    ($FILES_PER_FOLDER originals)"
log "  │   └── *copy.jpg         ($FILES_PER_FOLDER APFS clones)"
log "  ├── B/                    ($FILES_PER_FOLDER originals)"
log "  │   └── C/                ($FILES_PER_FOLDER full copies)"
log "  └── logs/"
log ""
log "IMPORTANT:"
log "  - In folder A, files are APFS clones (deleting does NOT free space)"
log "  - In folder B/C, files are full copies (deleting frees space)"
log ""

# Short console-only overview. The counts come from FILES_PER_FOLDER so the
# overview stays correct when the setting is changed.
echo ""
echo "Dataset structure:"
echo "  $BASE_DIR/"
echo "  ├── A/           ($FILES_PER_FOLDER photos + $FILES_PER_FOLDER APFS clones)"
echo "  ├── B/           ($FILES_PER_FOLDER photos)"
echo "  │   └── C/       ($FILES_PER_FOLDER copies)"
echo "  └── logs/"
echo ""

# Write a plain-text description of the dataset into the base directory.
# The here-document delimiter is unquoted, so variables and $(date) expand.
cat << EOF > "${BASE_DIR}/dataset_info.txt"
Test dataset 3: Photos
======================
Creation date: $(date '+%Y-%m-%d %H:%M:%S')
Methodology: v3.4

Parameters:
- Files in folder A: ${FILES_PER_FOLDER} originals + ${FILES_PER_FOLDER} APFS clones
- Files in folder B: ${FILES_PER_FOLDER} originals
- Files in folder B/C: ${FILES_PER_FOLDER} copies
- File size: ~${FILE_SIZE_KB} KB

Structure:
- A/ — originals + APFS clones (cp -c)
- B/ — originals
- B/C/ — full copies

Expected result:
- Duplicate groups: ${total_groups}
- Files per group: 2
- Total files: ${total_files}

IMPORTANT for testing:
- Folder A contains APFS clones — deleting does NOT free space
- Folder B/C contains full copies — deleting frees space
- A good duplicate finder should distinguish these cases

For scanning, select: ${BASE_DIR}
After the test is finished, delete the folder: "${BASE_DIR}"
EOF

log "Metadata file created: ${BASE_DIR}/dataset_info.txt"

# Closing hint for the operator (console only)
printf '%s\n' \
    "" \
    "For scanning, select the folder:" \
    "  ${BASE_DIR}" \
    "" \
    "Done!"
