#!/bin/bash

# Script to create test dataset 2: Large files
# Duplicate-finder testing methodology v3.6
# Creates 50 files sized 1–30 MB in an isolated folder
# with 3 copies of each file (copy_A, copy_B, copy_C)
# Expected result: 50 duplicate groups of 4 files

set -e

# --- Locations ---------------------------------------------------------------
# Root of the isolated test structure (methodology v3.6); every other path
# below is derived from it.
BASE_DIR="${HOME}/DuplicateTest/Dataset_02_LargeFiles"

# Logs: one timestamped log file per run, kept inside the dataset's logs folder
LOG_DIR="${BASE_DIR}/logs"
LOG_FILE="${LOG_DIR}/dataset_creation_$(date +%Y%m%d_%H%M%S).log"

# Copy folders inside the isolated directory
COPY_DIRS=("${BASE_DIR}/copy_A" "${BASE_DIR}/copy_B" "${BASE_DIR}/copy_C")

# --- Dataset parameters ------------------------------------------------------
# How many original files to generate
FILE_COUNT=50

# Inclusive bounds for the size of each generated file, in MB
MIN_SIZE_MB=1
MAX_SIZE_MB=30

# Pool of extensions a generated file name may end with
EXTENSIONS=(dmg mp4 zip mov pkg)

# ============================================================================
# FUNCTIONS
# ============================================================================

# Print a timestamped message to stdout and append the same line to LOG_FILE.
# Globals:   LOG_FILE (read)
# Arguments: $1 - message text
# Outputs:   "[YYYY-MM-DD HH:MM:SS] <message>" on stdout and in LOG_FILE
log() {
    local now
    now=$(date "+%Y-%m-%d %H:%M:%S")
    printf '[%s] %s\n' "$now" "$1" | tee -a "$LOG_FILE"
}

# Append one pipe-separated record describing a file to files_created.csv.
# Globals:   LOG_DIR (read)
# Arguments: $1 - file path
#            $2 - file size
#            $3 - file type (extension)
#            $4 - action label
# Outputs:   appends "path|size|type|action|YYYY-MM-DD_HH:MM:SS" to
#            $LOG_DIR/files_created.csv
log_file_created() {
    local csv_path="$LOG_DIR/files_created.csv"
    printf '%s|%s|%s|%s|%s\n' \
        "$1" "$2" "$3" "$4" "$(date +%Y-%m-%d_%H:%M:%S)" >> "$csv_path"
}

# Create a file of the given size filled with pseudo-random bytes.
# Arguments: $1 - path of the file to create (overwritten if it exists)
#            $2 - size in MB (1 MB = 1048576 bytes)
# Outputs:   nothing on success; dd's diagnostics on stderr on failure
# Returns:   0 on success, non-zero if dd fails
create_random_file() {
    local filepath="$1"
    local size_mb="$2"
    local dd_output

    # The block size is spelled out in bytes: the "1m" suffix is accepted by
    # BSD dd only, while GNU dd rejects it as an invalid number.
    # dd prints transfer statistics on stderr even when it succeeds, so its
    # output is captured and shown only if the command actually fails.
    if ! dd_output=$(dd if=/dev/urandom of="$filepath" bs=1048576 count="$size_mb" 2>&1); then
        printf '%s\n' "$dd_output" >&2
        return 1
    fi
}

# Print a random whole number of megabytes within the configured bounds.
# Globals:   MIN_SIZE_MB, MAX_SIZE_MB (read)
# Outputs:   an integer in [MIN_SIZE_MB, MAX_SIZE_MB] on stdout
get_random_size() {
    local span=$(( MAX_SIZE_MB - MIN_SIZE_MB + 1 ))
    printf '%d\n' "$(( MIN_SIZE_MB + RANDOM % span ))"
}

# Print one randomly chosen entry of the EXTENSIONS array.
# Globals:   EXTENSIONS (read)
# Outputs:   the chosen extension on stdout
get_random_extension() {
    local count=${#EXTENSIONS[@]}
    local pick=$(( RANDOM % count ))
    echo "${EXTENSIONS[pick]}"
}

# Replace dst_dir with a fresh copy of the contents of src_dir
# (prefer rsync, otherwise cp -a).
# Arguments: $1 - source directory (e.g. .../originals)
#            $2 - destination directory (e.g. .../copy_A); it is removed and
#                 recreated, so anything already inside it is lost
# Outputs:   a diagnostic on stderr when the arguments are rejected
# Returns:   0 on success; non-zero if an argument is empty, the source is
#            not a directory, source and destination are the same directory,
#            or the copy itself fails
copy_dir_contents() {
    local src_dir="${1:-}"   # .../originals
    local dst_dir="${2:-}"   # .../copy_A

    # Validate everything BEFORE touching the destination: an empty source
    # would make rsync copy from "/", and a bad source must not cost us the
    # existing destination.
    if [[ -z "$src_dir" || -z "$dst_dir" ]]; then
        echo "copy_dir_contents: source and destination must be non-empty" >&2
        return 1
    fi
    if [[ ! -d "$src_dir" ]]; then
        echo "copy_dir_contents: source directory not found: $src_dir" >&2
        return 1
    fi
    if [[ "$src_dir" -ef "$dst_dir" ]]; then
        # Recreating the destination would delete the source itself
        echo "copy_dir_contents: source and destination are the same: $src_dir" >&2
        return 1
    fi

    # Recreate dst_dir to ensure the copy is made FROM originals
    rm -rf -- "$dst_dir"
    mkdir -p -- "$dst_dir"

    if command -v rsync >/dev/null 2>&1; then
        # Copy CONTENTS of originals -> dst_dir
        rsync -a -- "$src_dir/" "$dst_dir/"
    else
        # Fallback: cp -a the contents
        cp -a -- "$src_dir/." "$dst_dir/"
    fi
}

# Drop a small file with unique content into a directory so that folders
# holding otherwise identical files are not considered identical.
# Arguments: $1 - directory that receives the marker
#            $2 - label embedded in the marker's name and content
# Outputs:   writes <dir>/__folder_unique_<label>_<timestamp>_<random>.txt,
#            then records it through log and log_file_created
create_folder_marker() {
    local target_dir="$1"
    local tag="$2"
    local stamp nonce payload marker_path

    stamp=$(date +%Y%m%d_%H%M%S)
    nonce="${RANDOM}_$RANDOM"
    marker_path="$target_dir/__folder_unique_${tag}_${stamp}_${nonce}.txt"

    # 128 random bytes rendered as one continuous hex string
    payload=$(head -c 128 /dev/urandom | od -An -tx1 | tr -d ' \n')

    printf '%s\n' \
        "UNIQUE FOLDER MARKER" \
        "label=$tag" \
        "timestamp=$stamp" \
        "random=$nonce" \
        "payload=$payload" > "$marker_path"

    log "Folder marker added: $(basename "$marker_path") в $(basename "$target_dir")"
    log_file_created "$marker_path" "0" "txt" "folder_marker_$tag"
}

# ============================================================================
# MAIN SCRIPT
# ============================================================================

# Banner, directory preparation, log initialisation and run counters.

echo "============================================================"
echo "  Creating test dataset 2: Large files"
echo "  Methodology v3.6 (isolated structure)"
echo "============================================================"
echo ""

# Create directories.
# originals is recreated from scratch: file names embed a random size and
# extension, so a re-run would otherwise pile new files on top of the previous
# run's files and markers, and the stale markers would then be copied into
# every copy_* folder as unexpected duplicates.
rm -rf -- "${BASE_DIR:?}/originals"
mkdir -p "$BASE_DIR/originals"
mkdir -p "$LOG_DIR"

# Create (currently empty) copy directories
for copy_dir in "${COPY_DIRS[@]}"; do
    mkdir -p "$copy_dir"
done

# Initialize log files
log "Starting creation of test dataset 2"
log "Base directory: $BASE_DIR"
log "Number of files: $FILE_COUNT"
log "File size range: $MIN_SIZE_MB - $MAX_SIZE_MB MB"

# Create CSV header for the file log (truncates the CSV of any previous run)
echo "path|size_mb|extension|action|timestamp" > "$LOG_DIR/files_created.csv"

echo ""
log "Creating original files (originals)..."
echo ""

# Counters: files logged so far and the accumulated size of the originals
total_files_created=0
total_size_mb=0

# Create original files: FILE_COUNT files of random size and extension,
# named testfile_<NNN>_<size>MB.<ext>, inside $BASE_DIR/originals.
for ((i = 1; i <= FILE_COUNT; i++)); do
    size_mb=$(get_random_size)
    ext=$(get_random_extension)
    printf -v filename "testfile_%03d_%dMB.%s" "$i" "$size_mb" "$ext"
    filepath="$BASE_DIR/originals/$filename"

    echo -n "Creating file $i/$FILE_COUNT: $filename ($size_mb MB)... "

    # The call is tested explicitly: as a bare command under `set -e`, a
    # failure would end the script before the ERROR branch could report it.
    if create_random_file "$filepath" "$size_mb" && [[ -f "$filepath" ]]; then
        echo "OK"
        log "Original created: $filename ($size_mb MB)"
        log_file_created "$filepath" "$size_mb" "$ext" "created_original"
        # Plain assignments instead of ((var++)): that form returns status 1
        # when the expression evaluates to 0 (the very first increment), which
        # aborts the script under `set -e`.
        total_files_created=$((total_files_created + 1))
        total_size_mb=$((total_size_mb + size_mb))
    else
        echo "ERROR"
        log "ERROR: Failed to create $filename"
        # An incomplete dataset would invalidate the expected results, so stop
        exit 1
    fi
done

echo ""
log "Originals created: $total_files_created files, total size: $total_size_mb MB"
echo ""

# ============================================================================
# COPYING: whole folders (each copy_* is made directly from originals)
# ============================================================================

log "Copying originals -> copy_* (whole folders; each copy is made from originals)..."

# Pattern that recovers the size (in MB) embedded in a test file name
size_re='([0-9]+)MB'

for copy_dir in "${COPY_DIRS[@]}"; do
    copy_name=$(basename "$copy_dir")
    echo ""
    log "Copying to: $copy_dir"

    copy_dir_contents "$BASE_DIR/originals" "$copy_dir"

    # Log files to CSV and count the copies that are really present, so the
    # summary line reports what happened rather than the configured count
    copied_count=0
    for original in "$BASE_DIR/originals"/testfile_*; do
        # An unmatched glob stays literal; skip anything that is not a file
        [[ -f "$original" ]] || continue

        filename=$(basename "$original")
        dest_file="$copy_dir/$filename"

        if [[ ! -f "$dest_file" ]]; then
            log "ERROR: Missing copy: $dest_file"
            exit 1
        fi

        # Extract size from the filename (empty if the name carries none)
        size_mb=""
        if [[ "$filename" =~ $size_re ]]; then
            size_mb="${BASH_REMATCH[1]}"
        fi
        ext="${filename##*.}"
        log_file_created "$dest_file" "$size_mb" "$ext" "copied_$copy_name"

        # Plain assignments: ((var++)) returns status 1 when the old value is
        # 0, which would abort the script under `set -e`
        total_files_created=$((total_files_created + 1))
        copied_count=$((copied_count + 1))
    done

    log "Copied into $copy_dir: $copied_count files"
done

# ============================================================================
# MARKERS: add at the very end (after folders are ready)
# ============================================================================

echo ""
log "Adding unique markers to each of the 4 folders (so folders are not identical)..."

# originals first, then every copy folder; each marker is labelled with the
# name of the folder it is placed in
for marked_dir in "$BASE_DIR/originals" "${COPY_DIRS[@]}"; do
    create_folder_marker "$marked_dir" "${marked_dir##*/}"
done

# ============================================================================
# FINAL REPORT
# ============================================================================

echo ""
echo "============================================================"
echo "  FINAL REPORT"
echo "============================================================"

# Count locations (originals + every configured copy folder). Derived from
# COPY_DIRS and FILE_COUNT so the report stays correct when they change.
copies_count=${#COPY_DIRS[@]}
locations_count=$((copies_count + 1))

# Comma-separated copy folder names, e.g. "copy_A, copy_B, copy_C"
copy_names=""
for copy_dir in "${COPY_DIRS[@]}"; do
    copy_names+="${copy_names:+, }$(basename "$copy_dir")"
done

# Expected figures; these variables are also read by the metadata file below
expected_groups=$FILE_COUNT
expected_files_per_group=$locations_count
expected_total_files=$((FILE_COUNT * locations_count))
expected_extra_unique_files=$locations_count

log ""
log "============================================================"
log "CREATION COMPLETED"
log "============================================================"
log ""
log "Originals created: $FILE_COUNT"
log "Copies per file: $copies_count ($copy_names)"
log "Total files (testfile_*) created/logged: $total_files_created"
log ""
log "EXPECTED RESULT FOR TESTING:"
log "  Duplicate groups: $expected_groups"
log "  Files per group: $expected_files_per_group"
log "  Total duplicate files (testfile_*): $expected_total_files"
log "  + Unique marker files: $expected_extra_unique_files (one per folder)"
log ""
log "NOTE:"
log "  Markers are added AFTER copying, so originals/copy_* should NOT be considered fully identical folders."
log ""
log "Log file: $LOG_FILE"
log "CSV with file list: $LOG_DIR/files_created.csv"
log ""

echo ""
echo "Test dataset structure:"
echo "  $BASE_DIR/"
echo "  ├── originals/     ($FILE_COUNT originals + 1 marker)"
for copy_dir in "${COPY_DIRS[@]}"; do
    echo "  ├── $(basename "$copy_dir")/        ($FILE_COUNT copies + 1 marker)"
done
echo "  └── logs/          (creation logs)"
echo ""

# Write a human-readable summary of the dataset next to the data, then print
# the closing instructions.
info_file="$BASE_DIR/dataset_info.txt"
created_at=$(date "+%Y-%m-%d %H:%M:%S")

cat << EOF > "$info_file"
Test dataset 2: Large files
================================
Creation date: $created_at
Methodology: v3.6 (isolated structure)

Parameters:
- Number of originals: $FILE_COUNT
- File size range: $MIN_SIZE_MB - $MAX_SIZE_MB MB
- Formats: ${EXTENSIONS[*]}

Structure:
- originals/ — original files (testfile_*) + 1 unique marker
- copy_A/ — full copy of originals (made from originals) + 1 unique marker
- copy_B/ — full copy of originals (made from originals) + 1 unique marker
- copy_C/ — full copy of originals (made from originals) + 1 unique marker

Expected result (duplicates):
- Duplicate groups: $expected_groups
- Files per group: $expected_files_per_group
- Total duplicate files (testfile_*): $expected_total_files
Additionally:
- Unique marker files: $expected_extra_unique_files (one per folder),
  so originals/copy_* folders are NOT considered fully identical.

To scan, select the folder:
$BASE_DIR

To delete the whole dataset:
Delete "$BASE_DIR"
EOF

log "Metadata file created: $info_file"

# Closing hint: blank line, scan instructions, blank line, completion notice
printf '\n%s\n  %s\n\n%s\n' "To scan, select the folder:" "$BASE_DIR" "Done!"
