#!/bin/bash
#
# Script to create test dataset 4: Stress test
# Duplicate-finder testing methodology v3.6
#
# Creates 200,000 files of ~20 KB to test performance
# Structure: 1000 unique files × 200 "sets" (originals + 199 copies_###) = 200,000 files
#
# Expected result:
# - 1000 groups of 200 files (file duplicates)
# - The originals/copies_### folders should NOT be considered fully identical, because unique markers are added at the end.
#

# errexit: stop the script as soon as an unguarded command fails.
set -o errexit

# ============================================================================
# SETTINGS
# ============================================================================

# Dataset root; the run log lives in a logs/ subfolder and carries a
# per-run timestamp in its name.
BASE_DIR="${HOME}/DuplicateTest/Dataset_04_StressTest"
LOG_DIR="${BASE_DIR}/logs"
LOG_FILE="${LOG_DIR}/dataset_creation_$(date +%Y%m%d_%H%M%S).log"

# Parameters
UNIQUE_FILES=1000      # how many distinct original files to generate
COPIES_PER_FILE=200    # total "sets": originals plus (COPIES_PER_FILE - 1) copy folders
FILE_SIZE_KB=20        # size of every generated file, in KB

# Extensions that generated file names may end with
EXTENSIONS=(txt dat bin tmp log)

# ============================================================================
# FUNCTIONS
# ============================================================================

# Print a timestamped message to stdout and append the same line to $LOG_FILE.
#   $1 - message text
log() {
    local text="$1"
    local stamp
    stamp=$(date "+%Y-%m-%d %H:%M:%S")
    printf '[%s] %s\n' "$stamp" "$text" | tee -a "$LOG_FILE"
}

# Print one entry of the global EXTENSIONS array, chosen via $RANDOM.
get_random_extension() {
    local count=${#EXTENSIONS[@]}
    local pick=$(( RANDOM % count ))
    echo "${EXTENSIONS[pick]}"
}

# Replace a destination directory with a copy of a source directory's contents.
# The first available tool is used: rsync, then ditto, then cp.
#   $1 - source directory (originals)
#   $2 - destination directory (copies_XXX); removed and recreated first
copy_dir_contents() {
    local source_root="$1"
    local target_root="$2"

    # Always start from an empty destination.
    rm -rf "$target_root"
    mkdir -p "$target_root"

    if command -v rsync >/dev/null 2>&1; then
        # Trailing slashes: copy the contents, not the directory itself.
        rsync -a "$source_root/" "$target_root/"
        return
    fi

    if command -v ditto >/dev/null 2>&1; then
        # ditto works well on macOS
        ditto "$source_root" "$target_root"
        return
    fi

    cp -a "$source_root/." "$target_root/"
}

# Drop a small, uniquely named and uniquely filled text file into a directory
# so that the directory is not byte-for-byte identical to its sibling folders.
#   $1 - directory that receives the marker
#   $2 - label embedded in the marker's file name and body
create_folder_marker() {
    local dir="$1"
    local label="$2"
    local stamp nonce payload marker_path

    stamp=$(date +%Y%m%d_%H%M%S)
    nonce="${RANDOM}_$RANDOM"
    marker_path="$dir/__folder_unique_${label}_${stamp}_${nonce}.txt"

    # 128 random bytes rendered as one continuous hex string.
    payload=$(head -c 128 /dev/urandom | od -An -tx1 | tr -d ' \n')

    printf '%s\n' \
        "UNIQUE FOLDER MARKER" \
        "label=$label" \
        "timestamp=$stamp" \
        "random=$nonce" \
        "payload=$payload" > "$marker_path"

    log "Added folder marker: $(basename "$marker_path") to $(basename "$dir")"
}

# ============================================================================
# MAIN SCRIPT
# ============================================================================

# Rough dataset size in decimal GB (1 GB = 10^9 bytes), rounded to the nearest
# integer. Every file is FILE_SIZE_KB blocks of 1024 bytes. Marker files, the
# log and filesystem overhead are ignored, so this is only a ballpark figure.
total_bytes=$((UNIQUE_FILES * COPIES_PER_FILE * FILE_SIZE_KB * 1024))
approx_gb=$(( (total_bytes + 500000000) / 1000000000 ))

echo "============================================================"
echo "  Creating test dataset 4: Stress test"
echo "  Methodology v3.6"
echo "============================================================"
echo ""
echo "WARNING: This script will create ~${approx_gb} GB of data!"
echo "         Make sure you have enough free disk space."
echo ""

# Create directories. The log directory comes first so log() can write.
mkdir -p "$LOG_DIR"

# Start from an empty originals folder. File names get a random extension, so
# a previous run would otherwise leave stale stress_NNNNNN.* files (and its old
# marker) behind, and those would then be copied into every copies_### folder.
# ${BASE_DIR:?} aborts instead of touching /originals if BASE_DIR is empty.
rm -rf -- "${BASE_DIR:?}/originals"
mkdir -p "$BASE_DIR/originals"

log "Starting creation of test dataset 4 (stress test)"
log "Unique files: $UNIQUE_FILES"
log "Copies per file (including originals): $COPIES_PER_FILE"
log "Total files: $((UNIQUE_FILES * COPIES_PER_FILE))"
log "File size: ${FILE_SIZE_KB} KB"
log "Expected dataset size (approx.): ~${approx_gb} GB"

# Counters
total_files=0            # data files written so far (markers are not counted)
start_time=$(date +%s)   # epoch seconds, used for elapsed-time reporting

# ============================================================================
# STAGE 1/2: CREATE ORIGINALS
# ============================================================================

echo ""
log "Stage 1/2: Creating $UNIQUE_FILES unique files..."
echo ""

# Fill originals/ with UNIQUE_FILES files of random bytes, each named
# stress_NNNNNN.<ext> with a randomly chosen extension.
# A C-style loop is used instead of `seq`: BSD/macOS seq counts DOWN when the
# upper bound is below 1, and the loop also avoids spawning an extra process.
for (( i = 1; i <= UNIQUE_FILES; i++ )); do
    ext=$(get_random_extension)
    printf -v filename 'stress_%06d.%s' "$i" "$ext"
    filepath="$BASE_DIR/originals/$filename"

    # Progress line every 100 files.
    if (( i % 100 == 0 )); then
        echo "  Originals: $i / $UNIQUE_FILES..."
    fi

    dd if=/dev/urandom of="$filepath" bs=1024 count="$FILE_SIZE_KB" 2>/dev/null

    # Plain assignment on purpose: ((total_files++)) evaluates to the OLD value,
    # so with total_files=0 it returns status 1 and `set -e` would abort the
    # script right after the first file.
    total_files=$((total_files + 1))
done

log "Originals created: $UNIQUE_FILES"

# ============================================================================
# STAGE 2/2: CREATE COPIES (as whole folders copied from originals)
# ============================================================================

echo ""
log "Stage 2/2: Creating copy folders (from originals)..."
echo ""

# originals/ counts as one "set", so COPIES_PER_FILE - 1 copy folders are needed
# (199 folders + originals = 200 sets with the default settings).
# A C-style loop is used instead of `seq`: with an upper bound of 0, BSD/macOS
# seq counts down (1, 0) and would create copies_001 and copies_000.
for (( copy_idx = 1; copy_idx < COPIES_PER_FILE; copy_idx++ )); do
    printf -v folder 'copies_%03d' "$copy_idx"
    dst="$BASE_DIR/$folder"

    # Progress line every 10 folders, with seconds elapsed since start_time.
    if (( copy_idx % 10 == 0 )); then
        elapsed=$(( $(date +%s) - start_time ))
        echo "  Copying into $folder... (elapsed ${elapsed} sec)"
    fi

    copy_dir_contents "$BASE_DIR/originals" "$dst"
    total_files=$(( total_files + UNIQUE_FILES ))
done

log "Copies created: $((UNIQUE_FILES * (COPIES_PER_FILE - 1)))"

# ============================================================================
# MARKERS AT THE END: so folders are not considered fully identical
# ============================================================================

echo ""
log "Adding unique markers to each folder (originals + copies_###)..."

# Markers are added only after all copying is done, so the marker placed in
# originals/ is never duplicated into the copy folders.
create_folder_marker "$BASE_DIR/originals" "originals"

# A C-style loop is used instead of `seq`: with an upper bound of 0, BSD/macOS
# seq counts down (1, 0) and marker creation would fail on folders that do
# not exist.
for (( copy_idx = 1; copy_idx < COPIES_PER_FILE; copy_idx++ )); do
    printf -v folder 'copies_%03d' "$copy_idx"
    create_folder_marker "$BASE_DIR/$folder" "$folder"
done

# ============================================================================
# FINAL REPORT
# ============================================================================

end_time=$(date +%s)
duration=$((end_time - start_time))

# Index and name of the last copy folder, derived from the settings so the
# structure listing below stays correct if COPIES_PER_FILE is changed.
last_copy_idx=$((COPIES_PER_FILE - 1))
printf -v last_copy_folder 'copies_%03d' "$last_copy_idx"

echo ""
echo "============================================================"
echo "  FINAL REPORT"
echo "============================================================"

# Summary, written to both the console and the log file.
log ""
log "============================================================"
log "CREATION COMPLETE"
log "============================================================"
log ""
log "Creation time: $duration seconds"
log "Total files created: $total_files"
log ""
log "EXPECTED TEST RESULT (files):"
log "  Duplicate groups: $UNIQUE_FILES"
log "  Files per group: $COPIES_PER_FILE"
log "  Total files: $total_files"
log ""
log "NOTE (folders):"
log "  One unique marker was added to each folder at the end,"
log "  so originals and copies_### should NOT be considered fully identical folders."
log ""

# Console-only overview of the directory layout.
echo ""
echo "Dataset structure:"
echo "  $BASE_DIR/"
echo "  ├── originals/       ($UNIQUE_FILES originals + 1 marker)"
if (( last_copy_idx >= 1 )); then
    echo "  ├── copies_001/      ($UNIQUE_FILES copies + 1 marker)"
fi
if (( last_copy_idx >= 2 )); then
    echo "  │   ..."
    echo "  ├── ${last_copy_folder}/      ($UNIQUE_FILES copies + 1 marker)"
fi
echo "  └── logs/"
echo ""
echo "Creation time: $duration seconds"
echo ""

# Human-readable size of the whole dataset; left empty if du fails.
actual_size=$(du -sh "$BASE_DIR" 2>/dev/null | cut -f1)

# Persist the run parameters and expected results next to the dataset.
cat > "$BASE_DIR/dataset_info.txt" << EOF
Test dataset 4: Stress test
===========================
Creation date: $(date "+%Y-%m-%d %H:%M:%S")
Creation time: $duration seconds
Methodology: v3.6

Parameters:
- Unique files: $UNIQUE_FILES
- Copies per file (including originals): $COPIES_PER_FILE
- File size: ~${FILE_SIZE_KB} KB
- Formats: ${EXTENSIONS[*]}

Dataset size: $actual_size

Expected result (files):
- Duplicate groups: $UNIQUE_FILES
- Files per group: $COPIES_PER_FILE
- Total files: $total_files

Note (folders):
- One unique marker was added at the end to originals and each copies_###,
  so folders should NOT be considered fully identical.

For scanning: $BASE_DIR
After the test is finished, delete the folder: "$BASE_DIR"
EOF

log "Metadata file created: $BASE_DIR/dataset_info.txt"
log "Dataset size: $actual_size"

echo ""
echo "For scanning, select the folder:"
echo "  $BASE_DIR"
echo ""
echo "Done!"
