# Tweaking these...
max_lines_of_data=100000  # Total number of lines of data to search for unique values in. Bigger values mean longer run times.
max_line_length=10        # Length in bytes of each line that may be unique. Smaller values produce fewer possible unique lines.

# leads to...
interval_between_lines_of_data=$(( max_lines_of_data / 5 ))

# Report heading.
echo "lines,percent unique,algorithm,time"

# Benchmark with increasing lines of data...
for data_lines in $( seq 1 "$interval_between_lines_of_data" "$max_lines_of_data" ) ; do

    # Benchmark with increasing line length...
    for line_length in $( seq "$max_line_length" ) ; do

        # Generate the requested number of lines of random alphanumeric data of the requested length.
        tr -cd '[:alnum:]' < /dev/urandom | fold -bw "$line_length" | head -n "$data_lines" > /tmp/data

        # Calculate the percentage of lines that are unique.
        number_of_unique_lines="$( sort -u /tmp/data | wc -l )"
        percent_unique="$( calc -dp "$number_of_unique_lines/$data_lines" )"

        # Benchmark sort with its unique option.
        /usr/bin/time -o /tmp/sort_time -f %e sort -u /tmp/data > /dev/null
        echo "$data_lines,$percent_unique,sort,$( cat /tmp/sort_time )"

        # Benchmark awk hashing.
        /usr/bin/time -o /tmp/hash_time -f %e awk '!seen[$0]++' /tmp/data > /dev/null
        echo "$data_lines,$percent_unique,hash,$( cat /tmp/hash_time )"

    done
done
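
# Usage sketch (assumptions, not from the original: the script is saved as
# uniq-benchmark.sh, a hypothetical name, and GNU coreutils, GNU time, and the
# calc utility are installed). The report is plain CSV on stdout, so it can be
# captured and inspected like this:
#
#   bash uniq-benchmark.sh > results.csv   # run the full benchmark sweep
#   head -n 5 results.csv                  # peek at the header and first rows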