# Tweaking these...
max_lines_of_data=100000  # Total number of lines of data to search for unique values in. Bigger values mean longer run times.
max_line_length=10        # Length in bytes of each line that may be unique. Smaller values produce fewer possible unique lines.

# leads to...
interval_between_lines_of_data=$(( max_lines_of_data / 5 ))

# Report heading.
echo "lines,percent unique,algorithm,time"

# Benchmark with increasing lines of data...
for data_lines in $( seq 1 "$interval_between_lines_of_data" "$max_lines_of_data" ) ; do

    # Benchmark with increasing line length...
    for line_length in $( seq "$max_line_length" ) ; do

        # Generate the requested number of lines of random alphanumeric data of the requested length.
        tr -cd '[:alnum:]' < /dev/urandom | fold -bw "$line_length" | head -n "$data_lines" > /tmp/data

        # Calculate the percentage of lines that are unique.
        number_of_unique_lines="$( sort -u /tmp/data | wc -l )"
        percent_unique="$( calc -dp "$number_of_unique_lines/$data_lines" )"

        # Benchmark sort with its unique option.
        /usr/bin/time -o /tmp/sort_time -f %e sort -u /tmp/data > /dev/null
        echo "$data_lines,$percent_unique,sort,$( cat /tmp/sort_time )"

        # Benchmark awk hashing.
        /usr/bin/time -o /tmp/hash_time -f %e awk '!seen[$0]++' /tmp/data > /dev/null
        echo "$data_lines,$percent_unique,hash,$( cat /tmp/hash_time )"

    done
done
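
# Usage sketch (assumptions, not from the original: the script is saved as
# uniq-benchmark.sh, a hypothetical name, and GNU coreutils, GNU time, and the
# calc utility are installed). The report is plain CSV on stdout, so it can be
# captured and inspected like this:
#
#   bash uniq-benchmark.sh > results.csv   # run the full benchmark sweep
#   head -n 5 results.csv                  # peek at the header and first rows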