Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
executable file 461 lines (319 sloc) 11.1 KB
#!/usr/bin/env bash
#
# american fuzzy lop - corpus minimization tool
# ---------------------------------------------
#
# Written and maintained by Michal Zalewski <lcamtuf@google.com>
#
# Copyright 2014, 2015 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# This tool tries to find the smallest subset of files in the input directory
# that still trigger the full range of instrumentation data points seen in
# the starting corpus. This has two uses:
#
# - Screening large corpora of input files before using them as a seed for
# afl-fuzz. The tool will remove functionally redundant files and likely
# leave you with a much smaller set.
#
# (In this case, you probably also want to consider running afl-tmin on
# the individual files later on to reduce their size.)
#
# - Minimizing the corpus generated organically by afl-fuzz, perhaps when
# planning to feed it to more resource-intensive tools. The tool achieves
# this by removing all entries that used to trigger unique behaviors in the
# past, but have been made obsolete by later finds.
#
# Note that the tool doesn't modify the files themselves. For that, you want
# afl-tmin.
#
# This script must use bash because other shells may have hardcoded limits on
# array sizes.
#
echo "corpus minimization tool for afl-fuzz by <lcamtuf@google.com>"
echo
#########
# SETUP #
#########
# Process command-line options...
MEM_LIMIT=100
TIMEOUT=none
unset IN_DIR OUT_DIR STDIN_FILE EXTRA_PAR MEM_LIMIT_GIVEN \
AFL_CMIN_CRASHES_ONLY AFL_CMIN_ALLOW_ANY QEMU_MODE
while getopts "+i:o:f:m:t:eQC" opt; do
case "$opt" in
"i")
IN_DIR="$OPTARG"
;;
"o")
OUT_DIR="$OPTARG"
;;
"f")
STDIN_FILE="$OPTARG"
;;
"m")
MEM_LIMIT="$OPTARG"
MEM_LIMIT_GIVEN=1
;;
"t")
TIMEOUT="$OPTARG"
;;
"e")
EXTRA_PAR="$EXTRA_PAR -e"
;;
"C")
export AFL_CMIN_CRASHES_ONLY=1
;;
"Q")
EXTRA_PAR="$EXTRA_PAR -Q"
test "$MEM_LIMIT_GIVEN" = "" && MEM_LIMIT=250
QEMU_MODE=1
;;
"?")
exit 1
;;
esac
done
shift $((OPTIND-1))
TARGET_BIN="$1"
if [ "$TARGET_BIN" = "" -o "$IN_DIR" = "" -o "$OUT_DIR" = "" ]; then
cat 1>&2 <<_EOF_
Usage: $0 [ options ] -- /path/to/target_app [ ... ]
Required parameters:
-i dir - input directory with the starting corpus
-o dir - output directory for minimized files
Execution control settings:
-f file - location read by the fuzzed program (stdin)
-m megs - memory limit for child process ($MEM_LIMIT MB)
-t msec - run time limit for child process (none)
-Q - use binary-only instrumentation (QEMU mode)
Minimization settings:
-C - keep crashing inputs, reject everything else
-e - solve for edge coverage only, ignore hit counts
For additional tips, please consult docs/README.
_EOF_
exit 1
fi
# Do a sanity check to discourage the use of /tmp, since we can't really
# handle this safely from a shell script.
if [ "$AFL_ALLOW_TMP" = "" ]; then
echo "$IN_DIR" | grep -qE '^(/var)?/tmp/'
T1="$?"
echo "$TARGET_BIN" | grep -qE '^(/var)?/tmp/'
T2="$?"
echo "$OUT_DIR" | grep -qE '^(/var)?/tmp/'
T3="$?"
echo "$STDIN_FILE" | grep -qE '^(/var)?/tmp/'
T4="$?"
echo "$PWD" | grep -qE '^(/var)?/tmp/'
T5="$?"
if [ "$T1" = "0" -o "$T2" = "0" -o "$T3" = "0" -o "$T4" = "0" -o "$T5" = "0" ]; then
echo "[-] Error: do not use this script in /tmp or /var/tmp." 1>&2
exit 1
fi
fi
# If @@ is specified, but there's no -f, let's come up with a temporary input
# file name.
TRACE_DIR="$OUT_DIR/.traces"
if [ "$STDIN_FILE" = "" ]; then
if echo "$*" | grep -qF '@@'; then
STDIN_FILE="$TRACE_DIR/.cur_input"
fi
fi
# Check for obvious errors.
if [ ! "$MEM_LIMIT" = "none" ]; then
if [ "$MEM_LIMIT" -lt "5" ]; then
echo "[-] Error: dangerously low memory limit." 1>&2
exit 1
fi
fi
if [ ! "$TIMEOUT" = "none" ]; then
if [ "$TIMEOUT" -lt "10" ]; then
echo "[-] Error: dangerously low timeout." 1>&2
exit 1
fi
fi
if [ ! -f "$TARGET_BIN" -o ! -x "$TARGET_BIN" ]; then
TNEW="`which "$TARGET_BIN" 2>/dev/null`"
if [ ! -f "$TNEW" -o ! -x "$TNEW" ]; then
echo "[-] Error: binary '$TARGET_BIN' not found or not executable." 1>&2
exit 1
fi
TARGET_BIN="$TNEW"
fi
if [ "$AFL_SKIP_BIN_CHECK" = "" -a "$QEMU_MODE" = "" ]; then
if ! grep -qF "__AFL_SHM_ID" "$TARGET_BIN"; then
echo "[-] Error: binary '$TARGET_BIN' doesn't appear to be instrumented." 1>&2
exit 1
fi
fi
if [ ! -d "$IN_DIR" ]; then
echo "[-] Error: directory '$IN_DIR' not found." 1>&2
exit 1
fi
test -d "$IN_DIR/queue" && IN_DIR="$IN_DIR/queue"
find "$OUT_DIR" -name 'id[:_]*' -maxdepth 1 -exec rm -- {} \; 2>/dev/null
rm -rf "$TRACE_DIR" 2>/dev/null
rmdir "$OUT_DIR" 2>/dev/null
if [ -d "$OUT_DIR" ]; then
echo "[-] Error: directory '$OUT_DIR' exists and is not empty - delete it first." 1>&2
exit 1
fi
mkdir -m 700 -p "$TRACE_DIR" || exit 1
if [ ! "$STDIN_FILE" = "" ]; then
rm -f "$STDIN_FILE" || exit 1
touch "$STDIN_FILE" || exit 1
fi
if [ "$AFL_PATH" = "" ]; then
SHOWMAP="${0%/afl-cmin}/afl-showmap"
else
SHOWMAP="$AFL_PATH/afl-showmap"
fi
if [ ! -x "$SHOWMAP" ]; then
echo "[-] Error: can't find 'afl-showmap' - please set AFL_PATH." 1>&2
rm -rf "$TRACE_DIR"
exit 1
fi
IN_COUNT=$((`ls -- "$IN_DIR" 2>/dev/null | wc -l`))
if [ "$IN_COUNT" = "0" ]; then
echo "[+] Hmm, no inputs in the target directory. Nothing to be done."
rm -rf "$TRACE_DIR"
exit 1
fi
FIRST_FILE=`ls "$IN_DIR" | head -1`
# Make sure that we're not dealing with a directory.
if [ -d "$IN_DIR/$FIRST_FILE" ]; then
echo "[-] Error: The target directory contains subdirectories - please fix." 1>&2
rm -rf "$TRACE_DIR"
exit 1
fi
# Check for the more efficient way to copy files...
if ln "$IN_DIR/$FIRST_FILE" "$TRACE_DIR/.link_test" 2>/dev/null; then
CP_TOOL=ln
else
CP_TOOL=cp
fi
# Make sure that we can actually get anything out of afl-showmap before we
# waste too much time.
echo "[*] Testing the target binary..."
if [ "$STDIN_FILE" = "" ]; then
AFL_CMIN_ALLOW_ANY=1 "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/.run_test" -Z $EXTRA_PAR -- "$@" <"$IN_DIR/$FIRST_FILE"
else
cp "$IN_DIR/$FIRST_FILE" "$STDIN_FILE"
AFL_CMIN_ALLOW_ANY=1 "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/.run_test" -Z $EXTRA_PAR -A "$STDIN_FILE" -- "$@" </dev/null
fi
FIRST_COUNT=$((`grep -c . "$TRACE_DIR/.run_test"`))
if [ "$FIRST_COUNT" -gt "0" ]; then
echo "[+] OK, $FIRST_COUNT tuples recorded."
else
echo "[-] Error: no instrumentation output detected (perhaps crash or timeout)." 1>&2
test "$AFL_KEEP_TRACES" = "" && rm -rf "$TRACE_DIR"
exit 1
fi
# Let's roll!
#############################
# STEP 1: COLLECTING TRACES #
#############################
echo "[*] Obtaining traces for input files in '$IN_DIR'..."
(
CUR=0
if [ "$STDIN_FILE" = "" ]; then
while read -r fn; do
CUR=$((CUR+1))
printf "\\r Processing file $CUR/$IN_COUNT... "
"$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/$fn" -Z $EXTRA_PAR -- "$@" <"$IN_DIR/$fn"
done < <(ls "$IN_DIR")
else
while read -r fn; do
CUR=$((CUR+1))
printf "\\r Processing file $CUR/$IN_COUNT... "
cp "$IN_DIR/$fn" "$STDIN_FILE"
"$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/$fn" -Z $EXTRA_PAR -A "$STDIN_FILE" -- "$@" </dev/null
done < <(ls "$IN_DIR")
fi
)
echo
##########################
# STEP 2: SORTING TUPLES #
##########################
# With this out of the way, we sort all tuples by popularity across all
# datasets. The reasoning here is that we won't be able to avoid the files
# that trigger unique tuples anyway, so we will want to start with them and
# see what's left.
echo "[*] Sorting trace sets (this may take a while)..."
ls "$IN_DIR" | sed "s#^#$TRACE_DIR/#" | tr '\n' '\0' | xargs -0 -n 1 cat | \
sort | uniq -c | sort -n >"$TRACE_DIR/.all_uniq"
TUPLE_COUNT=$((`grep -c . "$TRACE_DIR/.all_uniq"`))
echo "[+] Found $TUPLE_COUNT unique tuples across $IN_COUNT files."
#####################################
# STEP 3: SELECTING CANDIDATE FILES #
#####################################
# The next step is to find the best candidate for each tuple. The "best"
# part is understood simply as the smallest input that includes a particular
# tuple in its trace. Empirical evidence suggests that this produces smaller
# datasets than more involved algorithms that could be still pulled off in
# a shell script.
echo "[*] Finding best candidates for each tuple..."
CUR=0
while read -r fn; do
CUR=$((CUR+1))
printf "\\r Processing file $CUR/$IN_COUNT... "
sed "s#\$# $fn#" "$TRACE_DIR/$fn" >>"$TRACE_DIR/.candidate_list"
done < <(ls -rS "$IN_DIR")
echo
##############################
# STEP 4: LOADING CANDIDATES #
##############################
# At this point, we have a file of tuple-file pairs, sorted by file size
# in ascending order (as a consequence of ls -rS). By doing sort keyed
# only by tuple (-k 1,1) and configured to output only the first line for
# every key (-s -u), we end up with the smallest file for each tuple.
echo "[*] Sorting candidate list (be patient)..."
sort -k1,1 -s -u "$TRACE_DIR/.candidate_list" | \
sed 's/^/BEST_FILE[/;s/ /]="/;s/$/"/' >"$TRACE_DIR/.candidate_script"
if [ ! -s "$TRACE_DIR/.candidate_script" ]; then
echo "[-] Error: no traces obtained from test cases, check syntax!" 1>&2
test "$AFL_KEEP_TRACES" = "" && rm -rf "$TRACE_DIR"
exit 1
fi
# The sed command converted the sorted list to a shell script that populates
# BEST_FILE[tuple]="fname". Let's load that!
. "$TRACE_DIR/.candidate_script"
##########################
# STEP 5: WRITING OUTPUT #
##########################
# The final trick is to grab the top pick for each tuple, unless said tuple is
# already set due to the inclusion of an earlier candidate; and then put all
# tuples associated with the newly-added file to the "already have" list. The
# loop works from least popular tuples and toward the most common ones.
echo "[*] Processing candidates and writing output files..."
CUR=0
touch "$TRACE_DIR/.already_have"
while read -r cnt tuple; do
CUR=$((CUR+1))
printf "\\r Processing tuple $CUR/$TUPLE_COUNT... "
# If we already have this tuple, skip it.
grep -q "^$tuple\$" "$TRACE_DIR/.already_have" && continue
FN=${BEST_FILE[tuple]}
$CP_TOOL "$IN_DIR/$FN" "$OUT_DIR/$FN"
if [ "$((CUR % 5))" = "0" ]; then
sort -u "$TRACE_DIR/$FN" "$TRACE_DIR/.already_have" >"$TRACE_DIR/.tmp"
mv -f "$TRACE_DIR/.tmp" "$TRACE_DIR/.already_have"
else
cat "$TRACE_DIR/$FN" >>"$TRACE_DIR/.already_have"
fi
done <"$TRACE_DIR/.all_uniq"
echo
OUT_COUNT=`ls -- "$OUT_DIR" | wc -l`
if [ "$OUT_COUNT" = "1" ]; then
echo "[!] WARNING: All test cases had the same traces, check syntax!"
fi
echo "[+] Narrowed down to $OUT_COUNT files, saved in '$OUT_DIR'."
echo
test "$AFL_KEEP_TRACES" = "" && rm -rf "$TRACE_DIR"
exit 0