#!/usr/bin/perl -w
# delta; see License.txt for copyright and terms of use

use strict;

# ****************
# Implementation of the delta debugging algorithm:
# http://www.st.cs.uni-sb.de/dd/
# Daniel S. Wilkerson dsw@cs.berkeley.edu

# Notes:

# The test script should not depend on the current directory to work.

# Note that 1-minimality does not imply idempotency, so we could
# re-run once it is stuck, perhaps with some randomization.


# Global State ****************

my @chunks = ();                # Once input, is read only.
my @markers = ();               # Delimits a dynamic subsequence of @chunks being considered.
my %test_cache = ();            # Cached test results.

# Mark boundaries that uniquely determine the marked contents.  This
# is used as a shorter key to hash on than the contents themselves.
# Since Perl hashes retain their keys if you don't do this you get a
# horrible memory leak in the test_cache.
my $mark_signature;

# End of the last marker rendered to the tmp file.  Used to figure out
# if the next one abuts it or not.
my $last_mark_stop;
my @current_markers;            # Markers to be rendered to $tmpinput if answer not in cache.

my $tmpinput;                   # Temporary file to render marked subsequence to.
my $last_successful_tmpinput;   # Last one to past the test.

my $tmp_index = 0;              # Cache the last index used to make a tmp file.
my $tmpdir_index = 0;           # Cache the last index used to make a tmp directory.
my $tmpdir;                     # Temporary directory for external programs.
my $logfile = "log";            # File in $tmpdir where log of successful runs is written.
chomp (my $this_dir = `pwd`);   # The current directory.
my $starttime = time;           # The time we started.

my $granularity = "line";       # What is the size of an input chunk?
my $dump_input = 0;             # Dump out the input after reading it in.
my $cp_minimal;                 # Copy the minimal successful test to the current dir.
my $verbose = 0;                # Be more verbose.
my $quiet = 0;                  # Prints go to /dev/null.
my $suffix = ".c";              # For now, our input files are .c files.
my $test;                       # The script to run as the test.

# when true, all operations on input file are in-place:
#   - don't make a new directory
#   - overwrite the original input file with our constructed inputs
my $in_place = 0;
my $start_file;                 # name of input/output file for in_place

my $help_message = <<"END"

    Delta version 2003.7.14
    delta implements the delta-debugging algorithm:
      http://www.st.cs.uni-sb.de/dd/
    Implemented by Daniel Wilkerson.

    usage: $0 [options] start-file

    -test=<testscript>       Specify the test script.
    -suffix=<suffix>         Candidate filename suffix [$suffix]
    -dump_input              Dump input after reading
    -cp_minimal=<filename>   Copy the minimal successful test to the
                             current directory
    -granularity=line        Use lines as the granularity (default)
    -granularity=top_form    Use C top-level forms as the granularity
                             (currently only works with CIL output)
    -log=<file>              Log file for main events
    -quiet                   Say nothing
    -verbose                 Get more verbose output
    -in_place                Overwrite start-file with inputs

    -help                    Get help

    The test program accepts a single argument, the name of the candidate
    file to test.  It is run within a directory containing only that file,
    and it can make temporary files/directories in that directory.  It
    should return zero for a candidate that exhibits the desired property,
    and nonzero for one that does not.

    Example test program (delta will retain a line containing "foo"):
      #!/bin/sh
      grep 'foo' <"\$1" >/dev/null

END
;

# Functions ****************

sub output(@) {
    print @_ unless $quiet;
}

# Return true if the current_markers pass the interesting test.
sub test {
    if (-f "DELTA-STOP") {
        output "Stopping because DELTA-STOP file exists\n";
        exit 1;
    }

    my $cached_result = $test_cache{$mark_signature};
    if (defined $cached_result) {
        output "\tcached\n";
        return $cached_result;
    }

    render_tmpinput();

    my $ret;
    my $input;
    if (!$in_place) {
      output " $tmpinput";
      my $arena = "$tmpdir/arena";
      die if system "rm -rf $arena/*";    # sm: added -r so I can make directories in the arena
      $input = "$tmpdir/$tmpinput";
      my $arena_input = "input$suffix";
      link $input, "$arena/$arena_input";
      # $test gets fully qualified in parse_command_line()
      $ret = system "cd $arena; $test $arena_input";
    }
    else {
      # for in_place, the test program is free to ignore the argument
      # (since it will be known ahead of time) but I'll pass it anyway
      $ret = system "$test $start_file";
      $input = $start_file;
    }

    # from perldoc -f system
    my $signal = $ret & 127;
    my $exitValue = $ret >> 8;

    if ($signal) {
        die "$0 exiting due to signal $signal\n";
    }
    my $result = ! $exitValue;

    # Keep around info only for successful runs.
    if ($result) {
        my $size = (split " ", `wc -l $input`)[0];
        output "\tSUCCESS, lines: $size ****************\n";
        log_msg_time("$tmpinput, lines: $size");
        if (!$in_place) {
          $last_successful_tmpinput = $tmpinput;
        }
        else {
          # make a single copy of the latest successful file
          $last_successful_tmpinput = "${start_file}.ok";
          die "cp failed" if system("cp ${start_file} $last_successful_tmpinput");
        }
    } else {
        output "\n";
        unlink $input unless $in_place;
    }
    return $test_cache{$mark_signature} = $result;
}

# given @current_markers, create a new file by writing the proper
# subset of @chunks to a file; yield its name in $tmpinput
sub render_tmpinput {
    if ($in_place) {
      # I can't just say $tmpinput = $start_file and be done with it,
      # because in many places $tmpdir/ is prefixed (and I don't want
      # to say $tmpdir="." because I want $start_file to possibly be
      # an absolute path.
      open TMPINPUT, ">$start_file" or die $!;
      $tmpinput = $start_file;
    }
    else {
      $tmpinput = unused_tempfile();
      open TMPINPUT, ">${tmpdir}/$tmpinput" or die $!;
    }
    foreach my $marker (@current_markers) {
        for (my $i=$marker->{start}; $i<$marker->{stop}; ++$i) {
            print TMPINPUT $chunks[$i];
        }
    }
    close TMPINPUT or die $!;   # NOTE: Leave $tmpinput defined.
}

sub start_marking {
    @current_markers = ();
    $mark_signature = "";
    undef $last_mark_stop;
}

sub mark {
    my ($marker) = @_;
    push @current_markers, $marker;
    if (defined $last_mark_stop) {
        if ($last_mark_stop < $marker->{start}) {
            $mark_signature .= $last_mark_stop . "]";
            $mark_signature .= "[" . $marker->{start} . ",";
        } elsif ($last_mark_stop == $marker->{start}) {
            # This marker abuts the previous one.
        } else {die}
    } else {
        $mark_signature .= "[" . $marker->{start} . ",";
    }
    $last_mark_stop = $marker->{stop};
}

sub stop_marking {
    $mark_signature .= $last_mark_stop . "]" if defined $last_mark_stop;
    output $mark_signature;
}

sub unused_tempfile {
    die unless defined $tmpdir;
    my $filename;
    do {
        $filename = sprintf("%03d", $tmp_index) . $suffix;
        $tmp_index++;
    } while -e "${tmpdir}/$filename";
    return $filename;
}

sub unused_tempdir {
  my $dirname;
  for (; $dirname = "tmp${tmpdir_index}", -e $dirname; ++$tmpdir_index) {}
  return $dirname;
}

sub select_tmpdir {
    $tmpdir = unused_tempdir() unless defined $tmpdir;
    die if -e $tmpdir;
    mkdir $tmpdir, 0777 or die $!;
    mkdir "${tmpdir}/arena", 0777 or die $!;
}

sub parse_command_line {
    my $str;
    my @non_flags = ();
    while(defined ($str = shift @ARGV)) {
        if ($str=~/^-([^=]+)(=(.+))?/) {
            my ($flag, $argument) = ($1, $3);
            if ($flag eq "help") {
                output $help_message;
                exit 0;
            } elsif ($flag eq "dump_input") {
                $dump_input++;
            } elsif ($flag eq "verbose") {
                $verbose++;
            } elsif ($flag eq "quiet") {
                $quiet++;
            } elsif ($flag eq "granularity") {
                if ($argument eq "line" || $argument eq "top_form") {
                    $granularity = $argument;
                }
            } elsif ($flag eq "cp_minimal") {
                $cp_minimal = $argument;
            } elsif ($flag eq "test") {
                $test = $argument;
            } elsif ($flag eq "suffix") {
                $suffix = $argument;
            } elsif ($flag eq "log") {
                $logfile = $argument;
            } elsif ($flag eq "in_place") {
                $in_place = 1;
            } else {die "Illegal flag: $flag \n"}
        } else {push @non_flags, $str;}
    }
    # Cleaning up.
    die "You specified both verbose and quiet." if $verbose && $quiet;
    push @ARGV, @non_flags;

    # fully qualify $test if it's not already
    die "You must specify a test script.\n" unless defined $test;
    if ($test !~ m"^/") {
      $test = "$this_dir/$test";
    }

    # sm: I like a usage string when I give no arguments but it doesn't
    # make sense to read interactively (stdin is a tty)
    if ((@ARGV == 0) && (-t STDIN)) {
      output $help_message;
      exit(0);
    }

    if ($in_place) {
      if (@ARGV != 1) {
        die "Must give exactly one explicit input file for -in_place."
      }
      $start_file = $ARGV[0];
    }
}

sub render_settings {
    my $out = "delta settings:\n";
    if (!$in_place) {
        $out .= "\ttemporary directory: $tmpdir\n";
    }
    $out .= "\tgranularity: $granularity\n";
    my $input_str;
    if (scalar @ARGV > 0) {
        $input_str = join " ", @ARGV;
    } else {
        $input_str = "<stdin>";
    }
    $out .= "\tinput: $input_str\n";
    return $out;
}

sub read_input_chunks {
    if ($granularity eq "line") {
        while (<>) {push @chunks, $_;} # Read one line at a time.
    } elsif ($granularity eq "top_form") {
        # Read chunks of C top-level forms.  I assume that any line
        # starting with '//# ' followed by a line that does not start
        # with a whitespace is a good boundary for a top-level form.
        # I'm sure you could do this in one line with the proper
        # setting to the regex that is the line seperator.
        my $chunk = "";
        my $a = <>;
        while (<>) {
            if ($a=~m|^//\# | and $_=~m|^\S|) {
                push @chunks, $chunk;
                $chunk = $a;
            } else {
                $chunk .= $a;
            }
            $a = $_;
        }
        $chunk .= $a;
        push @chunks, $chunk;
    } else {die "Illegal granularity setting: $granularity\n"}
}

sub dump_input {
  output "Dumping input ****************\n";
  if ($granularity eq "line") {
      foreach my $chunk (@chunks) {output $chunk;}
  } elsif ($granularity eq "top_form") {
      foreach my $chunk (@chunks) {output "\t-----\n", $chunk}
  } else {die "Illegal granularity setting: $granularity\n"}
  output "****************\n";
}

sub check_initial_input {
    die "The input must consist of at least one chunk." unless @chunks;
    start_marking();
    mark($markers[0]);
    stop_marking();
    die "\n\t**************** FAIL: The initial input does not pass the test.\n\n"
        unless test();
}

sub dump_markers {
  my $i = 0;
  foreach my $marker (@markers) {
    output "\t$i [", $marker->{start}, ", ", $marker->{stop}, "]\n";
    ++$i;
  }
}

sub increase_granularity {
  output "\nIncrease granularity\n";
  output "Before ";
  dump_markers();
  my @newmarkers = ();
  my $split_one = 0;
  my $half = 0;
  foreach my $marker (@markers) {

    #
    # pick a random line (this is useful if delta is repeatedly run on a minimized testcase,
    # it might still find something interesting if it changes its search sequence)
    # but adjust to be on an interesting border (again randomness helps to minimize testcases if delta-ed repeatedly)
    # [start,half-1][half,stop] will be the output chunks, I believe
    #
    my $random_number = rand();
    my $interval = $marker->{stop} - $marker->{start};
    my $found = -1;
    if ( $interval > 2) {
      $half = int($marker->{start} + $random_number * ($interval));
      if ( $half == $marker->{start} ) {
        $half = $half + 1;
      }
    } else {
      $half = int (($marker->{start} + $marker->{stop}) / 2);
    }

    # try a match on a likely module boundary
    if ( $found < 0 and rand()>0.1 ) {
      for (my $i=$half; $i>$marker->{start}; --$i) {
          if ( $chunks[$i] =~ /[ |^]module /i ) {
             # looks like an 'end module statement', we want it in the first chunk
             if ( $chunks[$i] =~ /end\smodule/i ) {
               if ( $i+1 < $marker->{stop} ) {
                  $found = $i+1;
                  last;
               }
             } else {
               $found = $i;
               last;
             }
          }
      }
      if ( $found > 0 ) {
         # output "found module boundary"
      }
    }

    # try a match on a likely subroutine boundary
    if ( $found < 0 and rand()>0.1 ) {
      for (my $i=$half; $i>$marker->{start}; --$i) {
          if ( $chunks[$i] =~ /[ |^]subroutine /i ) {
             # looks like an 'end subroutine statement', we want it in the first chunk
             if ( $chunks[$i] =~ /end\ssubroutine/i ) {
               if ( $i+1 < $marker->{stop} ) {
                  $found = $i+1;
                  last;
               } 
             } else {
               $found = $i;
               last;
             }
          }
      }
      if ( $found > 0 ) {
         # output "found sub boundary"
      }
    }

    # try a match on a likely IF/ENDIF boundary
    if ( $found < 0 and rand()>0.1 ) {
      for (my $i=$half; $i>$marker->{start}; --$i) {
          if ( $chunks[$i] =~ /if/i ) {
             # looks like an 'end if statement', we want it in the first chunk
             if ( $chunks[$i] =~ /end\sif/i ) {
               if ( $i+1 < $marker->{stop} ) {
                  $found = $i+1;
                  last;
               } 
             } else {
               $found = $i;
               last;
             }
          }
      }
      if ( $found > 0 ) {
         # output "found if boundary"
      }
    }

    # try a match on a likely DO/ENDDO boundary
    if ( $found < 0 and rand()>0.1 ) {
      for (my $i=$half; $i>$marker->{start}; --$i) {
          if ( $chunks[$i] =~ /do/i ) {
             # looks like an 'end if statement', we want it in the first chunk
             if ( $chunks[$i] =~ /end\sdo/i ) {
               if ( $i+1 < $marker->{stop} ) {
                  $found = $i+1;
                  last;
               }
             } else {
               $found = $i;
               last;
             }
          }
      }
      if ( $found > 0 ) {
         # output "found do boundary"
      }
    }

    # just replace by the found stuff if useful
    if ( $found > 0 and rand()>0.1 ) {
       $half = $found;
    }

    if ($half == $marker->{start} or $half == $marker->{stop}) {
      push @newmarkers, $marker;
    } else {
      ++$split_one;
      # output " Cutting between $chunks[$half-1] and $chunks[$half]";
      push @newmarkers, {start=>$marker->{start}, stop=>$half};
      push @newmarkers, {start=>$half, stop=>$marker->{stop}};
    }
  }
  @markers = @newmarkers;
  output "After ";
  dump_markers();
  output "\n";
  return $split_one;
}

sub dhms_from_seconds {
    my ($total_seconds) = @_;
    my $sec = $total_seconds % 60;

    my $total_minutes = ($total_seconds - $sec) / 60;
    die unless $total_minutes == (int $total_minutes);
    my $min = $total_minutes % 60;

    my $total_hours = ($total_minutes - $min) / 60;
    die unless $total_hours == (int $total_hours);
    my $hours = $total_hours % 24;

    my $days = ($total_hours - $hours) / 24;
    die unless $days == (int $days);

    return ($days, $hours, $min, $sec);
}

sub timestamp {
    my $now = time;             # Get a timestamp in seconds.
    my $elapsed = $now - $starttime; # Make relative to start time.
    my ($d,$h,$m,$s) = dhms_from_seconds($elapsed); # Convert to more familiar format.
    my $elapsed_dhms = sprintf("%02d:%02d:%02d", $h, $m, $s); # Format.
    if ($d > 0) {
        my $day_str = "$d day";
        $day_str .= "s" if $d > 1;
        $day_str .= ", ";
        $elapsed_dhms = $day_str . $elapsed_dhms;
    }
    my $timestr = scalar localtime($now); # Format as abolute.
    return sprintf("%d sec/%s\t%s", $elapsed, $elapsed_dhms, $timestr);
}

sub log_msg {
    my ($message) = @_;
    open LOG, ">>${logfile}" or die $!;
    print LOG $message, "\n";
    close LOG or die $!;
}

sub log_msg_time {
    my ($message) = @_;
    log_msg(sprintf("%-39s %s", $message, timestamp()));
}

sub done {
    output "Could not increase granularity; we are done.\n";
    output "A log of successful runs is in ${logfile}\n";
    if (defined $cp_minimal) {
        output "Copying minimal run to $cp_minimal\n";
        die "cp failed" if system "cp ${tmpdir}/${last_successful_tmpinput} $cp_minimal";
    }
    if ($in_place) {
        die "cp failed" if system("cp $last_successful_tmpinput $start_file");
    }
    log_msg_time("delta done");
    exit 0;
}

# Main ****************

parse_command_line();
select_tmpdir() unless $in_place;
if (!$in_place) {
  $logfile = "${tmpdir}/$logfile" if $logfile!~m|^/|; # Make absolute.
}
my $settings = render_settings();
log_msg($settings);
if ($verbose) {
    output "\nDelta debugging algorithm, implemented by Daniel S. Wilkerson.\n";
    output $settings, "\n";
}
log_msg_time("delta start");

read_input_chunks();
dump_input() if $dump_input;
$markers[0] = {start=>0, stop=>(scalar @chunks)}; # Initialize one marker.
check_initial_input();          # This is a vital step!  Don't omit it!

big_loop: {

    # NOTE: this paragraph is part of the strict delta algorithm, but
    # it is not actually necessary, so by default I implement
    # something that is a little different from the published
    # algorithm.  Un-comment this paragraph to have the algorithm
    # strictly as published.

    # Test the single markers.
#      foreach my $test_marker (@markers) {
#          start_marking();
#          mark($test_marker);
#          stop_marking();
#          if (test()) {
#              @markers = ($test_marker); # Get rid of all markers but this one.
#              if (increase_granularity()) {redo big_loop;}
#              else {done()}
#          }
#      }

    # Test the complements to single markers.
  complement_loop: {
        my %excluded = ();
        # Try them in reverse.  In both the above "positive" loop and
        # this "negative" loop, the things you are throwing away start
        # at the end of the data, thus the two strategies are
        # consistent.
        foreach my $excluded_marker (reverse @markers) {
            start_marking();
            foreach my $marker (@markers) {
                next if $marker eq $excluded_marker;
                next if $excluded{$marker};
                mark($marker);
            }
            stop_marking();
            if (test()) {
                die "Can't happen" if $excluded{$excluded_marker};
                $excluded{$excluded_marker}++;
            }
        }
        # If any were excluded, record this fact into @markers.
        my @excluded_keys = keys %excluded;
        if (@excluded_keys) {
            @markers = grep {!$excluded{$_}} @markers;
            redo complement_loop; # Retry at the same granularity.
        }
    }

    # None of them worked, increase the granularity.
    if (increase_granularity()) {redo big_loop;}
    else {done()}
}