#!/usr/bin/perl -w
#
# duplocForCPP.pl - detect duplicated lines of code (algorithm only)
#
# Takes source code (or other) files and collects all line numbers of
# lines equal to each other within these files. The algorithm is linear
# (in space and time) in the number of input lines.
# Removes C++ comments (Attention: gets confused if comment signs are
# found in literal strings!)
#
# Synopsis: duplocForCPP.pl filename [filename, ...]
#
# Filtering options have to be set in the code below.
#
# Output: Lists of CloneClasses,
#         each detailing the copied fragment and the locations
#         where the fragment is found.
#
# How it works: every input line is stripped of comments and white space;
# the last $slidingWindowSize surviving lines of the current file are
# concatenated and used as a hash key.  All windows that produce the same
# key form one clone class.
#
###########################################################################
# Author: Matthias Rieger
# History: - Feb 25, 2006
#            added minimal length in characters, plus some report niceties
###########################################################################
# $Id: simpleDude.pl,v 2006/11/15 13:00:28 rieger Exp $
###########################################################################

use strict;
use warnings;

# Options to be set by user
my $slidingWindowSize           = 10;   # number of lines per comparison
my $removeKeywords              = 0;    # if 1, keywords from list below are removed
my $equivalenceClassMinimalSize = 2;    # how many copies until we report the clone?
my $fragmentMinimalCharSize     = 50;   # fragment size which we report

# strings to be removed from the code, pre-comparison
my @keywords = qw(if then else for { } );

# lines which will be skipped
my @unwantedLines = qw(else return return; return result; }else{ { } ; );
@unwantedLines = qw();    # empty list cancels line skipping

#######################################################################
# Nothing to be changed below this line

die "slidingWindowSize must be at least 1\n" if $slidingWindowSize < 1;

# The keywords are literal strings, not patterns: escape them so that
# '{' and '}' are not interpreted as regex syntax.  Compiled once.
my $keywordsRegEx;
if (@keywords) {
    my $alternatives = join '|', map { quotemeta } @keywords;
    $keywordsRegEx = qr/$alternatives/;
}

# Lookup table of normalised lines that never enter the window.  The
# keywords are always part of it, even when $removeKeywords is 0, so a
# line consisting only of e.g. '{' is skipped (but still counted as code).
my %isUnwantedLine = map { $_ => 1 } @unwantedLines, @keywords;

my $totalLines = 0;    # every line read
my $emptyLines = 0;    # lines that are empty once comments/white space are gone
my $codeLines  = 0;    # all other lines, including the unwanted ones

my @currentLines   = ();    # normalised text of the lines in the window
my @currentLineNos = ();    # their line numbers, parallel to @currentLines
my %eqLines        = ();    # concatenated window => list of locations
my $inComment      = 0;     # true while inside a /* */ comment

#my $startTime = (times)[0];

# go over all input files
while (my $line = <>) {
    chomp $line;
    $totalLines++;

    # remove comments of type /* */ (which may span several lines)
    my $codeOnly = '';
    while (1) {
        if ($inComment) {
            last unless $line =~ m{\*/};
            $line      = substr $line, $+[0];    # keep what follows the '*/'
            $inComment = 0;
        }
        else {
            last unless $line =~ m{/\*};
            $codeOnly .= substr $line, 0, $-[0];    # code in front of the '/*'
            $line      = substr $line, $+[0];
            $inComment = 1;
        }
    }
    $codeOnly .= $line unless $inComment;

    $codeOnly =~ s{//.*$}{};    # remove comments of type //
    $codeOnly =~ s/\s+//g;      # remove white space
    $codeOnly =~ s/$keywordsRegEx//g    # remove keywords
        if $removeKeywords && defined $keywordsRegEx;

    # Skip empty and unwanted lines.  The counters are updated in separate
    # statements: using '$counter++' as part of the condition made the
    # test fail whenever the counter was still 0.  The comparison is
    # against '' so that a line consisting of '0' counts as code.
    if ($codeOnly eq '') {
        $emptyLines++;
        next;
    }
    if (exists $isUnwantedLine{$codeOnly}) {
        $codeLines++;
        next;
    }

    $codeLines++;
    push @currentLines,   $codeOnly;
    push @currentLineNos, $.;
    if (@currentLines > $slidingWindowSize) {
        shift @currentLines;
        shift @currentLineNos;
    }

    # Only complete windows are compared.
    next if @currentLines < $slidingWindowSize;

    my $lineToBeCompared = join '', @currentLines;

    # Location: filename, then the zero-padded line numbers.  The padding
    # enables lexical sorting later on.
    my $lineNumbersCompared = "<$ARGV>"
        . join('/', map { sprintf '%07d', $_ } @currentLineNos);

    push @{ $eqLines{$lineToBeCompared} }, $lineNumbersCompared;
}
continue {
    # In a continue block so that it also runs when the last line of a
    # file was skipped with 'next'.
    if (eof) {
        close ARGV;              # Reset line number count for next file
        $inComment      = 0;     # Reset indicator for /* */ comments
        @currentLines   = ();    # A window never spans two files
        @currentLineNos = ();
    }
}

#my $processingTime = (times)[0] - $startTime;

# print the equivalence classes
my $numOfMarkedEquivClasses = 0;
my $numOfMarkedFragments    = 0;

# Select the classes worth reporting before sorting them.
my @reportedFragments;
for my $samelines (keys %eqLines) {
    next if @{ $eqLines{$samelines} } < $equivalenceClassMinimalSize;
    next if length($samelines) < $fragmentMinimalCharSize;
    push @reportedFragments, $samelines;
}

# Shortest fragment first; the string comparison breaks ties so that the
# class numbering is the same on every run.
for my $samelines (
    sort { length($a) <=> length($b) || $a cmp $b } @reportedFragments )
{
    my @locations = @{ $eqLines{$samelines} };

    $numOfMarkedEquivClasses++;
    $numOfMarkedFragments += @locations;
    printf "------\nCloneClass #%d: %d Members\n",
        $numOfMarkedEquivClasses, scalar @locations;

    # Format the concatenated lines in a somewhat nice way: break after
    # every ';' (the look-behind keeps the semicolons in the output and
    # does not add one to a piece that had none).
    my $pos = 0;
    for my $statement (split /(?<=;)/, $samelines) {
        print $pos++ == 0 ? " Code: $statement\n" : " $statement\n";
    }

    $pos = 0;
    for my $location (sort @locations) {
        my $shown = $location;

        # Remove the zero padding now that sorting is done.  Only the
        # trailing line number list is touched, never the filename.
        if ( my ($file, $lineNos) = $location =~ m{\A(<.*>)([\d/]+)\z}s ) {
            $shown = $file . join('/', map { $_ + 0 } split m{/}, $lineNos);
        }
        print $pos++ == 0 ? " Loc.: $shown\n" : " $shown\n";
    }
}

print "\n\n\n";
printf "Processed: %7d lines\n", $totalLines;
printf "Code: %7d lines\n", $codeLines;
printf "Empty/Comment: %7d lines\n", $emptyLines;
#printf "Scanning time: %7.2f sec (%.f lines/sec)\n",$processingTime,$codeLines/$processingTime;
print "--------------------------------------------\n";
printf "Sliding window size: %5d lines\n", $slidingWindowSize;
printf "Equiv-class lower bound: %5d members\n", $equivalenceClassMinimalSize;
printf "Fragment size lower bound: %5d chars\n", $fragmentMinimalCharSize;
printf "Total equivalence classes: %5d\n", scalar keys %eqLines;
printf "Reported equivalence classes: %5d\n", $numOfMarkedEquivClasses;
printf "Reported Fragments: %5d\n", $numOfMarkedFragments;