#!/usr/bin/perl
use warnings;
use strict;
#use Devel::Profiler; # This, rather impressively, manages to confuse XML::Simple's constructor
use XML::Simple qw(:strict);

# One parser is shared by both inputs: <page> elements are keyed by their
# 'title' attribute and ForceArray is switched on for nested elements.
my $xp = XML::Simple->new(
	ForceArray => 1,
	KeyAttr    => { page => 'title' },
);

# Two filenames are required: the 'actual' summary first, then the one produced
# with the heuristic under test.
my ($one, $two) = @ARGV;
unless(defined $one and defined $two) {
	die "Provide two XML summary filenames based on exactly the same data, but with different string heuristics; 'actual' first.\n";
}

# Only the 'page' subtree of each document is kept.
# Resulting shape: { 'AmericanSamoa' => { 'change' => [ { 'diffmag' => '10781', ...
print STDERR "Loading $one...\n";
my $xml1 = $xp->XMLin($one)->{'page'};
print STDERR "Loading $two...\n";
my $xml2 = $xp->XMLin($two)->{'page'};

# Running totals, updated while the pages are compared.
my ($samples, $miscats, $meanerr, $infs) = (0, 0, 0, 0);
# Records of the largest absolute (maxover/maxunder) and relative
# (maxovererr/maxundererr) over- and underestimates; filled in by testmax.
my (%maxover, %maxunder, %maxovererr, %maxundererr);

# Column headings for the per-change rows printed during the comparison.
print "[num]\tActual\t%\tSampled\t%\tSize\tError\n";

# Compare the two summaries change by change.
#
# For every page of the 'actual' file ($xml1), each change not flagged as abuse
# is paired with the change at the same index in the 'sampled' file ($xml2).
# The relative error of the sampled diff magnitude is computed, noteworthy rows
# are printed, and the running totals ($samples, $miscats, $meanerr, $infs and
# the four max records) are updated. Any structural disagreement between the
# files abandons the rest of that page with a warning.
page: foreach my $page (keys %$xml1) {
	print "  $page\n";
	my $changes1 = $xml1->{$page}->{'change'};
	my $changes2 = $xml2->{$page}->{'change'};
	unless(defined $changes1 and defined $changes2) { warn "DATA MISMATCH for $page\n"; next; }
#	unless(ref $changes1 eq 'ARRAY') {
#		warn "Weird data structure!\n";
#		use Data::Dumper; print STDERR Dumper($changes1);
#		next;
#	}
	my $changecount = scalar @$changes1;
	for(my $cnum = 0; $cnum < $changecount; $cnum++) {
		# Changes marked as abuse in the 'actual' file are excluded from all statistics
		next if $changes1->[$cnum]->{'abuse'};
		my $actual  = $changes1->[$cnum]->{'diffmag'};
		my $sampled = $changes2->[$cnum]->{'diffmag'};
		unless(defined $actual and defined $sampled) { warn "DATA MISMATCH for $page change $cnum\n"; next page; }
		unless($changes1->[$cnum]->{'revno'} == $changes2->[$cnum]->{'revno'}) { warn "DATA MISMATCH for $page change $cnum by revno\n"; next page; }

		my $diff = $sampled - $actual;
		# The relative error is unbounded when something was sampled for a change
		# whose actual magnitude is zero. That case is carried as a flag rather
		# than as an 'inf' value, so it never enters any arithmetic below.
		my $inferr = ($actual == 0 and $sampled != 0);
		my $error  = ($actual == 0) ? 0 : (abs($diff) / $actual);

		my $size = $changes2->[$cnum]->{'newsize'}; # Read from second as first is probably old lev and won't have it
		$size = 0 unless defined $size;             # A missing size is treated like a zero size
		# Magnitudes as a proportion of the article size. Dividing both values by
		# the same size leaves the relative error unchanged, so no separate
		# "proportional error" is computed.
		my $propact = $size == 0 ? 0 : $actual  / $size;
		my $propsam = $size == 0 ? 0 : $sampled / $size;
		# Miscategorised: actually at most 5% of the article, but sampled as more than 5%
		my $miscat = ($propact <= 0.05 and $propsam > 0.05);
		$miscats++ if $miscat;

		# Only rows that are badly off (error above 0.5, or unbounded) or miscategorised are listed
		if($inferr or $error > 0.5 or $miscat) {
			print "[$cnum]\t$actual\t";
			printf("%7.3f", $propact);
			print "\t$sampled\t";
			printf("%7.3f", $propsam);
			print "\t$size\t";
			if($inferr) {
				# No star bar here: a bar length cannot be derived from an unbounded error
				printf("%7s ", 'Inf');
			} else {
				printf("%7.3f ", $error);
				print '*' x ($error / 10);
			}
			print ' mc' if $miscat;
			print "\n";
		}

		if($inferr) {
			$infs++;
		} else {
			# Running mean over the finite-error samples only. Samples with an
			# unbounded error are counted in $samples and $infs but contribute
			# no value, so they must not be part of the divisor either.
			my $finite = $samples - $infs;
			$meanerr = ($meanerr * $finite + $error) / ($finite + 1);
			if($diff > 0) {
				testmax(\%maxover,    $diff,  $page, $cnum);
				testmax(\%maxovererr, $error, $page, $cnum);
			} else {
				testmax(\%maxunder,    -$diff, $page, $cnum);
				testmax(\%maxundererr, $error, $page, $cnum);
			}
		}
		$samples++;
	}
}

# Final summary of the totals gathered during the comparison. Each "Maximum"
# line shows the largest relative error followed by the largest absolute
# difference, each with the page and change index where it occurred.
print  "Samples              : $samples\n";
printf "Mean error           : %.4f\n", $meanerr;
print  "Miscategorisations   : $miscats\n";
printf "Maximum overestimate : %s / %s\n", strmax(\%maxovererr),  strmaxint(\%maxover);
printf "Maximum underestimate: %s / %s\n", strmax(\%maxundererr), strmaxint(\%maxunder);
# Plain print: an interpolated string must not double as a printf format
print  "Infinite error count : $infs\n";

exit 0;

# testmax(\%record, $value, $page, $cnum)
#
# Keeps %record pointing at the largest value offered so far. A record that has
# no 'val' yet is first seeded with val 0, an empty page and change number 0;
# it is then overwritten only when $value is strictly greater than the stored
# 'val', so ties keep the earlier entry.
sub testmax {
	my ($best, $candidate, $where, $index) = @_;

	@{$best}{'val', 'page', 'cnum'} = (0, '', 0)
		unless defined $best->{'val'};

	return unless $candidate > $best->{'val'};

	@{$best}{'val', 'page', 'cnum'} = ($candidate, $where, $index);
}

# strmax(\%record)
#
# Formats a max record as "VAL (PAGE[CNUM])" with VAL shown to four decimal
# places. A record that testmax never touched has no keys at all; it is
# rendered as the zero record testmax would have seeded (val 0, empty page,
# change 0) instead of triggering uninitialized-value warnings. The record
# itself is not modified.
sub strmax {
	my ($max) = @_;
	my $val  = defined $max->{'val'}  ? $max->{'val'}  : 0;
	my $page = defined $max->{'page'} ? $max->{'page'} : '';
	my $cnum = defined $max->{'cnum'} ? $max->{'cnum'} : 0;
	return sprintf('%.4f (%s[%s])', $val, $page, $cnum);
}

# strmaxint(\%record)
#
# Formats a max record as "VAL (PAGE[CNUM])" with VAL printed as stored (no
# decimal formatting). A record that testmax never touched has no keys at all;
# it is rendered as the zero record testmax would have seeded (val 0, empty
# page, change 0) instead of triggering uninitialized-value warnings. The
# record itself is not modified.
sub strmaxint {
	my ($max) = @_;
	my $val  = defined $max->{'val'}  ? $max->{'val'}  : 0;
	my $page = defined $max->{'page'} ? $max->{'page'} : '';
	my $cnum = defined $max->{'cnum'} ? $max->{'cnum'} : 0;
	return $val.' ('.$page.'['.$cnum.'])';
}

