#!/usr/bin/perl
use warnings;
use strict;
use CatAggr;

# Category aggregator
# These tools are Free Software, licensed under the MIT license.
# Copyright 2007, Philip Boulain. See LICENSE.TXT for details.

# Advance warning: this script is not pretty. It's just clean enough to
# fulfill my needs and withstand a little bit of extension without fragility.
# Proper extensibility or neatness (witness the globals) cannot be found here.

# Histogram bucket widths for the generated graph data: the first applies to
# per-user edit counts, the second to per-edit magnitudes.
use constant {
	EDIT_BUCKET_SIZE     => 10,
	EDIT_MAG_BUCKET_SIZE => 100,
};

# Optional first command-line argument: the infix used in generated data file
# names. Graph data is only written when it is supplied.
my ($infix) = @ARGV;
defined $infix or warn "Must provide infix to enable graph generation.\n";

# Number of pages visited so far
my $pagecount = 0;

# Edit count per user, keyed by the user's serialised (stringified) form
my %useredits;

# Sets of users who made each kind of edit. Every inner hash maps a user to 1;
# only the keys matter (this gives a set rather than a bag).
# 'abuse' here means abuse other than reverts.
my %userbycat = map { ( $_ => {} ) }
	qw(abuse revert listof major templ categ plinks ulinks);

# Edit totals per category, all users. Besides the plain categories there are
# derived ones, which may overlap and therefore are not simple sums:
#   abuse  : abusive edits other than reverts
#   catorl : categ or listof (so != categ + listof)
#   anylnk : plinks or ulinks
#   alnkcn : anylnk and not major
#   hyper  : catorl or anylnk or templ
#   minor  : none of the individual categories
my %totals = map { ( $_ => 0 ) } qw(
	all
	abuse revert listof major templ categ plinks ulinks
	catorl anylnk alnkcn hyper minor
);

# The same totals restricted to registered users (unregistered = total - reg.)
my %totalsreg = %totals;

# Printable, column-padded names for the totals; only categories listed here
# are printed in the statistics table.
my %totalnames = (
	all    => 'All      ',
	abuse  => 'Abuse    ',
	revert => 'Revert   ',
	listof => 'List of  ',
	major  => 'Content  ',
	templ  => 'Template ',
	categ  => 'Category ',
	plinks => 'Page link',
	ulinks => 'URL link ',
	catorl => 'Indexing ',
	anylnk => 'Links    ',
	alnkcn => 'Link only',
	hyper  => 'Hyper    ',
	minor  => 'Minor    ',
);

# Edit magnitude => number of edits of that magnitude, split by whether the
# editing user is registered
my %edmagreg;
my %edmagunreg;

{ # Scope of full data: the loaded page hash is released when this block ends
print STDERR "Loading";
# WARNING: The '-' argument is actually ignored, as parse_uri doesn't grok it. It's hardcoded to STDIN for now.
my $pages = CatAggr->load('-')->pagehash();

# Count one edit under category $key: always in the overall total, and in the
# registered-only total when $registered is 1 (it is passed as 0 or 1).
my $tally = sub {
	my ($key, $registered) = @_;
	$totals{$key}++;
	$totalsreg{$key} += $registered;
};
# As $tally, but also remember $user in the set of users for that category.
my $tallyuser = sub {
	my ($key, $registered, $user) = @_;
	$tally->($key, $registered);
	$userbycat{$key}->{$user} = 1;
};

print STDERR "Aggregating";
foreach my $pname (keys %$pages) {
	# Progress indicator: one dot per ten pages
	if(!($pagecount % 10)) { print STDERR '.'; } # "\t$pname\n"; # Rather too verbose
	$pagecount++;
	my $page = CatAggr::Page->wrap($pages->{$pname});
	# "List of" is a property of the page title, so it is the same for every
	# change to this page.
	my $listof = ($pname =~ /^List of /);

	foreach my $rawchange (@{$page->changearr()}) {
		my $change     = CatAggr::Change->wrap($rawchange);
		my $user       = $change->byuser();
		my $abuse      = $change->abuse();
		my $diffmag    = $change->diffmag();
		my $registered = $user->registered() ? 1 : 0;

		# Count all changes
		$tally->('all', $registered);

		# Count edits for each user (the user is stringified as the hash key)
		$useredits{$user}++;

		if($abuse) {
			# Abusive edits are only split into reverts and everything else;
			# they are not classified by content category.
			my $k = ($abuse == CatAggr::Change::ABUSE_REVERT) ? 'revert' : 'abuse';
			$tallyuser->($k, $registered, $user);
		} else {
			# Accessors are called in scalar context, as flags
			my $majortext  = $change->majortext();
			my $templates  = $change->templates();
			my $categories = $change->categories();
			my $pagelinks  = $change->pagelinks();
			my $urllinks   = $change->urllinks();

			# Number of edits of this magnitude
			($registered ? \%edmagreg : \%edmagunreg)->{$diffmag}++;

			# Individual categories: totals plus per-category user sets
			$tallyuser->('listof', $registered, $user) if $listof;
			$tallyuser->('major',  $registered, $user) if $majortext;
			$tallyuser->('templ',  $registered, $user) if $templates;
			$tallyuser->('categ',  $registered, $user) if $categories;
			$tallyuser->('plinks', $registered, $user) if $pagelinks;
			$tallyuser->('ulinks', $registered, $user) if $urllinks;

			# Union totals: these aren't collected as a by-user statistic
			my $catorl = ($listof or $categories);
			my $anylnk = ($pagelinks or $urllinks);
			my $hyper  = ($catorl or $anylnk or $templates);
			$tally->('catorl', $registered) if $catorl;
			$tally->('anylnk', $registered) if $anylnk;
			$tally->('alnkcn', $registered) if $anylnk and !$majortext;
			$tally->('hyper',  $registered) if $hyper;
			# Minor: the edit fell into none of the individual categories
			$tally->('minor',  $registered) unless $hyper or $majortext;
		}
	}
}
print STDERR "\n\n";
} # Scope of full data

# Dump the raw counters gathered during aggregation to standard output.
print "RAW AGGREGATIONS\n";
print "\t$totals{all} changes in $pagecount pages processed\n";

my $nusers = keys %useredits;
print "\t$nusers unique users\n";
my $nreg = @{CatAggr::User->allreg()};
print "\t\t$nreg of which are registered\n";
my $nunreg = @{CatAggr::User->allunreg()};
print "\t\t$nunreg of which are unregistered\n";

# Distinct users per edit category, in a fixed reporting order
my @catreport = (
	[ abuse  => " users made abusive edits\n"         ],
	[ revert => " users reverted edits\n"             ],
	[ listof => " users edited 'List of' pages\n"     ],
	[ major  => " users made major text changes\n"    ],
	[ templ  => " users changed templates included\n" ],
	[ categ  => " users changed page categories\n"    ],
	[ plinks => " users changed page links\n"         ],
	[ ulinks => " users changed URL links\n"          ],
);
foreach my $entry (@catreport) {
	my ($cat, $text) = @$entry;
	my $n = keys %{$userbycat{$cat}};
	print "\t", $n, $text;
}

# Every total on a single line, smallest first
print "\tTotals";
print ": $_=$totals{$_}" for sort { $totals{$a} <=> $totals{$b} } keys %totals;
print "\n";

my $nmagreg   = keys %edmagreg;
my $nmagunreg = keys %edmagunreg;
print "\t$nmagreg unique edit magnitudes from registered users\n";
print "\t$nmagunreg unique edit magnitudes from unregistered users\n";
print "\n";

# Post-aggregation additional result mashing: write bucketed (histogram) data
# files for plotting. Only done when an infix was given on the command line.
if(defined $infix) { print STDERR "Generating graph data...\n";

# Build the data file name for a named series: ag-<infix>-<series>.dat
my $genfname = sub { my ($series) = @_;
	return "ag-$infix-$series.dat";
};

# Write one histogram to the data file for series $fname.
#   $buckets    - array ref of counts; undefined (never hit) slots count as 0
#   $bucketsize - width of each bucket
# Each output line is "<bucket midpoint>\t<count>". Dies if the file cannot be
# opened, written or closed.
my $writebuckets = sub { my ($fname, $buckets, $bucketsize) = @_;
	$fname = $genfname->($fname);
	print STDERR "\tWriting $fname...\n";
	open(my $gpdata, '>', $fname) or die "Cannot open '$fname' for writing: $!";
	foreach my $i (0 .. $#$buckets) {
		my $val = $buckets->[$i]; # Sparse data
		$val = 0 unless defined $val;
		my $midpoint = ($i + 0.5) * $bucketsize;
		print {$gpdata} "$midpoint\t$val\n" or die "Cannot write to '$fname': $!";
	}
	# Buffered write errors only surface at close, so check it
	close($gpdata) or die "Cannot close '$fname': $!";
};

# Count edits for each user
   # invert the %useredits hash (with buckets)
# Histogram the edit counts of the given users and write them as series $fname.
# The users may be passed either as a flat list of serialised users or as a
# single array reference. Dies on a user with no entry in %useredits.
my $buckettheedits = sub { my ($fname, @keys) = @_;
	if(@keys and ((ref $keys[0]) eq 'ARRAY')) { @keys = @{$keys[0]}; }
	my @userbuckets;
	foreach my $suser (@keys) { # (suser == serialised user)
		my $ecount = $useredits{$suser};
		die "Barfed on key '$suser'" unless defined $ecount;
		$userbuckets[int($ecount / EDIT_BUCKET_SIZE)]++;
	}
	$writebuckets->($fname, \@userbuckets, EDIT_BUCKET_SIZE);
};
$buckettheedits->('editsusersall', keys %useredits);
#  Group by reg/unreg
   # invert the %useredits hash (with buckets), using User->allreg and User->allunreg as keys
$buckettheedits->('editsusersreg', CatAggr::User->allreg());
$buckettheedits->('editsusersunreg', CatAggr::User->allunreg());
#  Group by type
   # invert the %useredits hash (with buckets), using userbycat->{...} keys as keys
foreach my $type (keys %userbycat) {
	$buckettheedits->("editsuserscat$type", keys %{$userbycat{$type}});
}

# Bucket into edit magnitude
#   By reg/unreg, plus a combined series fed by both
# NOTE(review): assumes magnitudes are non-negative; a negative one would give
# a negative bucket index -- confirm against CatAggr::Change->diffmag().
my (@magbucketsall, @magbucketsreg, @magbucketsunreg);
foreach my $series ([\%edmagreg, \@magbucketsreg], [\%edmagunreg, \@magbucketsunreg]) {
	my ($counts, $ownbuckets) = @$series;
	foreach my $mag (keys %$counts) {
		my $ecount = $counts->{$mag};
		my $bucket = int($mag / EDIT_MAG_BUCKET_SIZE);
		$magbucketsall[$bucket] += $ecount;
		$ownbuckets->[$bucket]  += $ecount;
	}
}
$writebuckets->('editmagsall',   \@magbucketsall,   EDIT_MAG_BUCKET_SIZE);
$writebuckets->('editmagsreg',   \@magbucketsreg,   EDIT_MAG_BUCKET_SIZE);
$writebuckets->('editmagsunreg', \@magbucketsunreg, EDIT_MAG_BUCKET_SIZE);

print STDERR "\n"; }

# Simple numerical statistics of note (with scope to prevent variable clutter)
{ print "SIMPLE NUMERICAL STATISTICS\n";
# Totals of each category (and overall); synth category "List of "
#   By reg/unreg
print "\tType     \tTotal\tReg\tUnreg\n";
foreach my $cat (sort {$totals{$a} <=> $totals{$b}} keys %totalnames) {
	my $t = $totals{$cat}; my $r = $totalsreg{$cat}; my $u = $t - $r;
	print "\t".($totalnames{$cat})."\t$t\t$r\t$u\n";
}
print "\n";

# Quotient helper: yields undef instead of dying when the denominator is zero
# (no changes at all, or no content edits) or the numerator is itself undefined.
my $quot = sub { my ($num, $den) = @_;
	return (defined($num) and $den) ? $num / $den : undef;
};
# Formatters that print an undefined quotient as a right-aligned "n/a", padded
# to the same width as the numeric forms.
my $na = sprintf('%7s', 'n/a');
my $pc = sub { my ($v) = @_; return defined($v) ? percent($v) : $na; };
my $rt = sub { my ($v) = @_; return defined($v) ? ratio($v)   : $na; };

# Fancy ratio things: each is a fraction of all changes
my $all         = $totals{all};
my $pclinks     = $quot->($totals{anylnk}, $all);
my $pclinksonly = $quot->($totals{alnkcn}, $all);
my $pccontent   = $quot->($totals{major},  $all);
my $pchyper     = $quot->($totals{hyper},  $all);
# %age of changes which are of categories
print "\tEdits which are just minor noise   : ".$pc->($quot->($totals{minor},  $all))."\n";
print "\tEdits which are (re)categorisations: ".$pc->($quot->($totals{categ},  $all))."\n";
print "\tEdits which are maintaining List Of: ".$pc->($quot->($totals{listof}, $all))."\n";
print "\t            Either (total overhead): ".$pc->($quot->($totals{catorl}, $all))."\n";
# % of changes which are of links vs % of changes which are of content
print "\tEdits which are of links           : ".$pc->($pclinks)."\n";
print "\tEdits which are of links, not cntnt: ".$pc->($pclinksonly)."\n";
print "\tEdits which are of content         : ".$pc->($pccontent)."\n";
print "\t   Fraction of effort links/content: ".$rt->($quot->($pclinks,     $pccontent))."\n";
print "\t   Fraction of eff lnk only/content: ".$rt->($quot->($pclinksonly, $pccontent))."\n";
# % of changes to hyperstructure (any link, cat.) vs content (maj.); omit trans?
print "\tEdits which are of hyperstructure  : ".$pc->($pchyper)."\n";
print "\t   Fraction of effort hyper/content: ".$rt->($quot->($pchyper,     $pccontent))."\n";
}

# End of the main program; only subroutine definitions follow.
exit 0;

# Format a number as a fixed-point ratio: five decimal places, padded to a
# minimum width of seven characters.
sub ratio {
	my ($value) = @_;
	return sprintf('%7.5f', $value);
}
# Format a fraction (1 == 100%) as a percentage: two decimal places, the number
# padded to a minimum width of six characters, followed by a literal '%'.
sub percent {
	my ($fraction) = @_;
	my $scaled = $fraction * 100;
	return sprintf('%6.2f%%', $scaled);
}

