/* These tools are Free Software, licensed under the MIT license. *
 *  Copyright 2007, Philip Boulain. See LICENSE.TXT for details.  */
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <libxml/xmlwriter.h>
#include "mediawikiformat.h"
#include "mediawikimarkup.h"

/* When defined, every <change> element also gets a "revno" attribute holding
 * the per-page revision counter. That counter saturates at 255, so later
 * revisions of a long page all carry the same number.
 * This is actually fairly useful for finding things in the file. */
#define DEBUG_REVNO

/* Cast prefix from a C string (literal or char pointer) to the const xmlChar*
 * taken by the libxml2 writer calls. Written directly in front of the
 * expression, e.g. XMLLIT "page". */
#define XMLLIT (const xmlChar*)

/* Type for holding categorisation information about a revision.
 * 'abuse' is the discriminator for the 'cat' union: when true only cat.abu
 * is meaningful, when false only cat.norm is. */
typedef struct {
	bool abuse;
	union {
		struct {
			/* Which kind of abuse was detected; the value is also used as
			 * an index into abuse_names for output.
			 * Spam, profanity, etc. not detected */
			enum { ABUSE_REVERT = 0, ABUSE_BLANKING } type;
		} abu;
		struct {
			/* Plaintext changed by more than major_thresh (always true
			 * for the first good revision of a page). */
			bool majortext;
			/* Each of the following flags records whether that bag of
			 * markup items differs from the previous good revision (or,
			 * for the first good revision, whether it is non-empty). */
			bool templates;
			bool categories;
			bool pagelinks;
			bool urllinks;
			/* Difference magnitude: distance from the previous good
			 * revision's parsed plaintext as reported by upstr_distance
			 * (for the first good revision, the parsed text length). */
			size_t diffmag;
		} norm;
	} cat;
} changes;
/* enum-to-string lookup array for output. Indexed by the abuse 'type' enum of
 * the changes structure, so the entries must stay in the same order as
 * ABUSE_REVERT, ABUSE_BLANKING. */
static const char* abuse_names[] = {"revert", "blanking"};

/* Buffers of the incoming text, for manipulation and limited history.
 * Used as a ring of three: current, previous and previous-previous. */
static upstr textbuf[3];
/* Indices of which text buffers hold the current, previous, and
 * previous-previous revisions. textbuf_now is decremented (wrapping to 2)
 * after every revision, so older revisions sit at cyclically greater
 * indices. */
static int textbuf_now;
#define textbuf_prev     ((textbuf_now + 1) % 3)
#define textbuf_prevprev ((textbuf_now + 2) % 3)
/* Buffers of the parsed plaintext and markup information. A ring of two,
 * advanced only for non-abuse revisions, so parsed_prev refers to the last
 * good revision rather than simply the last revision. */
static upstr parsedtextbuf[2];
static mw_markupinfo parsedmarkupbuf[2];
static int parsed_now;
#define parsed_prev ((parsed_now + 1) % 2)
/* Revision counter, so that we can tell when prevprevtext is valid/can be set.
 * Restarts at 1 for every page.
 * To avoid overflow, this stops incrementing at 255, as we only need up to 3.*/
static uint8_t revisioncounter;
/* Number of non-abuse revisions encountered for this page; also stops early
 * (saturates at 255). */
static uint8_t goodrevisions;
/* Non-MW-format XML output. */
xmlTextWriterPtr xmlwriter;
/* True if have output at least one page start, i.e. a <page> element is
 * currently open and must be closed before the next one or at shutdown. */
bool in_page;

/* Fraction of an article's plaintext which has to change in order to be considered a major edit.
 * Change must be GREATER than this value. Defaults to 0.05 and may be
 * overridden by the filter's single optional argument. */
static double major_thresh;

/* Initialise the filter: set up the text/markup buffers, read the optional
 * major-edit threshold, and open the XML output document on stdout.
 *
 * argc/argv: at most one argument, argv[0], giving the fraction (0--1) of a
 *            revision's plaintext that must change for the edit to count as
 *            major. With no argument the default of 0.05 is used.
 * Returns true on success. Returns false (after printing a diagnostic to
 * stderr where applicable) on too many arguments, an unparsable or
 * out-of-range threshold, allocation failure, or if the XML output could not
 * be started. */
bool initFilter(int argc, char** argv) {
	if(argc > 1) { return false; }

	for(int i = 0; i < 3; i++) {
		upstr_init(&textbuf[i]);
		if(i < 2) {
			upstr_init(&parsedtextbuf[i]);
			if(!mw_markupinfo_init(&parsedmarkupbuf[i])) {
				fprintf(stderr, "Not even enough memory to initialise markup buffer %d!\n", i + 1);
				/* CBA to _destroy the markupinfos, as we're about to bail anyway */
				return false;
			}
		}
	}
	textbuf_now = 0; /* Anything, as long as defined */
	parsed_now = 0;

	/* Average English sentence length is considered to be about ~20 words.
	 * Let us say that changing only one word in a sentence is minor; this
	 * then yields a sensible default of 1/20, or 5%. */
	major_thresh = 0.05;
	if(argc) {
		char* end;
		major_thresh = strtod(argv[0], &end);
		/* Reject input where nothing was converted or where trailing
		 * characters remain (see strtod manpage: !"entire string valid") */
		if(end == argv[0] || *end != '\0') {
			fprintf(stderr, "'%s' isn't a number; should be fraction of node plaintext changed to consider major.\n", argv[0]);
			return false;
		}
		/* Allow never or always if they really want. The test is written
		 * as a negated in-range check so that NaN, for which every
		 * comparison is false, is rejected too. */
		if(!(major_thresh >= 0 && major_thresh <= 1)) {
			fprintf(stderr, "%lf doesn't make sense as a proportion of change (expect 0--1).\n", major_thresh);
			return false;
		}
	}

	/* Start up XML output */
	xmlwriter = xmlNewTextWriter(xmlOutputBufferCreateFile(stdout,
		xmlGetCharEncodingHandler(XML_CHAR_ENCODING_UTF8)));
	if(!xmlwriter) {
		fprintf(stderr, "Couldn't create the XML writer for standard output.\n");
		return false;
	}
	/* The writer calls return a negative value on failure; that also covers
	 * the case where the underlying output buffer could not be created. */
	if(xmlTextWriterStartDocument(xmlwriter, NULL, NULL, NULL) < 0 ||
	   xmlTextWriterStartElement(xmlwriter, XMLLIT "mwcats") < 0) {
		fprintf(stderr, "Couldn't start the XML output document.\n");
		/* As above, not bothering to tear down; we're bailing anyway */
		return false;
	}
	in_page = false;

	return true;
}

/* Returns false: this filter does not generate MediaWiki-format pages and
 * revisions; it writes its own <mwcats> XML through xmlwriter instead.
 * NOTE(review): presumably queried by the surrounding filter framework to
 * decide whether to set up MW-format output -- confirm against the caller. */
bool generatesXMLOutput(void) { return false; }

/* Close the currently open <page> element, if there is one. Safe to call
 * when no page has been started yet. */
static void endPage(void) {
	if(!in_page) { return; } /* Nothing open, nothing to close */

	xmlTextWriterEndElement(xmlwriter); /* </page> */
	in_page = false;
}

/* Begin a new page: report its title on stderr as progress, close any page
 * element still open, restart the per-page revision counters and open a
 * <page title="..."> element. Always returns true. */
bool processPage(const mw_page* page) {
	/* Progress indication */
	fprintf(stderr, "\t%s\n", page->title.chars);

	/* Finish off whatever page came before */
	endPage();

	/* Per-page counters start afresh; the first revision is number 1 */
	goodrevisions = 0;
	revisioncounter = 1;

	/* Open the element for this page */
	xmlTextWriterStartElement(xmlwriter, XMLLIT "page");
	xmlTextWriterWriteAttribute(xmlwriter, XMLLIT "title", page->title.chars);
	in_page = true;

	return true;
}

#define PARSPREVOK  (goodrevisions   >= 2)
#define PREVPREVOK  (revisioncounter >= 3)
/* The raw-text buffer which receives the revision being processed. */
#define TEXTNOW     (textbuf[textbuf_now])
/* Categorise one revision and write it out as a <change> element.
 *
 * The revision text is copied into the current text buffer with leading and
 * trailing whitespace stripped, and is then classified:
 *  - abuse: the stripped text is empty (blanking), or it is identical to the
 *    text from two revisions ago (revert; this takes precedence when both
 *    apply);
 *  - otherwise the markup is parsed and compared with the previous non-abuse
 *    revision of the page to fill in the change flags and magnitude.
 * Afterwards the text buffers are cycled and the revision counter advanced.
 *
 * page:     unused here.
 * revision: supplies the text, the minor-edit flag and the contributor. */
void processRevision(const mw_page* page, const mw_revision* revision) {
	size_t ch, len;
	/* Parsed plaintext length of this revision. Captured before the parsed
	 * buffers are cycled, since afterwards parsed_now names the other one. */
	size_t newsize = 0;
	changes changed;
	changed.abuse = false;

	// Noisy, but useful for seeing just how slow string distancing is being
	//fprintf(stderr, ".");

	/* Copy in and strip the current text (in one go, for efficiency) */
	/*	Find first non-space character. The cast is needed because passing
	 *	a negative char value to isspace() is undefined. */
	for(ch = 0; (ch < revision->text.len) &&
	            isspace((unsigned char) revision->text.chars[ch]); ch++) {}
	/*	Copy from there, including the NULL */
	len = revision->text.len + 1 - ch;
	if(upstr_expand(&TEXTNOW, len)) {
		memcpy(TEXTNOW.chars, revision->text.chars + ch, len);
		/* len counted the terminator; the stored length must not */
		TEXTNOW.len = len - 1;
	} else { /* Copy as much as we can, then */
		/* NOTE(review): assumes alloc is at least 1 here -- confirm
		 * against upstr_init/upstr_expand. */
		memcpy(TEXTNOW.chars, revision->text.chars + ch,
			TEXTNOW.alloc - 1);
		TEXTNOW.len = TEXTNOW.alloc - 1;
		TEXTNOW.chars[TEXTNOW.len] = '\0';
	}
	/*	Reduce the string length while still encountering WSP */
	while((TEXTNOW.len > 0) &&
	      isspace((unsigned char) TEXTNOW.chars[TEXTNOW.len - 1])) {
		TEXTNOW.len--;
	}
	/*	Rewrite the NULL */
	TEXTNOW.chars[TEXTNOW.len] = '\0';

	/* Blanking? */
	if(TEXTNOW.len == 0) {
		changed.abuse = true;
		changed.cat.abu.type = ABUSE_BLANKING;
	}

	/* Revert?
	 * It's not computationally feasible to see if this is identical to any
	 * of the whole set of past versions, so only check the one prior to
	 * previous. To supplement this, look for 'revert' in the edit comment.
	 * (Unfortunately, reverts are not semantic in MediaWiki. =/ ) */
	if(PREVPREVOK && upstr_eq(&TEXTNOW, &textbuf[textbuf_prevprev])) {
		changed.abuse = true;
		changed.cat.abu.type = ABUSE_REVERT;
	}

	/* Ideally, should attempt to identify spam (new external links?) and
	 * profanity (not as simple as word list---WP has articles on
	 * swearwords); but if possible to do so reliably automatically, would
	 * not actually be a problem on the wiki... */

	/* If not vandalism to ignore... */
	if(!changed.abuse) {
		/* Parse the markup (roughly) and count as good */
		mw_stripMarkupRoughly(&TEXTNOW,
			&parsedtextbuf[parsed_now], &parsedmarkupbuf[parsed_now]);
		if(goodrevisions < 255) { goodrevisions++; }
		newsize = parsedtextbuf[parsed_now].len;

		if(PARSPREVOK) {
			/* Major content change? */
			changed.cat.norm.diffmag =
				//upstr_levenshtein(
				upstr_distance(
					&parsedtextbuf[parsed_now],
					&parsedtextbuf[parsed_prev]);
					/* &textbuf[textbuf_now],
					&textbuf[textbuf_prev]); */ // To generate lies for debugging
			/* Look at the change distance over the new length.
			 * Must ensure that the divisor is nonzero, as parsing may
			 * have discovered no plaintext characters. Edits flagged
			 * as minor by their author are never major. */
			changed.cat.norm.majortext = revision->minor ? false :
				((((double) changed.cat.norm.diffmag) /
				  (newsize == 0 ? 1 : newsize))
				> major_thresh);
			/* Markup things changed? */
#define DETECTCHANGE(BAG) \
	changed.cat.norm.BAG = !mw_bagofstrings_eq( \
		parsedmarkupbuf[parsed_now ].BAG, \
		parsedmarkupbuf[parsed_prev].BAG)

			DETECTCHANGE(templates);
			DETECTCHANGE(categories);
			DETECTCHANGE(pagelinks);
			DETECTCHANGE(urllinks);
#undef DETECTCHANGE
		} else {
			/* This is the edit which created the first (good) revision */
			changed.cat.norm.diffmag = newsize;
			changed.cat.norm.majortext = true; /* ...by definition */
			/* Anything present at all counts as a change. Compared
			 * against zero explicitly so the result does not depend
			 * on how a count converts to bool. */
#define DETECTANY(BAG) \
	changed.cat.norm.BAG = \
		(MD_Bag_Length(parsedmarkupbuf[parsed_now].BAG) != 0)

			DETECTANY(templates);
			DETECTANY(categories);
			DETECTANY(pagelinks);
			DETECTANY(urllinks);
#undef DETECTANY
		}

		/* Cycle buffers. From here on parsed_now no longer refers to
		 * this revision's parse. */
		if(--parsed_now < 0) { parsed_now = 1; }
	}

	/* Store categorisation for later use, e.g. per-individual-user stats
	 * where running out of memory won't break everything else. Also then
	 * count unique users then to do edit<class>/[reg]user avg.. /Could/
	 * safely accumulate categories here, but may as well do that with the
	 * rest of the postprocessing, and avoid having to worry about overflows
	 * or such. */
	/* The output uses <change abuse="bool"> rather than <abuse|change>, as
	 * this is arguably easier to parse, and allows for the changes
	 * structure to have common fields. */
	/* WRITEBOOL uses 1/0 over true/false for simple conciseness. */
#define WRITEBOOL(NAME, VALUE) \
	xmlTextWriterWriteAttribute(xmlwriter, (NAME), \
		(VALUE) ? XMLLIT "1" : XMLLIT "0")

	xmlTextWriterStartElement(xmlwriter, XMLLIT "change");
	WRITEBOOL(XMLLIT "abuse", changed.abuse);
	if(changed.abuse) {
		xmlTextWriterWriteAttribute(xmlwriter, XMLLIT "abusetype",
			XMLLIT abuse_names[changed.cat.abu.type]);
	} else {
		WRITEBOOL(XMLLIT "majortext",  changed.cat.norm.majortext);
		WRITEBOOL(XMLLIT "templates",  changed.cat.norm.templates);
		WRITEBOOL(XMLLIT "categories", changed.cat.norm.categories);
		WRITEBOOL(XMLLIT "pagelinks",  changed.cat.norm.pagelinks);
		WRITEBOOL(XMLLIT "urllinks",   changed.cat.norm.urllinks);
		/* Both values are size_t, hence %zu */
		xmlTextWriterStartAttribute(xmlwriter, XMLLIT "diffmag");
		xmlTextWriterWriteFormatString(xmlwriter, "%zu",
			changed.cat.norm.diffmag);
		xmlTextWriterEndAttribute(xmlwriter); /* diffmag="" */
		xmlTextWriterStartAttribute(xmlwriter, XMLLIT "newsize");
		xmlTextWriterWriteFormatString(xmlwriter, "%zu", newsize);
		xmlTextWriterEndAttribute(xmlwriter); /* newsize="" */
	}
	/* Record who made this change */
	WRITEBOOL(XMLLIT "byreg", revision->contributor.registered);
	xmlTextWriterWriteAttribute(xmlwriter, XMLLIT "byid",
		revision->contributor.id.chars);
#ifdef DEBUG_REVNO
	xmlTextWriterStartAttribute(xmlwriter, XMLLIT "revno");
	xmlTextWriterWriteFormatString(xmlwriter, "%u",
		(unsigned int) revisioncounter);
	xmlTextWriterEndAttribute(xmlwriter); /* revno="" */
#endif
	xmlTextWriterEndElement(xmlwriter); /* </change> */
#undef WRITEBOOL

	/* Cycle the buffers. This will build up old versions at (cyclically)
	 * greater indices. Also count toward buffer validity. */
	if(--textbuf_now < 0) { textbuf_now = 2; }
	if(revisioncounter < 255) { revisioncounter++; }
}
#undef TEXTNOW
#undef PREVPREVOK
#undef PARSPREVOK

/* Shut the filter down: close any page still open, close the root element,
 * push the XML out and release the writer, then terminate the output with a
 * newline. */
void killFilter(void) {
	/* Unwind the open elements, innermost first */
	endPage();                          /* </page>, if any */
	xmlTextWriterEndElement(xmlwriter); /* </mwcats> */

	/* Get everything out of the writer before disposing of it */
	xmlTextWriterFlush(xmlwriter);
	xmlFreeTextWriter(xmlwriter);

	/* Provide a final newline */
	fflush(stdout);
	fputs("\n", stdout);
}

