/* These tools are Free Software, licensed under the MIT license. *
 *  Copyright 2007, Philip Boulain. See LICENSE.TXT for details.  */
#include <stdio.h>
#include <string.h>
#include <stdarg.h>
#include <stdint.h>
#include <libxml/xmlwriter.h>
#include "mediawikiformat.h" /* Provides libxml headers */

////  PARSER  //////////////////////////////////////////////////////////////////

// Despite xmlSAX2InitDefaultSAXHandler(), this WILL cause comedy segfaults
// if not static (=> zero-initialised). (In fact, that init appears to try
// to build trees or somesuch, and causes much brokenness.)
// main() fills in the callbacks this file implements; the remaining members
// keep their zero initialisation.
static xmlSAXHandler shandler;

// States of the parser's state machine. Apart from S_START and S_UNKNOWN,
// each state names the dump element the parser is currently inside.
typedef enum {
	S_START,       // Outside the root element (before it opens/after it closes)
	S_MEDIAWIKI,   // <mediawiki>
	S_PAGE,        // <page>
	S_TITLE,       // <title>
	S_REVISION,    // <revision>
	S_CONTRIBUTOR, // <contributor>
	S_CONTRIB_IP,  // <ip> within a contributor
	S_CONTRIB_ID,  // <id> within a contributor
	S_MINOR,       // <minor>
	S_TEXT,        // <text>
	S_UNKNOWN      // Within an element (or its descendants) being skipped
} pstate;
// Everything the SAX callbacks need to remember between invocations. A
// pointer to the single instance below is handed to libxml as the user data
// (see main()), and comes back as the first argument of each callback.
typedef struct {
	pstate state; // Current position in the element hierarchy
	pstate prev_state; // State to resume once a skipped element has closed
	uint32_t unknown_depth; // Nesting depth within skipped elements

	bool parse_revs; // True if revisions should be parsed; else ignored

	mw_page page; // .title
	mw_revision revision; // .contributor.{registered,id}, .minor, .text
} pdata;
pdata parser_data;

// Prepare the parser data for a fresh <page> element.
//   par: parser data to reset the per-page parts of.
static void newPage(pdata* par) {
	// Revisions are ignored until processPage() has been called for this
	// page and has said whether they are wanted.
	par->parse_revs = false;
	// Empty the title accumulator (only its length is reset).
	par->page.title.len = 0;
}

// Prepare the parser data for a fresh <revision> element.
//   par: parser data to reset the per-revision parts of.
static void newRevision(pdata* par) {
	mw_revision* rev = &par->revision;

	// Empty both accumulators (only their lengths are reset).
	rev->text.len = 0;
	rev->contributor.id.len = 0;
	// Defaults, until the relevant subelements say otherwise.
	rev->minor = false;
	rev->contributor.registered = false;
}

static void p_startDocument(void* user_data) {
	pdata* par = (pdata*) user_data;
	par->state = S_START;

	upstr_init(&par->page.title);
	upstr_init(&par->revision.contributor.id);
	upstr_init(&par->revision.text);
}

static void p_endDocument(void* user_data) {
	pdata* par = (pdata*) user_data;
	// Free character buffers
	fprintf(stderr, "Character buffer final sizes (bytes):\n"
		"\ttitle = %zd\n\thost  = %zd\n\ttext  = %zd\n",
		par->page.title.alloc,
		par->revision.contributor.id.alloc,
		par->revision.text.alloc);
	upstr_destroy(&par->page.title);
	upstr_destroy(&par->revision.contributor.id);
	upstr_destroy(&par->revision.text);
}

// SAX characters callback: append character data to whichever accumulator
// belongs to the current state, or discard it if the state has none.
//   user_data: the pdata passed to the parser.
//   ch, len:   the characters delivered, and how many of them there are.
static void p_characters(void* user_data, const xmlChar* ch, int len) {
	pdata* par = (pdata*) user_data;

	// Every state is listed explicitly, so that adding a new one without
	// deciding what to do with its text can be caught by the compiler.
	switch(par->state) {
		// Text here is of no interest; ignore it.
		case S_START:
		case S_MEDIAWIKI:
		case S_PAGE:
		case S_REVISION:
		case S_CONTRIBUTOR:
		case S_MINOR:
		case S_UNKNOWN:
			return;
		case S_TITLE:
			upstr_append(&par->page.title, ch, len);
			return;
		// IPs and user IDs share a single buffer; the 'registered'
		// flag set on element start says which of the two it holds.
		case S_CONTRIB_IP:
		case S_CONTRIB_ID:
			upstr_append(&par->revision.contributor.id, ch, len);
			return;
		case S_TEXT:
			upstr_append(&par->revision.text, ch, len);
			return;
	}
}

// SAX startElement callback: advance the state machine into the element if
// it is one expected at this point; otherwise begin (or deepen) skipping.
//   user_data: the pdata passed to the parser.
//   name:      name of the element being opened.
//   attrs:     the element's attributes; not examined.
static void p_startElement(void* user_data, const xmlChar* name, const xmlChar** attrs) {
	pdata* par = (pdata*) user_data;
	// Set only when a state transition is made for this element. While it
	// is false, par->state still holds the state the element arrived in.
	bool recognised = false;

	// _Strictly_, should use g_utf8_collate(), not strcmp(), but shouldn't
	// matter here as the literals compared against are all 7-bit.
#define ELEM_IS(x) (strcmp((const char *) name, (x)) == 0)

	// State machine.
	switch(par->state) {
		case S_START:
			if(ELEM_IS("mediawiki")) {
				par->state = S_MEDIAWIKI;
				recognised = true;
			} else {
				fprintf(stderr, "Erm, the root element is '%s', not 'mediawiki'...\n", name);
			}
			break;
		case S_MEDIAWIKI:
			if(ELEM_IS("page")) {
				par->state = S_PAGE;
				newPage(par); // Entering new page; prepare for it
				recognised = true;
			}
			break;
		case S_PAGE:
			// Easy, common tests first.
			if(ELEM_IS("revision")) {
				// Revisions count as unknown elements (and so
				// are skipped whole) unless parse_revs is set.
				if(par->parse_revs) {
					par->state = S_REVISION;
					// Entering new revision; prepare for it
					newRevision(par);
					recognised = true;
				}
			} else if(ELEM_IS("title")) {
				par->state = S_TITLE;
				recognised = true;
			}
			break;
		case S_REVISION:
			if(ELEM_IS("text")) {
				par->state = S_TEXT;
				recognised = true;
			} else if(ELEM_IS("contributor")) {
				par->state = S_CONTRIBUTOR;
				recognised = true;
			} else if(ELEM_IS("minor")) {
				par->state = S_MINOR;
				// Mere presence of element indicates such
				par->revision.minor = true;
				recognised = true;
			}
			break;
		case S_CONTRIBUTOR:
			if(ELEM_IS("ip")) {
				par->state = S_CONTRIB_IP;
				par->revision.contributor.registered = false;
				recognised = true;
			} else if(ELEM_IS("id")) {
				par->state = S_CONTRIB_ID;
				par->revision.contributor.registered = true;
				recognised = true;
			}
			break;
		case S_TITLE:
		case S_CONTRIB_IP:
		case S_CONTRIB_ID:
		case S_MINOR:
		case S_TEXT:
		case S_UNKNOWN:
			break; // Not expecting subelements; never recognised
	}

#undef ELEM_IS

	if(recognised) { return; }

	// We just got thrown an element we don't understand. The behaviour is
	// to ignore it, and all subelements, until we "resurface" back into
	// known territory.
	if(par->state != S_UNKNOWN) {
		// First level of unknown: remember where to come back to.
		par->prev_state = par->state;
		par->state = S_UNKNOWN;
		par->unknown_depth = 1;
	} else {
		// Already skipping; this is merely one level deeper.
		par->unknown_depth++;
	}
}

// SAX endElement callback: step the state machine back out to the parent
// state, handing completed pages and revisions to the filter on the way.
//   user_data: the pdata passed to the parser.
//   name:      name of the element being closed; used only for diagnostics.
static void p_endElement(void* user_data, const xmlChar* name) {
	pdata* par = (pdata*) user_data;

	switch(par->state) {
		case S_UNKNOWN:
			// Less unknown; if now zero unknown, return to known
			par->unknown_depth--;
			if(par->unknown_depth == 0) {
				par->state = par->prev_state;
			}
			break;

		// Children of <revision>. Their work was done either on
		// element start, by subelements, or by direct accumulation.
		case S_TEXT:
		case S_MINOR:
		case S_CONTRIBUTOR:
			par->state = S_REVISION;
			break;

		// Children of <contributor>; both accumulate directly.
		case S_CONTRIB_IP:
		case S_CONTRIB_ID:
			par->state = S_CONTRIBUTOR;
			break;

		case S_REVISION:
			par->state = S_PAGE;
			// Now that revision is complete, throw it at handler.
			processRevision(&par->page, &par->revision);
			break;

		case S_TITLE:
			par->state = S_PAGE;
			// This is the only element needed to get all the page
			// data, so trigger the page handler. IMPORTANT! If
			// mw_page grows, increased cleverness will be required
			// to ensure that ALL page data is accumulated before
			// this happens.
			// End of page element is TOO LATE to use processPage's
			// return value!
			par->parse_revs = processPage(&par->page);
			break;

		case S_PAGE:
			par->state = S_MEDIAWIKI;
			break;

		case S_MEDIAWIKI:
			par->state = S_START; // No real need for FINISH
			break;

		case S_START:
			fprintf(stderr, "Completely unexpected end of '%s' while within start state.\n", name);
			break;
	}
}

// SAX getEntity callback: resolve entity names via libxml's table of
// predefined entities only.
//   user_data: the pdata passed to the parser; not used.
//   name:      name of the entity to look up.
// Returns whatever xmlGetPredefinedEntity() gives for that name.
static xmlEntityPtr p_getEntity(void* user_data, const xmlChar* name) {
	(void) user_data;
	xmlEntityPtr predefined = xmlGetPredefinedEntity(name);
	return predefined;
}

// Shared implementation of the libxml diagnostic callbacks: writes the
// message to stderr, prefixed with its severity.
//   severity: label to show in the prefix.
//   msg:      printf-style format string.
//   args:     arguments for msg; the caller starts and ends the va_list.
static void p_problem(const char* severity, const char* msg, va_list args) {
	FILE* const sink = stderr;

	fprintf(sink, "LibXML %s: ", severity);
	vfprintf(sink, msg, args);
}

// SAX warning callback: forwards the printf-style message to p_problem().
//   user_data: the pdata passed to the parser; not used.
static void p_warning(void* user_data, const char* msg, ...) {
	va_list args;

	(void) user_data;
	va_start(args, msg);
	p_problem("warning", msg, args);
	va_end(args);
}
// SAX error callback: forwards the printf-style message to p_problem().
//   user_data: the pdata passed to the parser; not used.
static void p_error(void* user_data, const char* msg, ...) {
	va_list args;

	(void) user_data;
	va_start(args, msg);
	p_problem("error  ", msg, args);
	va_end(args);
}
// SAX fatalError callback: forwards the printf-style message to p_problem().
// Reporting is all that happens here; nothing is aborted by this function.
//   user_data: the pdata passed to the parser; not used.
static void p_fatalError(void* user_data, const char* msg, ...) {
	va_list args;

	(void) user_data;
	va_start(args, msg);
	p_problem("-FATAL-", msg, args);
	va_end(args);
}

////  WRITER  //////////////////////////////////////////////////////////////////

// Cast from C string literal to xmlChar. Expands to a bare cast prefix, so it
// is written directly in front of the literal, e.g. XMLLIT "page". It is
// #undef'd again at the end of the writer section.
#define XMLLIT (const xmlChar*)

// State of the XML output writer; the single instance is writer_data.
typedef struct {
	bool written; // Set once writerUsed() has performed its one-time setup
	const char* outfile; // Where to write to, as given to writerInit()
	xmlTextWriterPtr xmlwriter; // The libxml writer; created by writerUsed()

	bool in_page; // True while a <page> element is open in the output
} wdata;
wdata writer_data;

static void writerUsed(void) {
	if(!writer_data.written) {
		writer_data.written = true;
		// Initialise for real
		writer_data.xmlwriter =
			xmlNewTextWriter(xmlOutputBufferCreateFilename(
				writer_data.outfile,
				xmlGetCharEncodingHandler(XML_CHAR_ENCODING_UTF8),
				0));
		writer_data.in_page = false;
		// Write the preamble
		// This is hardcoded and lossy, but version 0.3 /is/ what we
		// output. The siteinfo is completely culled.
		xmlTextWriterStartDocument(writer_data.xmlwriter, 0, 0, 0);
		xmlTextWriterStartElement(writer_data.xmlwriter,
			XMLLIT "mediawiki");
		xmlTextWriterWriteAttribute(writer_data.xmlwriter,
			XMLLIT "xmlns",
			XMLLIT "http://www.mediawiki.org/xml/export-0.3/");
		xmlTextWriterWriteAttribute(writer_data.xmlwriter,
			XMLLIT "version",
			XMLLIT "0.3");
	}
}

// Record where output should go and, if the filter generates XML output,
// set the writer up immediately.
//   outfile: output filename; the pointer is stored, not copied.
static void writerInit(const char* outfile) {
	writer_data.outfile = outfile; // Store where to write to if we have to
	writer_data.written = false; // Know that we're not truly initialised (lazy)

	if(!generatesXMLOutput()) { return; }

	// Lazy init was a nice idea, but writePage() may never be called if
	// there are no pages to output---this should still overwrite the
	// output file with preamble-only XML file.
	// WARNING: writePage/Revision() no longer do this automatically, for
	// performance reasons. If generatesXMLOutput() isn't tested here, in
	// advance, they may try to run in an improperly initialised state,
	// and everything will go horribly wrong.
	writerUsed();
}

static void writePageClose(void) {
	xmlTextWriterEndElement(writer_data.xmlwriter); // </page>
}

void writePage(const mw_page* page) { // Strictly, only its opening
//	writerUsed();
	if(writer_data.in_page) { writePageClose(); }
	xmlTextWriterStartElement(writer_data.xmlwriter, XMLLIT "page");
	writer_data.in_page = true;
	xmlTextWriterWriteElement(writer_data.xmlwriter, XMLLIT "title",
		page->title.chars);
	// id lost
}

void writeRevision(const mw_revision* revision) { // Whole thing
//	writerUsed();
	if(!writer_data.in_page) {
		fprintf(stderr, "writeRevision() before writePage()! Output likely invalid.\n");
	}
	xmlTextWriterStartElement(writer_data.xmlwriter, XMLLIT "revision");
	xmlTextWriterStartElement(writer_data.xmlwriter, XMLLIT "contributor");
	xmlTextWriterWriteElement(writer_data.xmlwriter,
		revision->contributor.registered ? XMLLIT "id" : XMLLIT "ip",
		revision->contributor.id.chars);
	xmlTextWriterEndElement(writer_data.xmlwriter); // </contributor>
	if(revision->minor) {
		xmlTextWriterStartElement(writer_data.xmlwriter,
			XMLLIT "minor");
		xmlTextWriterEndElement(writer_data.xmlwriter); // </minor>
	}
	xmlTextWriterStartElement(writer_data.xmlwriter, XMLLIT "text");
	xmlTextWriterWriteAttribute(writer_data.xmlwriter, XMLLIT "xml:space",
		XMLLIT "preserve");
	xmlTextWriterWriteString(writer_data.xmlwriter, revision->text.chars);
	xmlTextWriterEndElement(writer_data.xmlwriter); // </text>
	xmlTextWriterEndElement(writer_data.xmlwriter); // </revision>
	// id, timestamp, contributor.username, and comment lost
}

static void writerShutdown(void) {
	if(!writer_data.written) { return; } // Never got used
	// Write the postamble
	if(writer_data.in_page) { writePageClose(); }
	xmlTextWriterEndElement(writer_data.xmlwriter); // </mediawiki>
	// As far as I can tell, this is the only way to 'close' one, which
	// should then free the output buffer as well.
	xmlTextWriterFlush(writer_data.xmlwriter); // Paranoia
	xmlFreeTextWriter(writer_data.xmlwriter);
}

#undef XMLLIT

////  MAIN  ////////////////////////////////////////////////////////////////////

int main(int argc, char** argv) {
	int error;
	int eaten = 2; // Delicious arguments (binary; input.xml)
	char* filename = "-";
	char* outfile = "-";

	shandler.startDocument = p_startDocument;
	shandler.endDocument   = p_endDocument;
	shandler.characters    = p_characters;
	shandler.startElement  = p_startElement;
	shandler.endElement    = p_endElement;
	shandler.getEntity     = p_getEntity;
	shandler.warning       = p_warning;
	shandler.error         = p_error;
	shandler.fatalError    = p_fatalError;

	if(argc >= 2) { filename = argv[1]; }
	if(argc >= 3) { outfile  = argv[2]; }
	if(argc >= 2 && !strcmp(argv[1], "--help")) {
		fprintf(stderr,	"Usage: %s [input.xml %s[<filter specific>]%s]\n"
				"See filter documentation for details.\n",
			argv[0],
			generatesXMLOutput() ? "[output.xml " : "",
			generatesXMLOutput() ? "]" : "");
		return 1;
	}

	if(generatesXMLOutput()) { eaten++; } // output.xml
	argc -= eaten; if(argc < 0) { argc = 0; }
	if(!initFilter(argc, argv + eaten)) {
		fprintf(stderr, "Invalid arguments; try --help, or the filter documentation.\n");
		return 1;
	}

	writerInit(outfile);

	if((error = xmlSAXUserParseFile(&shandler, &parser_data, filename))) {
		fprintf(stderr, "libxml2 returned error %d\n", error);

		return error;
	}

	killFilter();
	writerShutdown();

	return 0;
}

