/* These tools are Free Software, licensed under the MIT license. *
 *  Copyright 2007, Philip Boulain. See LICENSE.TXT for details.  */

#include <ctype.h>
#include <string.h>
#include "mediawikimarkup.h"

/* Cast from C string literal to xmlChar: written as a prefix in front of the
 * literal, e.g. XMLLIT "text", which yields a const xmlChar pointer. */
#define XMLLIT (const xmlChar*)

/* Create the four bags (templates, categories, pagelinks, urllinks) of a
 * markup-info record.
 *
 * Returns true once all four bags exist. If any MD_Bag_New() call fails, the
 * bags created before it are destroyed again and false is returned; the
 * fields of 'info' are not reset in that case. */
bool mw_markupinfo_init(mw_markupinfo* info) {
	info->templates = MD_Bag_New();
	if(!info->templates) {
		return false;
	}

	info->categories = MD_Bag_New();
	if(!info->categories) {
		MD_Bag_Destroy(info->templates);
		return false;
	}

	info->pagelinks = MD_Bag_New();
	if(!info->pagelinks) {
		MD_Bag_Destroy(info->templates);
		MD_Bag_Destroy(info->categories);
		return false;
	}

	info->urllinks = MD_Bag_New();
	if(!info->urllinks) {
		MD_Bag_Destroy(info->templates);
		MD_Bag_Destroy(info->categories);
		MD_Bag_Destroy(info->pagelinks);
		return false;
	}

	return true;
}

/* Release every string held in 'bag', then empty the bag.
 * Each element is handled as a separately allocated upstr: it is torn down
 * with upstr_destroy() and its own storage is returned with free(). The bag
 * object itself stays alive. */
static void mw_markupinfo_clean_single(MD_Bag* bag) {
	unsigned int idx;
	upstr* entry;

	MD_BAG_FOREACH(bag, idx, entry)
		upstr_destroy(entry); /* release the string's contents... */
		free(entry);          /* ...then the upstr record itself */
	MD_BAG_FOREACH_DONE()

	MD_Bag_Clear(bag);
}

/* Empty all four bags of 'info', freeing the strings they hold. The bags
 * themselves are kept, so 'info' can be filled again afterwards. */
void mw_markupinfo_clean(mw_markupinfo* info) {
	MD_Bag* const bags[] = {
		info->templates,
		info->categories,
		info->pagelinks,
		info->urllinks
	};

	for(size_t b = 0; b < sizeof bags / sizeof bags[0]; b++) {
		mw_markupinfo_clean_single(bags[b]);
	}
}

/* Tear down a markup-info record: free the recorded strings first, then
 * destroy the four bags. The 'info' structure itself is not freed here. */
void mw_markupinfo_destroy(mw_markupinfo* info) {
	/* The strings must go first, or destroying the bags would leak them. */
	mw_markupinfo_clean(info);

	MD_Bag* const bags[] = {
		info->templates,
		info->categories,
		info->pagelinks,
		info->urllinks
	};

	for(size_t b = 0; b < sizeof bags / sizeof bags[0]; b++) {
		MD_Bag_Destroy(bags[b]);
	}
}

/* Copy the inclusive range [start, end] of 'string' into a newly allocated,
 * NUL-terminated upstr and add that to 'bag'.
 * A range with end < start is treated as invalid and ignored; allocation
 * failures are ignored as well (nothing is added). */
static void record_to_bag(MD_Bag* bag,
	const xmlChar* string, size_t start, size_t end) {

	if(end < start) { return; } /* invalid range: nothing to record */

	upstr* copy = malloc(sizeof *copy);
	if(!copy) { return; }

	const size_t count = (end - start) + 1;
	upstr_init(copy);
	if(!upstr_expand(copy, count + 1)) { /* +1 leaves room for the NUL */
		free(copy); /* could not get the buffer; give up quietly */
		return;
	}

	memcpy(copy->chars, string + start, count);
	copy->chars[count] = '\0';
	copy->len = count;
	MD_Bag_Add(bag, copy);
}

/* Things to do to approximately parse:
 * - Remove sequences of two, three, or five single quotes (italic/bold)
 * - Convert [[target]] and [[target|display text]] to just 'display text' (or
 *   'target') and record an outgoing wiki link to 'target'. May contain
 *   multiple pipes, in which case use first and last parts. This is greatly
 *   simplified from the truly grotesque conditions here:
 *   http://en.wikipedia.org/wiki/Help:Link
 *   http://en.wikipedia.org/wiki/Help:Piped_link
 *   (Don't even try to implement the likes of [[wp:de:../foo (bar)|]].)
 *   - UNLESS the first characters of 'target' are 'Category:', in which case
 *     replace the next with blank (ignore any piped args.) and record a
 *     categorisation.
 * - Remove '#REDIRECT' (followed by a regular link)
 * - Convert [url] and [url display text] to just 'display text' (or '1') and
 *   record an outgoing URL link to 'url' (plaintext unchanged).
 * - Identify 'http://' until the next whitespace as an URL link.
 * - Remove sequences of between two and six equals signs (headings)
 * - Remove '*' and '#' characters (unordered and numbered lists)
 *   if at start of line (unimplemented!)
 * - Remove ';' and ':' characters (definition lists; indenting)
 *   - Does neither of these: greater use as English punctuation
 * - Remove sequences of four or more hyphen-minuses (horizontal rules)
 * - Inside a table, bound by '{|' and '|}', ignore all of '!|-+'.
 *   Also ignore everything between the opening '|' and the first '!'.
 *   (Actual parsing rules are incredibly hairy:)
 *   http://en.wikipedia.org/wiki/Help:Table
 * - Convert {{target}} to blank, and record template usage. This treats (piped)
 *   template arguments as part of its name, such that argument changes show up
 *   as template changes. (Some 'templates' might actually be variables.)
 *   - UNLESS the first character of the target is '#', in which case it's
 *     probably a processor directive, so just reduce it to nothing.
 * - Remove anything of the form '__X__', where X is some sequence of uppercase
 *   letters ("magic words").
 * - Strip leading space on lines. (Pre-esque formatting, but with markup.)
 * - Remove anything which looks like an HTML element =(   List at:
 *   http://en.wikipedia.org/wiki/Help:HTML_in_wikitext
 *   This ignores <nowiki> and <pre>'s markup-disabling effects;
 *   treats <math> contents as content (arguably correct),
 *   and some other elements' markup instructions as content (e.g. <gallery>,
 *   <timeline>).
 * - Remove HTML comments (<!-- ... -->) (will have been escaped past libxml).
 *
 * Not handled:
 * - Signatures. These are computed as the page is saved, so don't appear.
 * - A massive number of little special cases.
 * - Correctly recursive application. Markup inside tables is passed through.
 * 
 * Other limitations:
 * - Odd use of balanced markup may be understood here, but not with MediaWiki's
 *   own "parser"; e.g. [[i-am-a-node-link]i-am-an-url-link]
 */
/* States of the character-by-character scanner in mw_stripMarkupRoughly(). */
typedef enum {
	/* At the start of a line; ordinary text; one single quote seen;
	 * two or more single quotes seen. */
	MWS_STARTLINE, MWS_NORMAL, MWS_QUOTING, MWS_OVERQUOTING,
	/* After '[' (also entered after the first ']' that closes a '[[...'
	 * link); inside '[[target'; after a '|' inside '[[...'; inside the URL
	 * of '[url'; echoing the label of '[url label'. */
	MWS_BRACKETED, MWS_DBLBRACKETED, MWS_DBLBRACKETEDPIPED,
		MWS_BRACKETEDURL, MWS_BRACKETEDURLTEXT,
	/* HTTP1..HTTP6: 'h', "ht", "htt", "http", "http:", "http:/" matched so
	 * far; URLLINK: "http://" matched, now inside the bare URL. */
	MWS_HTTP1, MWS_HTTP2, MWS_HTTP3, MWS_HTTP4, MWS_HTTP5, MWS_HTTP6,
		MWS_URLLINK,
	/* REDIR1..REDIR8: '#' through "#REDIREC" matched so far;
	 * REDIRECT: the whole of "#REDIRECT" matched. */
	MWS_REDIR1, MWS_REDIR2, MWS_REDIR3, MWS_REDIR4, MWS_REDIR5, MWS_REDIR6,
		MWS_REDIR7, MWS_REDIR8, MWS_REDIRECT,
	/* One, two, and three-or-more '=' seen. */
	MWS_EQUALS, MWS_DBLEQUALS, MWS_OVEREQUALS,
	/* One, two, three, and four-or-more '-' seen. */
	MWS_HYPHEN, MWS_DBLHYPHEN, MWS_TPLHYPHEN, MWS_OVERHYPHEN,
	/* Inside '<...>'; one '_' seen; after "__"; one '_' seen after the
	 * body of a possible magic word. */
	MWS_SGML, MWS_SOMEMAGIC, MWS_MAGIC, MWS_LESSMAGIC,
	/* After '{' (also entered after the first '}' that closes '{{...');
	 * after "{{"; inside "{{#..."; inside a "{{template" name;
	 * after "{|" but before the first '!'; after that first '!'. */
	MWS_BRACED, MWS_DBLBRACED, MWS_BRACEDIRECTIVE, MWS_TRANSCLUDE,
		MWS_TABLE, MWS_TABLEBODY
} mw_state;
/* Approximately strip MediaWiki markup.
 *
 * wikitext  - the markup to scan; only read.
 * plaintext - receives the text left over once markup has been removed. It is
 *             reset to the empty string first.
 * info      - receives the template names, categories, page-link targets and
 *             URLs met while scanning. Its bags are cleaned first.
 *
 * The scan is a single pass driven by a state machine over mw_state; a
 * character is appended to 'plaintext' unless the current state 'continue's
 * past the output at the bottom of the loop. Failures of the string and bag
 * helpers are not reported.
 */
void mw_stripMarkupRoughly(const upstr* restrict wikitext, upstr* restrict plaintext, mw_markupinfo* info) {
	/* Various 'parser' state (including that of the state machine). */
	mw_state state = MWS_STARTLINE;
	size_t start = 0; /* Index into the string where a state started, if it
			     may need to later output skipped characters */
	bool category_link = false; /* Whether the [[...|...]] link currently
				       being read targets a category, so
				       that its piped text is blanked too */

	/* Our arguments are initialised, but they might be dirty. */
	upstr_set(plaintext, "");
	mw_markupinfo_clean(info);

	/* Hokay. Let's seek through this thing, because we're not able to parse
	 * it /properly/, with a /grammar/ or anything.
	 *
	 * I hate this. :(
	 */
	for(size_t cursor = 0; cursor < wikitext->len; cursor++) {
		xmlChar* curschar = &wikitext->chars[cursor];

		/* Transitioning macro. Can be followed by an explicit else
		 * case which continues; otherwise, will reach the bottom of the
		 * for loop and output the character. Transitions eat characters.
		 */
#define TRANSITION(UPON,TO) \
	if(*curschar == (UPON)) { state = (TO); continue; }

		/* Non-consuming transition for recognising character sequences;
		 * implements an else case by dropping to normal state and
		 * reprocessing. */
#define TRANSITIONSEQ(NEXT,TO) \
	if(*curschar == (NEXT)) { state = (TO); } \
	else { state = MWS_NORMAL; goto reprocess; }

		/* General else case for repeating a partially-matched pattern,
		 * including dropping to normal and reprocessing. Useful if
		 * the matched pattern must not be output. */
#define PARTIALELSE(PARTIAL,LENGTH) \
	else { upstr_append(plaintext, (PARTIAL), LENGTH); \
		state = MWS_NORMAL; goto reprocess; }

		/* Say hello to one of those cases where 'goto' is the most
		 * elegant solution. This label can be used to reprocess a
		 * character after a state change, avoiding output and the
		 * for loop's increment. */
reprocess:

		if(*curschar == '\r' || *curschar == '\n') {
			/* Newlines override any but a few states. Whatever a
			 * state had matched but not yet output (a lone quote,
			 * '=', '-', ...) is dropped by this reset. */
			if(state != MWS_TABLE
			&& state != MWS_TABLEBODY) {
				/* A bare URL running up to the end of the line
				 * ends here; the whitespace test in
				 * MWS_URLLINK never sees newlines, so record
				 * it now or it would be lost. */
				if(state == MWS_URLLINK && cursor > start) {
					record_to_bag(info->urllinks,
						wikitext->chars,
						start, cursor - 1);
				}
				state = MWS_STARTLINE;
				/* Output and continue manually, to skip the
				 * switch for this character. */
				upstr_append(plaintext, curschar, 1);
				continue;
			}
		}

		switch(state) {
		case MWS_STARTLINE:
			/* Strip leading space, then handle as normal text */
			if(isspace(*curschar)) { continue; }
			state = MWS_NORMAL;
			goto reprocess;
			break;
		case MWS_NORMAL:
			     TRANSITION('\'', MWS_QUOTING)
			else TRANSITION('[', MWS_BRACKETED)
			else TRANSITION('#', MWS_REDIR1)
			else TRANSITION('=', MWS_EQUALS)
			else TRANSITION('-', MWS_HYPHEN)
			else TRANSITION('<', MWS_SGML)
			else TRANSITION('{', MWS_BRACED)
			else   if(*curschar == '_') {
				start = cursor;
				state = MWS_SOMEMAGIC;
				continue; /* nibble */
			} else if(*curschar == 'h') { /* Start recording URL */
				start = cursor;
				state = MWS_HTTP1; /* plaintext passthrough */
			}
			/* else output the character */
			break;
		case MWS_QUOTING: TRANSITION('\'', MWS_OVERQUOTING)
			PARTIALELSE(XMLLIT "'", 1) break;
		case MWS_OVERQUOTING:
			if(*curschar != '\'') {
				state = MWS_NORMAL;
				goto reprocess;
			} else { continue; } /* Quietly eat quotes */
			break;
		case MWS_BRACKETED:
			TRANSITION(']', MWS_NORMAL)
			else if(*curschar == '[') { /* Record name from next */
				state = MWS_DBLBRACKETED;
				start = cursor + 1;
				continue;
			} else { /* Start recording URL from here */
				state = MWS_BRACKETEDURL;
				start = cursor;
				continue;
			}
			break;
		case MWS_DBLBRACKETED: { /* give this a block for vars */
			/* Memo: [[foo]]->foo; [[foo|bar|baz]]->baz */
			size_t end = 0; /* init is purely to appease gcc */
			bool iscategory = false;
			if(*curschar == '|' || *curschar == ']') {
				/* If it starts with "Category:", it
				 * belongs in the 'categories' bag. */
				end = cursor - 1;
				if(end >= (start + 8)) {
					if(!memcmp(wikitext->chars + start,
						"Category:", 9))
						{ iscategory = true; }
				}
				record_to_bag(
					iscategory ? info->categories :
					             info->pagelinks,
					wikitext->chars, start, end);
			}

			       if(*curschar == '|') {
				/* Record replacement link text; remember
				 * whether it belongs to a category, whose
				 * piped arguments must not be output. */
				start = cursor + 1;
				category_link = iscategory;
				state = MWS_DBLBRACKETEDPIPED;
				continue;
			} else if(*curschar == ']') {
				/* Output the target as text */
				if(!iscategory && end >= start) {
					upstr_append(plaintext,
						wikitext->chars + start,
						(end - start) + 1);
				}
				state = MWS_BRACKETED;
				continue;
			} else { continue; } /* IT AM DELISHUS */
			break; } /* note end of block */
		case MWS_DBLBRACKETEDPIPED:
			       if(*curschar == '|') {
				/* Oh. /Next/ must be link text...? */
				start = cursor + 1;
				continue;
			} else if(*curschar == ']') {
				/* Aha! We've got the link text. A category's
				 * piped text (its sort key) is not content,
				 * so it is left out. */
				const size_t end = cursor - 1;
				if(!category_link && end >= start) {
					upstr_append(plaintext,
						wikitext->chars + start,
						(end - start) + 1);
				}
				state = MWS_BRACKETED;
				continue;
			} else { continue; } /* nyam, nyam, nyam */
			break;
		case MWS_BRACKETEDURL: { /* scope again */
			bool space = isspace(*curschar);
			if(space || *curschar == ']') { /* i has a link */
				const size_t end = cursor - 1;
				if(end >= start) {
					record_to_bag(info->urllinks,
						wikitext->chars, start, end);
				}
			}

			if(space) { /* Switch to echoing link text */
				state = MWS_BRACKETEDURLTEXT;
				continue;
			} else if(*curschar == ']') {
				/* Got an URL with no label; fake counter */
				upstr_append(plaintext, XMLLIT "1", 1);
				state = MWS_NORMAL;
				continue;
			} else { continue; } /* *burp* */
			break; }
		case MWS_BRACKETEDURLTEXT:
			TRANSITION(']', MWS_NORMAL) /* echo anything else */
			break;
		case MWS_HTTP1: TRANSITIONSEQ('t', MWS_HTTP2) break;
		case MWS_HTTP2: TRANSITIONSEQ('t', MWS_HTTP3) break;
		case MWS_HTTP3: TRANSITIONSEQ('p', MWS_HTTP4) break;
		case MWS_HTTP4: TRANSITIONSEQ(':', MWS_HTTP5) break;
		case MWS_HTTP5: TRANSITIONSEQ('/', MWS_HTTP6) break;
		case MWS_HTTP6: TRANSITIONSEQ('/', MWS_URLLINK) break;
		case MWS_URLLINK:
			/* The URL's characters are echoed as plaintext; it is
			 * recorded once whitespace ends it. (Newlines and the
			 * end of input are handled outside the switch.) */
			if(isspace(*curschar)) {
				const size_t end = cursor - 1;
				if(end > start) {
					record_to_bag(info->urllinks,
						wikitext->chars, start, end);
				}
				state = MWS_NORMAL;
			} /* cannot has cheeseburger */
			break;
		case MWS_REDIR1: TRANSITION('R', MWS_REDIR2)
			PARTIALELSE(XMLLIT "#", 1) break;
		case MWS_REDIR2: TRANSITION('E', MWS_REDIR3)
			PARTIALELSE(XMLLIT "#R", 2) break;
		case MWS_REDIR3: TRANSITION('D', MWS_REDIR4)
			PARTIALELSE(XMLLIT "#RE", 3) break;
		case MWS_REDIR4: TRANSITION('I', MWS_REDIR5)
			PARTIALELSE(XMLLIT "#RED", 4) break;
		case MWS_REDIR5: TRANSITION('R', MWS_REDIR6)
			PARTIALELSE(XMLLIT "#REDI", 5) break;
		case MWS_REDIR6: TRANSITION('E', MWS_REDIR7)
			PARTIALELSE(XMLLIT "#REDIR", 6) break;
		case MWS_REDIR7: TRANSITION('C', MWS_REDIR8)
			PARTIALELSE(XMLLIT "#REDIRE", 7) break;
		case MWS_REDIR8: TRANSITION('T', MWS_REDIRECT)
			PARTIALELSE(XMLLIT "#REDIREC", 8) break;
		case MWS_REDIRECT:
			/* Two choices:
			 *  - Black hole state. Stay here, feasting on
			 *    characters, until the global STARTLINE transition.
			continue;
			 *  - Reset to NORMAL immediately, basically only
			 *    eating the '#REDIRECT '. It is normally followed
			 *    by a [[page link]], so this is what we do. */
			state = MWS_NORMAL;
			continue; /* Don't reprocess; eat the space */
			break;
		case MWS_EQUALS: TRANSITION('=', MWS_DBLEQUALS)
			PARTIALELSE(XMLLIT "=", 1) break;
		/* NOTE(review): exactly two '=' are put back into the output
		 * here, although the parsing notes above the state enum list
		 * runs of two to six '=' (headings) for removal. Confirm which
		 * of the two is intended before changing this. */
		case MWS_DBLEQUALS: TRANSITION('=', MWS_OVEREQUALS)
			PARTIALELSE(XMLLIT "==", 2) break;
		case MWS_OVEREQUALS:
			if(*curschar != '=') {
				state = MWS_NORMAL;
				goto reprocess;
			} else { continue; } /* Eat the rest of the run too,
						rather than echoing it */
			break;
		case MWS_HYPHEN: TRANSITION('-', MWS_DBLHYPHEN)
			PARTIALELSE(XMLLIT "-", 1) break;
		case MWS_DBLHYPHEN: TRANSITION('-', MWS_TPLHYPHEN)
			PARTIALELSE(XMLLIT "--", 2) break;
		case MWS_TPLHYPHEN: TRANSITION('-', MWS_OVERHYPHEN)
			PARTIALELSE(XMLLIT "---", 3) break;
		case MWS_OVERHYPHEN:
			if(*curschar != '-') {
				state = MWS_NORMAL;
				goto reprocess;
			} else { continue; } /* Rules may be longer than four
						hyphens; eat them all */
			break;
		case MWS_SGML:
			TRANSITION('>', MWS_NORMAL)
			else { continue; } /* Tasty elements */
			break;
		case MWS_SOMEMAGIC: TRANSITION('_', MWS_MAGIC)
			PARTIALELSE(XMLLIT "_", 1) break;
		case MWS_MAGIC:
			TRANSITION('_', MWS_LESSMAGIC)
			else if(isspace(*curschar)) {
				/* Oh...wasn't magic, then */
				const size_t end = cursor - 1;
				if(end > start) {
					upstr_append(plaintext,
						wikitext->chars + start,
						(end - start) + 1);
				}
				state = MWS_NORMAL;
			} else { continue; } /* Spicy magic */
			break;
		case MWS_LESSMAGIC:
			TRANSITION('_', MWS_NORMAL)
			else { state = MWS_MAGIC; continue; }
			break;
		case MWS_BRACED:
			     TRANSITION('{', MWS_DBLBRACED)
			else TRANSITION('|', MWS_TABLE)
			else TRANSITION('}', MWS_NORMAL)
			else { /* Oh. Maybe we had a { on its own? */
				upstr_append(plaintext, XMLLIT "{", 1);
				state = MWS_NORMAL;
				goto reprocess;
			}
			break;
		case MWS_DBLBRACED:
			     TRANSITION('}', MWS_NORMAL)
			else TRANSITION('#', MWS_BRACEDIRECTIVE)
			else { /* Starting transclusion target (source?) */
				start = cursor;
				state = MWS_TRANSCLUDE;
				continue;
			}
			break;
		case MWS_BRACEDIRECTIVE:
			TRANSITION('}', MWS_BRACED)
			else { continue; } /* Not cheeze :( */
			break;
		case MWS_TRANSCLUDE:
			if(*curschar == '}') {
				/* 'start' is the first character of the name,
				 * so end == start is a valid one-character
				 * template name and must be recorded too. */
				const size_t end = cursor - 1;
				if(end >= start) {
					record_to_bag(info->templates,
						wikitext->chars, start, end);
				}
				state = MWS_BRACED;
			}
			continue;
			break;
		case MWS_TABLE:
			     TRANSITION('}', MWS_NORMAL)
			else TRANSITION('!', MWS_TABLEBODY)
			else { continue; } /* Yuck, preamble */
			break;
		case MWS_TABLEBODY:
			TRANSITION('}', MWS_NORMAL)
			else { /* Table noise is crunchy */
				if(*curschar == '!'
				|| *curschar == '|'
				|| *curschar == '-'
				|| *curschar == '+') { continue; }
			}
			break;
		}

		/* If reached here (no 'continue'), use the character */
		upstr_append(plaintext, curschar, 1);

		/* Keep the helper macros from leaking past this function. */
#undef TRANSITION
#undef TRANSITIONSEQ
#undef PARTIALELSE
	}

	/* A bare URL may also run right up to the end of the input, where no
	 * whitespace follows to trigger its recording. */
	if(state == MWS_URLLINK && wikitext->len > start) {
		record_to_bag(info->urllinks,
			wikitext->chars, start, wikitext->len - 1);
	}
}

/* Compare two bags of upstr pointers position by position.
 * Returns false if the bag lengths differ, or if any element of 'one' is not
 * upstr_eq() to the entry of 'two' found at the same loop index; returns true
 * otherwise. The comparison is therefore sensitive to element order. */
bool mw_bagofstrings_eq(const MD_Bag* one, const MD_Bag* two) {
	unsigned int pos;
	const upstr* lhs;
	void** rhs;

	if(MD_Bag_Length(one) != MD_Bag_Length(two)) {
		return false;
	}

	rhs = MD_Bag_Contents(two);
	MD_BAG_FOREACH(one, pos, lhs)
		if(!upstr_eq(lhs, rhs[pos])) {
			return false;
		}
	MD_BAG_FOREACH_DONE()

	return true;
}

