/* These tools are Free Software, licensed under the MIT license. *
 *  Copyright 2007, Philip Boulain. See LICENSE.TXT for details.  */
#include <stdio.h>
#include <string.h>
#include "upstr.h" /* Provides libxml headers */

// Noisy Levenshtein comparisons
// #define DEBUG_LEVENSHTEIN_MATRIX

// Noisy distance comparisons
// #define DEBUG_DISTANCE

//// Generate an appropriate trace function based upon DEBUG_DISTANCE
// With DEBUG_DISTANCE defined, DBGDIST(fmt, ...) prints to stderr. Without
// it, DBGDIST resolves to an empty variadic function: the arguments are still
// evaluated, but nothing is printed.
#ifdef DEBUG_DISTANCE
	// Standard C99 variadic macro; every call site passes at least a format
	#define DBGDIST(...) fprintf(stderr, __VA_ARGS__)
#else
	// Named without leading underscores: such identifiers are reserved
	static inline void dbgdist_noop(const char* fmt, ...) { (void) fmt; }
	#define DBGDIST dbgdist_noop
#endif

////  EVERYONE LOVES A STRING LIBRARY  /////////////////////////////////////////
// Put `str` into the empty state: zero length, backed by a one-byte buffer
// holding only the terminator so that C library calls see a valid "".
// If even that byte cannot be allocated, a warning is printed and the string
// is left with a NULL buffer and zero allocation.
void upstr_init(upstr* str) {
	xmlChar* empty = malloc(1);

	str->len = 0;
	str->chars = empty;

	if(empty == NULL) {
		str->alloc = 0;
		fprintf(stderr, "WARNING! Couldn't allocate even a single byte!\n");
		// Anything that hands chars to printf, strtol, etc. will now blow up.
		return;
	}

	empty[0] = '\0';
	str->alloc = 1;
}

// Report whether two strings hold the same text.
// The stored lengths are compared first, so strings of differing length are
// rejected without scanning their bytes.
bool upstr_eq(const upstr* one, const upstr* two) {
	if(one->len != two->len) { return false; }
	return strcmp((const char*) one->chars, (const char*) two->chars) == 0;
}

// Replace the contents of `str` with a copy of the C string `cstr`.
// The previous buffer is released first. If the new buffer cannot be
// obtained, `str` is re-initialised as an empty string instead.
void upstr_set(upstr* str, const char* cstr) {
	const size_t text_len = strlen(cstr);
	const size_t with_null = text_len + 1; // room for the terminator too

	upstr_destroy(str); // drop the old allocation
	str->len = text_len;

	if(!upstr_expand(str, with_null)) {
		// upstr_expand has already printed a warning. Nothing is allocated
		// at this point, so re-initialising is safe and keeps str usable
		// by C library functions.
		upstr_init(str);
		return;
	}

	// Copies the text together with its terminator
	memcpy(str->chars, cstr, with_null);
}

// Copy the contents of `src` into `dest`, growing `dest` if necessary.
// If memory cannot be obtained, `dest` is left unchanged; upstr_expand will
// have printed a warning, and there is no return value to report it through.
void upstr_setu(upstr* dest, const upstr* src) {
	// Self-assignment is a no-op. Without this guard memcpy would be
	// called on exactly overlapping regions, which is undefined.
	if(dest == src) { return; }

	const size_t bytes = src->len + 1; // text plus terminator
	if(upstr_expand(dest, bytes)) {
		memcpy(dest->chars, src->chars, bytes);
		dest->len = src->len;
	}
}

// Ensure `str` has at least `required` bytes allocated.
// Capacity grows by doubling from the current allocation (minimum one byte),
// so repeated appends need only O(log N) reallocations.
// Returns true if the buffer is (now) large enough. Returns false if the
// allocation failed, in which case a warning is printed and the existing
// buffer and its contents are untouched.
bool upstr_expand(upstr* str, size_t required) {
	size_t allocate = str->alloc;

	if(allocate >= required) { return true; } // Already enough allocated
	if(!allocate) { allocate = 1; } // Always allocate at least one byte

	// Grow exponentially until the request fits
	while(allocate < required) {
		if(allocate > ((size_t) -1) / 2) {
			// Doubling again would wrap around to zero and loop for
			// ever; fall back to asking for exactly what is needed.
			allocate = required;
			break;
		}
		allocate *= 2;
	}

	xmlChar* lump = realloc(str->chars, allocate);
	if(!lump) {
		// realloc() guarantees the old memory is NOT freed; str is still OK
		fprintf(stderr, "Memory exhaustion; couldn't allocate %zu bytes for string.\n", allocate);
		return false;
	}

	str->chars = lump;
	str->alloc = allocate;
	return true;
}

// Append `len` bytes starting at `ch` to the end of `str`, keeping the result
// null-terminated. The data is handled as raw bytes; although it is UTF-8,
// nothing here needs to understand character boundaries.
// Returns true on success. Returns false, leaving `str` unchanged, if `len`
// is negative or if memory could not be obtained.
bool upstr_append(upstr* str, const xmlChar* ch, int len) {
	// A negative length would convert to an enormous size_t and be handed
	// straight to memcpy; refuse it instead.
	if(len < 0) { return false; }

	const size_t extra = (size_t) len;

	// One more byte than the text, to guarantee a null past the end
	if(!upstr_expand(str, str->len + extra + 1)) { return false; }

	// Skip the copy for empty input so a NULL `ch` is never passed to memcpy
	if(extra > 0) {
		memcpy(str->chars + str->len, ch, extra);
	}
	str->len += extra;
	str->chars[str->len] = '\0';
	return true;
}

// Worker for upstr_levenshtein. The caller must order the strings so that
// `one` is no longer than `two`. The shorter string is laid along the columns,
// so only two rows of (one->len + 1) costs are held at any time, while the
// longer string supplies the rows. Every cell of the matrix is still visited,
// so the work is the product of the two lengths whichever way round they go.
static size_t upstr_levenshtein_inner(const upstr* one, const upstr* two) {
	// Pull the string fields into locals; this loop nest is the hot path and
	// the strings are reached through pointers.
	const size_t width = one->len;  // string columns
	const size_t height = two->len; // string rows
	const xmlChar* across = one->chars;
	const xmlChar* down = two->chars;

	// A row has one extra leading column for "before the string", so the
	// string itself starts at COLUMN ONE. There is NO column for the null.
	const size_t row_bytes = sizeof(size_t) * (width + 1);
	size_t* row_a = malloc(row_bytes);
	size_t* row_b = malloc(row_bytes);

	// Be robust. This is not desirable if you'd rather be correct.
	if(row_a == NULL || row_b == NULL) {
		free(row_a); // free(NULL) does nothing
		free(row_b);
		fprintf(stderr, "Couldn't allocate %zd bytes for Levenshtein comparison! Faking result!\n", row_bytes);
		// Guess that the strings differ by as much as their lengths do.
		return height - width;
	}

	// The two buffers take turns being the row under construction and the
	// row above it. Row zero is built in row_a, so row one goes into row_b.
	size_t* above = row_a;
	size_t* here = row_b;

	// Fill in row zero
#ifdef DEBUG_LEVENSHTEIN_MATRIX
	fprintf(stderr, "   \t");
	for(size_t i = 0; i < width; i++ ) {
		fprintf(stderr, "%c\t", across[i]);
	}
	fprintf(stderr, "\n  ");
#endif
	for(size_t c = 0; c <= width; c++) {
		above[c] = c;
#ifdef DEBUG_LEVENSHTEIN_MATRIX
		fprintf(stderr, "%zd\t", c);
#endif
	}
#ifdef DEBUG_LEVENSHTEIN_MATRIX
	fprintf(stderr, "\n");
#endif

	// Walk the rows; row zero is the one before the string
	for(size_t r = 1; r <= height; r++) {
		const xmlChar row_char = down[r - 1];

		// Column zero of every row is just the row number
		here[0] = r;
#ifdef DEBUG_LEVENSHTEIN_MATRIX
		fprintf(stderr, "%c %zd\t", row_char, here[0]);
#endif
		for(size_t c = 1; c <= width; c++) {
			// Substitution: free when the characters already agree
			size_t best = above[c - 1];
			if(across[c - 1] != row_char) { best++; }

			// Insertion
			const size_t via_insert = here[c - 1] + 1;
			if(via_insert < best) { best = via_insert; }

			// Deletion
			const size_t via_delete = above[c] + 1;
			if(via_delete < best) { best = via_delete; }

			here[c] = best;
#ifdef DEBUG_LEVENSHTEIN_MATRIX
			fprintf(stderr, "%zd\t", here[c]);
#endif
		}
#ifdef DEBUG_LEVENSHTEIN_MATRIX
		fprintf(stderr, "\n");
#endif
		// The finished row becomes the row above for the next pass
		size_t* finished = here;
		here = above;
		above = finished;
	}

	// After the final swap the last completed row is `above`. If the loop
	// never ran, that is still row zero. The answer is its last column.
	const size_t result = above[width];

	free(row_a);
	free(row_b);
	return result;
}

/* Compute the Levenshtein distance between two strings.
 * Any prefix and suffix the strings have in common is stripped first, since
 * it cannot contribute to the distance; this shrinks the quadratic search at
 * only linear cost. The remainder is handed to upstr_levenshtein_inner with
 * the shorter string first, as that function requires. */
size_t upstr_levenshtein(const upstr* one, const upstr* two) {
	/* Fake strings which parasitically borrow the memory of the real
	 * strings, but do not include any start/end similarity. */
	upstr liesone;
	upstr liestwo;

	liesone.chars = one->chars;
	liestwo.chars = two->chars;
	liesone.len   = one->len;
	liestwo.len   = two->len;
	liesone.alloc = liestwo.alloc = 0; /* Irrelevant */

	/* Trim from the front. The lengths are tested BEFORE the characters
	 * are read, so nothing at or past the end of either string (or behind
	 * a NULL buffer of an empty string) is ever dereferenced. */
	while((liesone.len > 0) && (liestwo.len > 0) &&
		(*liesone.chars == *liestwo.chars)) {

		liesone.chars++; liesone.len--;
		liestwo.chars++; liestwo.len--;
	}

	/* Trim from the rear; this makes these upstrs even less valid, as they
	 * will not be correctly NULL terminated. Thankfully, levenshtein_inner
	 * doesn't care, because it stops short of them using len. Trimming may
	 * run a string right down to empty: removing a shared suffix never
	 * changes the distance, and the inner routine copes with length zero. */
	while((liesone.len > 0) && (liestwo.len > 0) &&
		(liesone.chars[liesone.len-1] == liestwo.chars[liestwo.len-1])) {

		liesone.len--;
		liestwo.len--;
	}

	/* And now do the real distance gruntwork, shorter string first */
	return (liesone.len <= liestwo.len) ?
		upstr_levenshtein_inner(&liesone, &liestwo) :
		upstr_levenshtein_inner(&liestwo, &liesone);
	/* Don't need to free lie strings, as they don't own their memory. */
}

/* Names one, both or neither of the two strings under comparison. ONE and TWO
 * occupy separate bits, and BOTH has the value of the two OR-ed together. */
typedef enum { STRING_NEITHER = 0x00, STRING_ONE = 0x01,
	STRING_TWO = 0x02, STRING_BOTH = 0x03 } which_string;
//static const int STRING_MASK_ONE = 0x01;
//static const int STRING_MASK_TWO = 0x02;
/* Compute a distance figure between two strings by walking a pair of pointers
 * (a "near" and a "far") through each of them.
 *
 * Each pass of the main loop:
 *   1. looks for the character under each far pointer inside the other
 *      string's [near, far) window, and pulls that other far pointer back to
 *      the first match found;
 *   2. if the characters under the two far pointers are equal, adds the larger
 *      of the two unaligned spans to the distance, then either finishes
 *      (adding whatever is left of the other string once one has reached its
 *      end) or steps both strings past the matched character;
 *   3. otherwise advances each far pointer that is not yet at its end.
 *
 * Returns the accumulated distance. Trace output is produced through DBGDIST.
 * NOTE(review): this looks like a cheaper approximation rather than an exact
 * Levenshtein distance (see the "comedy alignment" and disabled rectangular
 * re-alignment remarks below) -- confirm against upstr_levenshtein. */
size_t upstr_distance(const upstr* one, const upstr* two) {
	xmlChar* near1; /* Start-closest point into string */
	xmlChar* near2;   /* This is where synch was lost */
	xmlChar* far1; /* End-closest point into string */
	xmlChar* far2;   /* This is how far through we have got */
	size_t proc = 0; /* _Index_ processed to (further far pointer) */
	which_string procstr = STRING_NEITHER; /* Last string to push further */
	size_t distance = 0;
	bool go = true;

	near1 = far1 = one->chars;
	near2 = far2 = two->chars;
	/* End of each string (the NULLs); const to help compiler optimize, and
	 * to avoid them being accidentally trampled, causing humorous bugs. */
	xmlChar* const end1 = one->chars + one->len;
	xmlChar* const end2 = two->chars + two->len;

	DBGDIST("\nDistancing '%s' and '%s'\n", far1, far2);

	while(go) {
		// Horribly verbose performance-tracking spam
		//if(!((far1 - one->chars) % 100)) { fprintf(stderr, "(%zd,%zd)", far1-near1, far2-near2); }

		/* Newly synchronised far pointers; allows the two scans to be
		 * properly independent, else ordering can skew results.
		 * They stay null when the corresponding scan finds nothing. */
		xmlChar* synfar1 = 0;
		xmlChar* synfar2 = 0;

		DBGDIST("  %c->%c %c->%c %zd %zd -- ", *near1, *far1, *near2, *far2,
			proc, distance);

		/* Where can we find similarity? */
		/* First occurrence of string one's far character within string
		 * two's unsynchronised window... */
		if(near2 != far2) {
			for(xmlChar* scan2 = near2; scan2 < far2; scan2++) {
				if(*scan2 == *far1) {
					synfar2 = scan2;
					break;
				}
			}
		}
		/* ...and of string two's far character within string one's. */
		if(near1 != far1) {
			for(xmlChar* scan1 = near1; scan1 < far1; scan1++) {
			       	if(*scan1 == *far2) {
					synfar1 = scan1;
					break;
				}
			}
		}

		/* Apply at most one of the two resynchronisations. */
		if(synfar1 && synfar2) {
			/* Risk of comedy alignment miss */
			/* Both scans hit: keep the one whose match lies at the
			 * greater offset from the start of its own string. */
			if((synfar1 - one->chars) > (synfar2 - two->chars)) {
				far1 = synfar1;
			} else {
				far2 = synfar2;
			}
		}
		else if(synfar1) { far1 = synfar1; }
		else if(synfar2) { far2 = synfar2; }

		if(*far1 == *far2) {
			xmlChar* en1, * en2; /* Effective nears */
			/* The far edge of both strings is aligned; mop up any
			 * misalignment, and advance */
			DBGDIST("aligned");
			en1 = near1;
			en2 = near2;
			/* Clamp the effective nears so they never fall behind
			 * index proc-1.
			 * NOTE(review): proc starts at 0, so on the first
			 * alignment `chars + proc-1` forms a pointer one before
			 * the start of the buffer -- confirm this is intended. */
			if(en1 < one->chars + proc-1) { en1 = one->chars + proc-1; }
			if(en2 < two->chars + proc-1) { en2 = two->chars + proc-1; }
			/* Unaligned lengths; const for compiler's benefit again */
			const size_t un1 = (far1 > en1) ? (far1 - en1) : 0;
			const size_t un2 = (far2 > en2) ? (far2 - en2) : 0;
			/* if(un1 == un2) {
				// This /should/ handle perfectly balanced
				// (rectangular) re-alignments, which are an
				// edge case. But it doesn't.
				distance += un1;
				proc = far1 - one->chars;
				proc++;
				procstr = STRING_NEITHER;
			} else */ if(un1 > un2) {
				/* String one has the longer unaligned span: charge
				 * it, and record how far into string one we are. */
				distance += un1;
				proc = far1 - one->chars;
				if(procstr == STRING_TWO) { proc++; }
				procstr = STRING_ONE;
			} else {
				/* Likewise for string two; ties also land here. */
				distance += un2;
				proc = far2 - two->chars;
				if(procstr == STRING_ONE) { proc++; }
				procstr = STRING_TWO;
			}

			if(far1 == end1) {
				/* String one is exhausted: whatever remains of
				 * string two is added to the distance. */
				go = false;
				//ef = far2;
				//if(ef < two->chars + proc-1) { ef = two->chars + proc-1; }
				/* NOTE(review): %d is paired with a pointer
				 * difference here -- check if DEBUG_DISTANCE is
				 * ever enabled. */
				DBGDIST("\n    Ending with %d left in 2, d%zd '%s'",
					end2 - far2, distance, far2);
				distance += (end2 - far2);
			} else if(far2 == end2) {
				/* String two is exhausted: mirror of the above. */
				go = false;
				//ef = far1;
				//if(ef < one->chars + proc-1) { ef = one->chars + proc-1; }
				/* NOTE(review): same %d / pointer-difference
				 * pairing as above. */
				DBGDIST("\n    Ending with %d left in 1, d%zd '%s'",
					end1 - far1, distance, far1);
				distance += (end1 - far1);
			} else {
				/* Step past the matched character in both strings
				 * and restart the windows from there. */
				near1 = ++far1;
				near2 = ++far2;
				proc++;
			}
		} else {
			/* The strings are not at all aligned; advance.
			 * Will eventually align on the NULLs. */
			DBGDIST("not");
			if(far1 != end1) { far1++; }
			if(far2 != end2) { far2++; }
		}
		DBGDIST("\n");
	}

	DBGDIST("Final:  %c->%c %c->%c\n", *near1, *far1, *near2, *far2);
	DBGDIST("Final distance: %zd\n", distance);
	return distance;
}

// Release the buffer owned by `str`.
// Afterwards the string holds no memory and records zero allocation and zero
// length, so it can be re-initialised or handed to upstr_set / upstr_setu /
// upstr_append without a stale length pointing into a buffer that is gone.
void upstr_destroy(upstr* str) {
	free(str->chars);
	// Cleanly dealloc, such that we can be re-inited
	str->chars = NULL;
	str->alloc = 0;
	// Keep len consistent with the (now absent) buffer. Otherwise a later
	// append would place its data after `len` uninitialised bytes.
	str->len = 0;
}

