/*
 * Copyright (c) 2023 Walter Alejandro Iglesias <wai@roquesor.com>.
 * My own version of fmt (format paragraphs).
 */

#include <err.h>
#include <errno.h>
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <wchar.h>
#include <wctype.h>
#include <locale.h>

#define MAXWIDTH 1000	/* Largest line width accepted by the '-w' option */

/* ASCII punctuation, as wide-character constants */
#define PERIOD		L'.'
#define QUESTION	L'?'
#define EXCLAM		L'!'
#define BACKSLASH	L'\\'
#define ASTERISK	L'*'
#define QUOTEDBL	L'"'
#define APOSTROPHE	L'\''
#define PARENLEFT	L'('
#define PARENRIGHT	L')'
#define BRACKETLEFT	L'['
#define BRACKETRIGHT	L']'
#define BRACELEFT	L'{'
#define BRACERIGHT	L'}'
#define MINUS		L'-'
#define GREATER		L'>'

/*
 * Non-ASCII punctuation, written as Unicode universal character names.
 * These are wide-character code points, not UTF-8 byte sequences.
 */
#define GUILLEMOTLEFT		L'\u00ab'
#define GUILLEMOTRIGHT		L'\u00bb'
#define QUESTIONDOWN		L'\u00bf'
#define EXCLAMDOWN		L'\u00a1'
#define LEFTDOUBLEQUOTEMARK	L'\u201c'
#define RIGHTDOUBLEQUOTEMARK	L'\u201d'
#define LEFTSINGLEQUOTEMARK	L'\u2018'
#define RIGHTSINGLEQUOTEMARK	L'\u2019'
#define ELLIPSIS		L'\u2026'
#define EMDASH			L'\u2014'
#define ENDASH			L'\u2013'

/*
 * Character sets used by the sentence detection code.  Each array is
 * NUL-terminated so it can be passed directly to wcsspn().
 */

/* Characters that may end a sentence. */
const wchar_t END_OF_SENTENCE[] = { PERIOD, QUESTION, EXCLAM,
    ELLIPSIS, '\0' };
/*
 * Opening punctuation skipped in front of a word when looking for the
 * first letter of the next sentence.
 */
const wchar_t OQUOTE[] = { QUOTEDBL, APOSTROPHE, PARENLEFT,
    EXCLAMDOWN, QUESTIONDOWN, GUILLEMOTLEFT, LEFTDOUBLEQUOTEMARK,
    LEFTSINGLEQUOTEMARK, ASTERISK, BRACELEFT, BRACKETLEFT, EMDASH,
    ENDASH, MINUS, '\0' };
/*
 * Closing punctuation skipped backwards (see cquote_count()) when
 * looking for the character that really ends a sentence.
 */
const wchar_t CQUOTE[] = { QUOTEDBL, APOSTROPHE, PARENRIGHT,
    GUILLEMOTRIGHT, RIGHTDOUBLEQUOTEMARK, RIGHTSINGLEQUOTEMARK, ASTERISK,
    BRACERIGHT, BRACKETRIGHT, EMDASH, ENDASH, MINUS, '\0' };

/* Settings changed from the command line in main(). */
int width = 72;		/* Maximum columns; overridden by '-w' */
int troff = 0;		/* '-b': break sentences with a newline */
int dot = 1;		/* Leave lines beginning with a dot alone; '-n' clears it */
int ind_par = 0;	/* '-p': indent the whole paragraph */
int lcase = 0;		/* '-o': allow sentences to begin with a lower case letter */
int header = 0;		/* State: set and cleared by is_header() while scanning */
int mail = 0;		/* '-m': skip mail headers and quoted text */
int nested = 0;		/* State: set and cleared by is_nested() (tbl, pic, eqn blocks) */

/*
 * Formatting passes.  Each one takes a heap-allocated wide string,
 * frees it and returns a newly allocated result.
 */
wchar_t	*clean_trailing(wchar_t *);
wchar_t	*collapse_whitespace(wchar_t *);
wchar_t	*join_lines(wchar_t *);
wchar_t	*separate_sentences(wchar_t *);
wchar_t	*wrap(wchar_t *);
/* Read a whole stream, run every pass over it and write the result. */
void	 filecopy(FILE *, FILE *);
/* Helpers used by the passes to classify the text at a given position. */
size_t	 cquote_count(wchar_t *, size_t);
void	 is_header(wchar_t *);
void	 is_nested(wchar_t *);
int	 is_initial(wchar_t *, size_t);
void	 usage(void);

/*
 * Parse the command line, then format every named file (or standard
 * input when no file is given) to standard output.
 *
 * Returns 0 on success and 1 when at least one file could not be
 * opened.  The exit status is tracked explicitly: errno is not a
 * reliable indicator because library calls may leave it non-zero even
 * when they succeed.
 */
int
main(int argc, char *argv[])
{
	FILE *fp;
	char *end;
	long user_width;
	int option;
	int status = 0;

	while ((option = getopt(argc, argv, "bhmnopw:")) != -1)
		switch (option) {
		case 'b':
			troff = 1;
			break;
		case 'h':
			usage();
			break;
		case 'm':
			mail = 1;
			break;
		case 'n':
			dot = 0;
			break;
		case 'o':
			lcase = 1;
			break;
		case 'p':
			ind_par = 1;
			break;
		case 'w':
			/*
			 * strtol() instead of atoi() so that empty input
			 * and trailing garbage ("72x") are rejected.
			 * Overflowing values saturate and fail the range
			 * check below.
			 */
			user_width = strtol(optarg, &end, 10);
			if (end == optarg || *end != '\0' ||
			    user_width <= 0 || user_width > MAXWIDTH)
				errx(1, "'-w' option accepts only positive"
				     " integers up to %d", MAXWIDTH);
			width = (int)user_width;
			break;
		default:
			usage();
		}

	argc -= optind;
	argv += optind;

	if (argc == 0)
		filecopy(stdin, stdout);

	for (; argc > 0; argc--, argv++) {
		if ((fp = fopen(*argv, "r")) == NULL) {
			/* Report it and carry on with the next file. */
			warn("%s", *argv);
			status = 1;
			continue;
		}
		filecopy(fp, stdout);
		fclose(fp);
	}

	return status;
}

void
filecopy(FILE * ifp, FILE * ofp)
{
	size_t i = 0;
	size_t buf = 0;
	wchar_t *p = NULL;
	wint_t c;

	setlocale(LC_CTYPE, "");

	while ((c = getwc(ifp)) != WEOF) {
		if (i == buf) {
			p = realloc(p, (buf + 100) * sizeof(wint_t));
			if (p == NULL)
				err(1, NULL);
			buf += 100;
		}

		/* Strip control characters */
		if (!iswcntrl(c) || (troff && c == 0x01) ||
		     c == '\t' || c == '\n')
			p[i++] = c;
	}
	if (i == buf) {
		p = realloc(p, (buf + 1) * sizeof(wint_t));
		if (p == NULL)
			err(1, NULL);
	}
	p[i] = '\0';

	p = clean_trailing(p);
	p = join_lines(p);
	p = collapse_whitespace(p);
	p = separate_sentences(p);
	p = wrap(p);

	i = 0;
	while (p[i] != '\0') {
		putwc(p[i], ofp);
		i++;
	}

	free(p);
}

/*
 * Remove spaces and tabs that immediately precede a newline.
 *
 * r must be a heap-allocated, NUL-terminated wide string; it is freed.
 * Returns a newly allocated string that the caller must free.  The
 * program terminates if memory cannot be allocated.
 */
wchar_t *
clean_trailing(wchar_t *r)
{
	size_t i, n, run;
	wchar_t *s;

	/*
	 * The result is never longer than the input, so one allocation
	 * of len + 1 elements is enough.  It also avoids malloc(0),
	 * which may return NULL for empty input.
	 */
	s = malloc((wcslen(r) + 1) * sizeof(*s));
	if (s == NULL)
		err(1, NULL);

	i = n = 0;
	while (r[n] != '\0') {
		if (iswblank(r[n])) {
			/* Drop the run only when it ends at a newline. */
			run = wcsspn(&r[n], L" \t");
			if (r[n + run] == '\n')
				n += run;
		}
		s[i++] = r[n++];
	}
	s[i] = '\0';

	free(r);
	return s;
}

/*
 * Join the lines of each paragraph by replacing newlines with spaces.
 *
 * A newline is kept when it belongs to a blank line, when it precedes
 * or ends a line starting with '.', '\'' or '\\' (unless '-n' was
 * given without '-b'), while inside a nested block or a mail header,
 * and when it is the last character of the text.
 *
 * r must be a heap-allocated, NUL-terminated wide string; it is
 * modified in place and then freed.  Returns a newly allocated string.
 * The program terminates if memory cannot be allocated.
 */
wchar_t *
join_lines(wchar_t *r)
{
	size_t i, n;
	wchar_t *s;
	int dotline, blankline;

	/*
	 * Output has the same length as the input.  Allocating len + 1
	 * up front avoids malloc(0), which may return NULL for empty
	 * input.
	 */
	s = malloc((wcslen(r) + 1) * sizeof(*s));
	if (s == NULL)
		err(1, NULL);

	i = n = 0;
	dotline = blankline = 0;
	while (r[n] != '\0') {
		if (r[n] == '\n' && (n == 0 || r[n + 1] == '\n'))
			blankline = 1;
		if (blankline && r[n] != '\n')
			blankline = 0;

		if (troff || dot) {
			/* The newline in front of a dot line is kept too. */
			if (r[n] == '\n' &&
			    (r[n + 1] == PERIOD ||
			     r[n + 1] == APOSTROPHE ||
			     r[n + 1] == BACKSLASH))
				dotline = 1;

			/* first column */
			if (n == 0 || r[n - 1] == '\n') {
				dotline = (r[n] == PERIOD ||
				    r[n] == APOSTROPHE ||
				    r[n] == BACKSLASH);

				is_nested(&r[n]);
			}
		}

		if (mail && (n == 0 || r[n - 1] == '\n'))
			is_header(&r[n]);

		/*
		 * r is rewritten in place on purpose: the "first column"
		 * tests above look at r[n - 1], so a joined newline must
		 * no longer look like a line start.
		 */
		if (!dotline && !blankline && !nested && !header &&
		    r[n] == '\n' && r[n + 1] != '\0')
			r[n] = ' ';

		s[i++] = r[n++];
	}
	s[i] = '\0';

	free(r);
	return s;
}

/*
 * Collapse every run of spaces and tabs into a single space.
 *
 * The indentation at the start of a line is left untouched, and so
 * are blank lines, lines starting with '.', '\'' or '\\' (unless '-n'
 * was given without '-b'), nested blocks and mail headers.
 *
 * r must be a heap-allocated, NUL-terminated wide string; it is
 * modified in place and then freed.  Returns a newly allocated string.
 * The program terminates if memory cannot be allocated.
 */
wchar_t *
collapse_whitespace(wchar_t *r)
{
	size_t i, n;
	wchar_t *s;
	int firstind, dotline, blankline;

	/*
	 * Output is never longer than the input.  Allocating len + 1 up
	 * front avoids malloc(0), which may return NULL for empty input.
	 */
	s = malloc((wcslen(r) + 1) * sizeof(*s));
	if (s == NULL)
		err(1, NULL);

	i = n = 0;
	firstind = dotline = blankline = 0;
	while (r[n] != '\0') {
		if (r[n] == '\n' && (n == 0 || r[n + 1] == '\n'))
			blankline = 1;
		if (blankline && r[n] != '\n')
			blankline = 0;

		if (troff || dot) {
			if (r[n] == '\n' &&
			    (r[n + 1] == PERIOD ||
			     r[n + 1] == APOSTROPHE ||
			     r[n + 1] == BACKSLASH))
				dotline = 1;

			/* first column */
			if (n == 0 || r[n - 1] == '\n') {
				dotline = (r[n] == PERIOD ||
				    r[n] == APOSTROPHE ||
				    r[n] == BACKSLASH);

				is_nested(&r[n]);
			}
		}

		if (mail && (n == 0 || r[n - 1] == '\n'))
			is_header(&r[n]);

		/* firstind stays set while inside a line's leading blanks. */
		if (n == 0 || r[n - 1] == '\n')
			firstind = 1;
		if (firstind && !iswblank(r[n]))
			firstind = 0;

		if (!dotline && !blankline && !nested && !header &&
		    !firstind) {
			/* Keep only the last blank of a run, as a space. */
			while (iswblank(r[n]) && iswblank(r[n + 1]))
				n++;
			if (r[n] == '\t')
				r[n] = ' ';
		}
		s[i++] = r[n++];
	}
	s[i] = '\0';

	free(r);
	return s;
}

/*
 * Mark sentence boundaries.
 *
 * A whitespace character is a boundary when the text before it,
 * ignoring closing quotes, ends with a character from END_OF_SENTENCE,
 * is not recognized by is_initial(), and the text after it, ignoring
 * opening quotes, starts with an upper case letter or a backslash
 * (any character when '-o' was given).  With '-b' the boundary becomes
 * a newline; otherwise it becomes two spaces.
 *
 * Blank lines, lines starting with '.', '\'' or '\\' (unless '-n' was
 * given without '-b'), nested blocks and mail headers are copied
 * unchanged.
 *
 * r must be a heap-allocated, NUL-terminated wide string; it is
 * modified in place and then freed.  Returns a newly allocated string.
 * The program terminates if memory cannot be allocated.
 */
wchar_t *
separate_sentences(wchar_t *r)
{
	size_t i, n, end, next;
	size_t cap, spare;
	wchar_t *s;
	int dotline, blankline;

	/*
	 * cap covers every input character plus the terminator; spare
	 * counts the additional slots available for inserted blanks.
	 * Allocating len + 1 also avoids malloc(0) on empty input.
	 */
	cap = wcslen(r) + 1;
	spare = 0;
	s = malloc(cap * sizeof(*s));
	if (s == NULL)
		err(1, NULL);

	i = n = 0;
	dotline = blankline = 0;
	while (r[n] != '\0') {
		if (r[n] == '\n' && (n == 0 || r[n + 1] == '\n'))
			blankline = 1;
		if (blankline && r[n] != '\n')
			blankline = 0;

		if (troff || dot) {
			if (r[n] == '\n' &&
			    (r[n + 1] == PERIOD ||
			     r[n + 1] == APOSTROPHE ||
			     r[n + 1] == BACKSLASH))
				dotline = 1;

			/* first column */
			if (n == 0 || r[n - 1] == '\n') {
				dotline = (r[n] == PERIOD ||
				    r[n] == APOSTROPHE ||
				    r[n] == BACKSLASH);

				is_nested(&r[n]);
			}
		}

		if (mail && (n == 0 || r[n - 1] == '\n'))
			is_header(&r[n]);

		if (!dotline && !blankline && !nested && !header &&
		    n >= 1 && iswspace(r[n]) && r[n + 1] != '\0' &&
		    !is_initial(&r[n], n)) {
			/*
			 * cquote_count() returns a negated count, so this
			 * unsigned addition steps back over the closing
			 * quotes in front of the whitespace.
			 */
			end = n - 1 + cquote_count(&r[n - 1], n - 1);
			/* First character after any opening quotes. */
			next = n + 1 + wcsspn(&r[n + 1], OQUOTE);

			/* 'end < n' rejects an index that wrapped around. */
			if (end < n &&
			    wcsspn(&r[end], END_OF_SENTENCE) > 0 &&
			    (lcase || r[next] == BACKSLASH ||
			    iswupper(r[next]))) {
				if (troff)
					r[n] = '\n';
				else {
					/* Double the buffer when full. */
					if (spare == 0) {
						spare = cap;
						cap += spare;
						s = realloc(s,
						    cap * sizeof(*s));
						if (s == NULL)
							err(1, NULL);
					}
					spare--;
					s[i++] = ' ';
					r[n] = ' ';
				}
			}
		}
		s[i++] = r[n++];
	}
	s[i] = '\0';

	free(r);
	return s;
}

/*
 * Break lines so that they do not exceed 'width' columns.
 *
 * A whitespace character is turned into a newline when the word that
 * follows it would not fit on the current line; any blanks after that
 * point are dropped.  A tab is counted as 8 columns.  With '-p' the
 * indentation found at the start of a paragraph is copied in front of
 * every following line of that paragraph.
 *
 * Blank lines, lines starting with '.', '\'' or '\\' (unless '-n' was
 * given without '-b'), nested blocks and mail headers are copied
 * unchanged.
 *
 * r must be a heap-allocated, NUL-terminated wide string; it is
 * modified in place and then freed.  Returns a newly allocated string.
 * The program terminates if memory cannot be allocated.
 */
wchar_t *
wrap(wchar_t *r)
{
	size_t i, n, sp, tmp, col, blank, word_width, indlen;
	size_t cap;
	wchar_t *s;
	wchar_t *ind = NULL;
	int blankline, dotline;

	/*
	 * cap covers every input character plus the terminator and is
	 * enlarged whenever an indentation is inserted.  Allocating
	 * len + 1 also avoids malloc(0) on empty input.
	 */
	cap = wcslen(r) + 1;
	s = malloc(cap * sizeof(*s));
	if (s == NULL)
		err(1, NULL);

	col = 1;
	i = n = sp = 0;
	dotline = blankline = 0;
	while (r[n] != '\0') {
		if (r[n] == '\n' && (n == 0 || r[n + 1] == '\n'))
			blankline = 1;
		if (blankline && r[n] != '\n')
			blankline = 0;

		if (troff || dot) {
			if (r[n] == '\n' &&
			   (r[n + 1] == PERIOD ||
			    r[n + 1] == APOSTROPHE ||
			    r[n + 1] == BACKSLASH))
				dotline = 1;

			/* first column */
			if (n == 0 || r[n - 1] == '\n') {
				dotline = (r[n] == PERIOD ||
				    r[n] == APOSTROPHE ||
				    r[n] == BACKSLASH);
				is_nested(&r[n]);
			}
		}

		if (mail && (n == 0 || r[n - 1] == '\n'))
			is_header(&r[n]);

		/* Copy first line indentation (-p) */
		if (!ind_par)
			;
		else if (n == 0 && !blankline && !dotline &&
		    !nested && !header) {
			/* Indentation of the very first line of the text. */
			sp = wcsspn(&r[n], L" \t");
			if (sp != 0) {
				ind = malloc((sp + 1) * sizeof(*ind));
				if (ind == NULL)
					err(1, NULL);
				wmemcpy(ind, &r[n], sp);
				ind[sp] = '\0';
			}
		} else if (dotline || blankline || nested || header) {
			/*
			 * Look one character ahead: when the next line is
			 * indented, remember that indentation for the
			 * paragraph that may start there.
			 */
			sp = wcsspn(&r[n + 1], L" \t");
			if (sp != 0) {
				ind = realloc(ind, (sp + 1) * sizeof(*ind));
				if (ind == NULL)
					err(1, NULL);
				wmemcpy(ind, &r[n + 1], sp);
				ind[sp] = '\0';
			}
		}

		if (dotline || blankline || nested || header)
			;
		else if (iswspace(r[n]) && (n == 0 || !iswblank(r[n - 1]))) {
			/* Wrap lines */
			blank = wcsspn(&r[n], L" \t");
			/*
			 * Stop at a newline as well: otherwise the last
			 * word of a paragraph is measured together with
			 * the newline and the text after it, and gets
			 * wrapped although it fits.
			 */
			word_width = wcscspn(&r[n + blank], L" \t\n");
			if (col + blank + word_width > (size_t)width + 1 &&
			    r[n + 1] != '\0') {
				while (iswblank(r[n + 1]))
					n++;
				r[n] = '\n';
			}
		}

		if (r[n] == '\n')
			col = 0;

		if (ind_par && !blankline && !dotline && !nested &&
		    !header && sp != 0 &&
		    r[n] == '\n' && r[n + 1] != '\0') {
			/* Emit the newline, then the saved indentation. */
			s[i++] = '\n';
			n++;
			col = 1;

			indlen = wcslen(ind);
			cap += indlen;
			s = realloc(s, cap * sizeof(*s));
			if (s == NULL)
				err(1, NULL);

			for (tmp = 0; tmp < indlen; tmp++) {
				col += (ind[tmp] == '\t') ? 8 : 1;
				s[i++] = ind[tmp];
			}
		}

		col += (r[n] == '\t') ? 8 : 1;

		s[i++] = r[n++];
	}
	s[i] = '\0';

	free(ind);
	free(r);
	return s;
}

/*
 * Count the closing-quote characters (see CQUOTE) found when walking
 * backwards from s[0].
 *
 * s points into a text buffer and n is the index of s[0] within that
 * buffer.  The walk stops at the first character that is not in
 * CQUOTE and never steps further back than the start of the buffer.
 *
 * The count is returned negated, as an unsigned value, so that callers
 * can add it to an index to step back over the quotes.  With k quotes
 * the result is (size_t)0 - k, and k is at most n.
 */
size_t
cquote_count(wchar_t *s, size_t n)
{
	const wchar_t *p = s;
	size_t k = 0;

	/*
	 * 'k < n' is the bounds check.  The previous test, n + i >= 0,
	 * was always true for an unsigned type, so the scan could run
	 * past the beginning of the buffer.
	 */
	while (k < n && wcsspn(p, CQUOTE) > 0) {
		p--;
		k++;
	}
	return (size_t)0 - k;
}

/*
 * Update the 'nested' flag from the request at the start of a line:
 * .TS, .PS and .EQ set it, .TE, .PE and .EN clear it.  Any other text
 * leaves the flag as it is.
 */
void
is_nested(wchar_t *s)
{
	static const wchar_t *const opening[] = { L".TS", L".PS", L".EQ" };
	static const wchar_t *const closing[] = { L".TE", L".PE", L".EN" };
	size_t k;

	for (k = 0; k < sizeof(opening) / sizeof(opening[0]); k++) {
		if (wcsncmp(s, opening[k], 3) == 0) {
			nested = 1;
			break;
		}
	}
	for (k = 0; k < sizeof(closing) / sizeof(closing[0]); k++) {
		if (wcsncmp(s, closing[k], 3) == 0) {
			nested = 0;
			break;
		}
	}
}

/*
 * Update the 'header' flag from the start of a line: an empty line
 * clears it, a line beginning with "From " or with '>' sets it.  Any
 * other line leaves the flag as it is.
 */
void
is_header(wchar_t *s)
{
	if (*s == '\n')
		header = 0;
	else if (*s == GREATER || wcsncmp(s, L"From ", 5) == 0)
		header = 1;
}

/*
 * Try to recognize some initialisms and abbreviations.
 *
 * r points at a whitespace character and i is the index of that
 * character in the text buffer.  Closing quotes directly in front of
 * the whitespace are skipped before the text is examined.
 *
 * Returns 1 when the text in front of r is a single letter followed by
 * a period, one of the known abbreviations followed by a period, or an
 * ASCII ellipsis ("...") standing alone; the caller then does not
 * treat r as a sentence boundary.  Returns 0 otherwise.
 */
int
is_initial(wchar_t *r, size_t i)
{
	const wchar_t *end;	/* first character after the period */
	size_t k;		/* closing quotes in front of r */
	size_t m;		/* index of 'end' in the buffer */

	/* Nothing precedes the first character of the buffer. */
	if (i == 0)
		return 0;

	/*
	 * cquote_count() returns the negated count.  Clamp it so that
	 * end[-1] is always inside the buffer.
	 */
	k = (size_t)0 - cquote_count(&r[-1], i - 1);
	if (k > i - 1)
		k = i - 1;
	end = r - k;
	m = i - k;

	/*
	 * All bounds below are checked against m rather than i: every
	 * access is relative to 'end', which lies k characters before r.
	 */
	if (end[-1] == PERIOD) {
		/* 1. One letter alone + period, at the start of the text, */
		if (m == 2 && iswalpha(end[-2]))
			return 1;
		/* after whitespace, */
		if (m >= 3 && iswspace(end[-3]) && iswalpha(end[-2]))
			return 1;
		/* or after whitespace and one opening quote. */
		if (m >= 4 && iswspace(end[-4]) &&
		    wcsspn(&end[-3], OQUOTE) > 0 && iswalpha(end[-2]))
			return 1;

		/* 2. Abbreviations (added some from Spanish.) */
		if ((m == 3 || (m >= 4 && (iswspace(end[-4]) ||
		    wcsspn(&end[-4], OQUOTE) > 0))) &&
		    (wcsncmp(&end[-3], L"Mr", 2) == 0 ||
		    wcsncmp(&end[-3], L"Ms", 2) == 0 ||
		    wcsncmp(&end[-3], L"Dr", 2) == 0 ||
		    wcsncmp(&end[-3], L"Sr", 2) == 0 ||
		    wcsncmp(&end[-3], L"Ed", 2) == 0 ||
		    wcsncmp(&end[-3], L"ed", 2) == 0 ||
		    wcsncmp(&end[-3], L"vs", 2) == 0 ||
		    /* "EE. UU." */
		    (wcsncmp(&end[-3], L"EE", 2) == 0 &&
		    wcsncmp(&r[1], L"UU.", 3) == 0)))
			return 1;

		if ((m == 4 || (m >= 5 && (iswspace(end[-5]) ||
		    wcsspn(&end[-5], OQUOTE) > 0))) &&
		    (wcsncmp(&end[-4], L"i.e", 3) == 0 ||
		    wcsncmp(&end[-4], L"e.g", 3) == 0 ||
		    wcsncmp(&end[-4], L"Mrs", 3) == 0 ||
		    wcsncmp(&end[-4], L"Fig", 3) == 0 ||
		    wcsncmp(&end[-4], L"Dra", 3) == 0 ||
		    wcsncmp(&end[-4], L"Sra", 3) == 0))
			return 1;

		if (m >= 5 &&
		    (wcsncmp(&end[-5], L"Prof", 4) == 0 ||
		    wcsncmp(&end[-5], L"Ibid", 4) == 0))
			return 1;
	}

	/*
	 * Skip ASCII ellipsis at the beginning of a
	 * sentence
	 */
	if (i == 3 && wcsncmp(&r[-3], L"...", 3) == 0)
		return 1;
	if (i >= 4 && wcsncmp(&r[-4], L" ...", 4) == 0)
		return 1;

	return 0;
}

/*
 * Print the usage message to standard error and exit with status 1.
 * The synopsis lists exactly the options accepted by getopt() in
 * main(): b, h, m, n, o, p and w.
 */
void
usage(void)
{
	extern char *__progname;

	fprintf(stderr,
		"Usage: %s [-bhmnop] [-w width] [file ...]\n"
		"  -b   break sentences with a new line\n"
		"  -h   print this help\n"
		"  -m   try to skip mail headers and quoted text\n"
		"  -n   format also lines beginning with a dot character\n"
		"  -o   lowercase letters can begin a sentence (you "
		"may need this with\n"
		"         man pages)\n"
		"  -p   indent the whole paragraph copying the first line "
		"indentation\n"
		"  -w   set maximum line width (default 72 columns)\n",
		__progname);
	exit(1);
}
