ldc/dmd2/html.c


// Copyright (c) 1999-2006 by Digital Mars
// All Rights Reserved
// written by Walter Bright
// http://www.digitalmars.com
// License for redistribution is by either the Artistic License
// in artistic.txt, or the GNU General Public License in gnu.txt.
// See the included readme.txt for details.


/* HTML parser
 */

#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <stdarg.h>
#include <errno.h>
#include <wchar.h>

#include "mars.h"
#include "html.h"

#include <assert.h>
#include "root.h"

// Named-entity lookup, defined outside this file.  Html::charEntity() passes
// it the start and length of an entity name and treats a result of -1 as
// "leave the '&' untranslated".
extern int HtmlNamedEntity(unsigned char *p, int length);

// Forward declaration; the definition is at the end of this file.
static int isLineSeparator(const unsigned char* p);

/**********************************
 * Character classification for tag identifiers.
 * istagstart(): may c begin a tag identifier (a letter or '_')?
 * istag():      may c continue a tag identifier (letter, digit or '_')?
 * Both return 1 for yes and 0 for no.
 */

inline int istagstart(int c)
{
    if (c == '_')
	return 1;
    return isalpha(c) != 0;
}

inline int istag(int c)
{
    // '_' is accepted in addition to letters and digits
    if (c == '_')
	return 1;
    return isalnum(c) != 0;
}

/**********************************************
 * Set up a scanner over an in-memory HTML file.
 * Input:
 *	sourcename	name printed in front of error messages
 *	base		first byte of the text to scan
 *	length		number of bytes starting at base
 * Scanning starts at base on line 1, outside of any <code> section.
 */

Html::Html(const char *sourcename, unsigned char *base, unsigned length)
{
    // Where the text lives
    this->sourcename = sourcename;
    this->base = base;
    this->end = base + length;

    // Scanner state
    this->p = base;
    this->linnum = 1;
    this->inCode = 0;

    // The output buffer is supplied later, by extractCode()
    this->dbuf = NULL;
}

/**********************************************
 * Report an HTML error.
 * The error is always counted in global.errors.  Unless global.gag
 * is set, a message of the form
 *	sourcename(linnum) : HTML Error: <formatted text>
 * is also written to stdout.  This function returns normally.
 * Input:
 *	format	printf-style format string, followed by its arguments
 */

void Html::error(const char *format, ...)
{
    global.errors++;

    if (global.gag)
	return;			// errors are counted but not shown

    printf("%s(%d) : HTML Error: ", sourcename, linnum);

    va_list args;
    va_start(args, format);
    vprintf(format, args);
    va_end(args);

    printf("\n");
    fflush(stdout);
}

/**********************************************
 * Walk the whole HTML text and collect the D source found inside
 * <code> sections into buf.
 *
 * Line breaks are copied to buf even outside of code sections so
 * that line numbers in the extracted text match the HTML file.
 * Scanning stops at a 0 or 0x1a byte, and a 0 byte is appended to
 * buf as an ending sentinel.
 */

void Html::extractCode(OutBuffer *buf)
{
    dbuf = buf;			// the tag/comment scanners write here too
    buf->reserve(end - p);
    inCode = 0;

    for (;;)
    {
	const unsigned char ch = *p;

	if (ch == 0 || ch == 0x1a)
	    break;		// end of file

	// Note: quotes are not treated as strings outside of tags.

	if (ch == '<')
	{
	    if (p[1] == '!' && isCommentStart())
	    {	// Comments start with <!--
		scanComment();
		continue;
	    }
	    if (p[1] == '!' && isCDATAStart())
	    {
		scanCDATA();
		continue;
	    }
	    if ((p[1] == '/' && istagstart(*skipWhite(p + 2))) ||
		istagstart(*skipWhite(p + 1)))
	    {	// opening or closing tag
		skipTag();
		continue;
	    }
	    // Not markup: handled below as an ordinary character
	}
	else if (ch == '&')
	{
	    if (inCode)
	    {	// Translate character entity for the D parser
		const int decoded = charEntity();
		buf->writeUTF8(decoded);
	    }
	    else
		p++;
	    continue;
	}
	else if (ch == '\n' || (ch == '\r' && p[1] != '\n'))
	{
	    // A line break.  The '\r' of a "\r\n" pair is not counted
	    // here; it is handled as an ordinary character below.
	    linnum++;
	    buf->writeByte(ch);
	    p++;
	    continue;
	}

	// Ordinary character: keep it only inside a code section
	if (inCode)
	    buf->writeByte(ch);
	p++;
    }

    buf->writeByte(0);				// ending sentinel
}

/***********************************************
 * Scan to end of <> tag.
 * Look for <code> and </code> tags to start/stop D processing.
 * Input:
 *	p is on opening '<' of tag; it's already verified that
 *	it's a tag by lookahead
 * Output:
 *	p is past closing '>' of tag, or on the end-of-file
 *	character if the tag was never closed
 *	inCode is incremented for <code> and decremented (never
 *	below 0) for </code>; "<code />" is ignored
 * Line breaks inside the tag are counted and copied to dbuf.
 * The tag name is matched against "CODE" as a whole, ignoring case.
 */

void Html::skipTag()
{
    enum TagState	// what parsing state we're in
    {
	TStagstart,	// start of tag name
	TStag,		// in a tag name
	TSrest,		// following tag name
    };
    enum TagState state = TStagstart;
    int inot;			// set if the tag starts with "</"
    unsigned char *tagstart = NULL;
    int taglen = 0;		// number of characters in the tag name

    p++;
    inot = 0;
    if (*p == '/')
    {	inot = 1;
	p++;
    }
    while (1)
    {
	switch (*p)
	{
	    case '>':		// found end of tag
		p++;
		break;

	    case '"':
	    case '\'':
		state = TSrest;
		skipString();
		continue;

	    case '<':
		if (p[1] == '!' && isCommentStart())
		{   // Comments start with <!--
		    scanComment();
		}
		else if (p[1] == '/' && istagstart(*skipWhite(p + 2)))
		{   error("nested tag");
		    skipTag();
		}
		else if (istagstart(*skipWhite(p + 1)))
		{   error("nested tag");
		    skipTag();
		}
		else
		{   // A stray '<' that starts neither a comment nor a tag.
		    // Step over it, otherwise p never advances and this
		    // loop would not terminate.
		    p++;
		}
		// Treat comments as if they were whitespace
		state = TSrest;
		continue;

	    case 0:
	    case 0x1a:
		error("end of file before end of tag");
		break;		// end of file

	    case '\r':
		if (p[1] == '\n')
		{   // Step over the '\r' of a "\r\n" pair; the '\n' is
		    // counted on the next pass.  Sending the '\r' through
		    // the default case would trip the assert below when
		    // the pair directly follows the '<'.
		    p++;
		    continue;
		}
	    case '\n':
		linnum++;
		// Always extract new lines, so that code lexer counts the
		// lines right.
		dbuf->writeByte(*p);
		state = TSrest;			// end of tag
		p++;
		continue;

	    case ' ':
	    case '\t':
	    case '\f':
	    case '\v':
		if (state == TStagstart)
		{   p++;
		    continue;
		}
	    default:
		switch (state)
		{
		    case TStagstart:		// start of tag name
			assert(istagstart(*p));
			state = TStag;
			tagstart = p;
			taglen = 1;		// count the first character too
			break;

		    case TStag:
			if (istag(*p))
			{   // Continuing tag name
			    taglen++;
			}
			else
			{   // End of tag name
			    state = TSrest;
			}
			break;

		    case TSrest:
			break;
		}
		p++;
		continue;
	}
	break;
    }

    // See if we parsed a <code> or </code> tag.  The name must be exactly
    // four characters long so that neither a prefix of "code" (<co>, <cod>)
    // nor a longer name starting with it (<codex>) is mistaken for it.
    if (taglen == 4 && memicmp((char *) tagstart, (char *) "CODE", 4) == 0
	&& *(p - 2) != '/') // ignore "<code />" (XHTML)
    {
	if (inot)
	{   inCode--;
	    if (inCode < 0)
		inCode = 0;		// ignore extra </code>'s
	}
	else
	    inCode++;
    }
}

/***********************************************
 * Scan to end of attribute string.
 */

void Html::skipString()
{
    int tc = *p;

    while (1)
    {
	p++;
	switch (*p)
	{
	    case '"':
	    case '\'':
		if (*p == tc)
		{   p++;
		    break;
		}
		continue;

	    case '\r':
		if (p[1] == '\n')
		    goto Ldefault;
	    case '\n':
		linnum++;
		// Always extract new lines, so that D lexer counts the
		// lines right.
		dbuf->writeByte(*p);
		continue;

	    case 0:
	    case 0x1a:
	    Leof:
		error("end of file before closing %c of string", tc);
		break;

	    default:
	    Ldefault:
		continue;
	}
	break;
    }
}

/*********************************
 * Skip white space starting at q.
 * White space here is space, tab, form feed, vertical tab,
 * carriage return and line feed.
 * Returns:
 *	pointer to the first character at or after q that is not
 *	white space (q itself if *q is not white space)
 * Neither p nor the line count is changed.
 */

unsigned char *Html::skipWhite(unsigned char *q)
{
    while (*q == ' '  || *q == '\t' || *q == '\f' ||
	   *q == '\v' || *q == '\r' || *q == '\n')
    {
	q++;
    }
    return q;
}

/***************************************************
 * Scan to end of comment.
 * Comments are defined any of a number of ways.
 * IE 5.0: <!-- followed by >
 * "HTML The Definitive Guide": <!-- text with at least one space in it -->
 * Netscape: <!-- --> comments nest
 * w3c: whitespace can appear between -- and > of comment close
 * Input:
 *	p is just past the opening <!--
 * Output:
 *	p is past the closing '>' of the comment, or on the
 *	end-of-file character if the comment was never closed
 *	(an error is reported in that case)
 * Line breaks inside the comment are counted and copied to dbuf.
 */

void Html::scanComment()
{
    // Most of the complexity is dealing with the case that
    // an arbitrary amount of whitespace can appear between
    // the -- and the > of a comment close.
    int scangt = 0;		// set while a "--" has been seen and only
				// white space / line breaks have followed

    //printf("scanComment()\n");

    // Every character is examined before p is advanced, including the one
    // directly after the opening "<!--".  Skipping that first character
    // would step over an end-of-file marker, would miss the close of an
    // empty "<!---->" comment, and would not count a lone '\r' there.
    for (;; p++)
    {
	//scangt = 1;			// IE 5.0 compatibility
	switch (*p)
	{
	    case '-':
		if (p[1] == '-')
		{
		    if (p[2] == '>')	// optimize for most common case
		    {
			p += 3;
			break;
		    }
		    p++;		// step onto the second '-'
		    scangt = 1;
		}
		else
		    scangt = 0;
		continue;

	    case '>':
		if (scangt)
		{   // found -->
		    p++;
		    break;
		}
		continue;

	    case ' ':
	    case '\t':
	    case '\f':
	    case '\v':
		// skip white space
		continue;

	    case '\r':
		if (p[1] == '\n')
		    goto Ldefault;
	    case '\n':
		linnum++;		// remember to count lines
		// Always extract new lines, so that D lexer counts the
		// lines right.
		dbuf->writeByte(*p);
		continue;

	    case 0:
	    case 0x1a:
		error("end of file before closing --> of comment");
		break;

	    default:
	    Ldefault:
		scangt = 0;		// it's not -->
		continue;
	}
	break;
    }
    //printf("*p = '%c'\n", *p);
}

/********************************************
 * Determine if we are at the start of a comment, i.e. at "<!"
 * followed by "--".  Spaces, tabs, carriage returns, form feeds
 * and vertical tabs between the '!' and the "--" are tolerated;
 * a line feed there is not.
 * Input:
 *	p is on the opening '<'
 * Returns:
 *	0 if not start of a comment, p is unchanged
 * 	1 if start of a comment, p is adjusted to point past --
 */

int Html::isCommentStart()
#ifdef __DMC__
    __out(result)
    {
	if (result == 0)
	    ;
	else if (result == 1)
	{
	    assert(p[-2] == '-' && p[-1] == '-');
	}
	else
	    assert(0);
    }
    __body
#endif /* __DMC__ */
    {
	if (p[0] != '<' || p[1] != '!')
	    return 0;

	// skip white space, even though spec says no
	// white space is allowed
	unsigned char *s = p + 2;
	while (*s == ' ' || *s == '\t' || *s == '\r' ||
	       *s == '\f' || *s == '\v')
	{
	    s++;
	}

	if (s[0] != '-' || s[1] != '-')
	    return 0;

	p = s + 2;
	return 1;
    }

/********************************************
 * Determine if we are at the start of a CDATA section.
 * Input:
 *	p is on the opening '<'
 * Returns:
 *	0 if the text at p is not "<![CDATA[", p is unchanged
 *	1 if it is, p is adjusted to point past the marker
 */

int Html::isCDATAStart()
{
    static const char marker[] = "<![CDATA[";
    const size_t markerLen = sizeof(marker) - 1;	// without the 0

    if (strncmp((char*)p, marker, markerLen) != 0)
	return 0;

    p += markerLen;
    return 1;
}

/********************************************
 * Scan to the end of a CDATA section.
 * Input:
 *	p is just past the opening "<![CDATA["
 * Output:
 *	p is past the closing "]]>", or on the end-of-file
 *	character (0 or 0x1A) if the section was never closed;
 *	no error is reported in that case
 * Every line separator is counted and written to dbuf as '\n'.
 * All other characters are copied to dbuf only while inside a
 * code section.
 */

void Html::scanCDATA()
{
    for (;;)
    {
	if (*p == 0 || *p == 0x1A)
	    return;			// end of file

	const int sepLen = isLineSeparator(p);
	if (sepLen > 0)
	{
	    /* Always extract new lines, so that D lexer counts the lines
	     * right.
	     */
	    linnum++;
	    dbuf->writeUTF8('\n');
	    p += sepLen;
	    continue;
	}

	if (p[0] == ']' && p[1] == ']' && p[2] == '>')
	{
	    /* end of CDATA section */
	    p += 3;
	    return;
	}

	if (inCode)
	{
	    /* this CDATA section contains D code */
	    dbuf->writeByte(*p);
	}
	p++;
    }
}

/********************************************
 * Convert an HTML character entity into a character.
 * Forms are:
 *	&name;		named entity
 *	&#ddd;		decimal
 *	&#xhhhh;	hex
 * Input:
 *	p is on the &
 * Output:
 *	p is past the entity: past the ';' when there is one, otherwise
 *	on the '\n', '\r' or '<' that ended it
 * Returns:
 *	the character value, with 160 (non-breaking space) replaced by ' '
 *	'&' if the entity is rejected; p is then left just past the '&'
 *	so that the rest of the text is scanned as ordinary characters
 * An error is reported for a malformed or out of range numeric reference
 * and for an end of file inside an entity.  A named entity for which
 * HtmlNamedEntity() returns -1 is rejected without an error.
 */

int Html::charEntity()
{   int c = 0;			// value accumulated / looked up so far
    int v;			// value of the current digit
    int hex;			// set for the &#x form
    unsigned char *pstart = p;	// the '&', needed to back up on rejection

    //printf("Html::charEntity('%c')\n", *p);
    if (p[1] == '#')
    {
	// Numeric character reference
	p++;
	if (p[1] == 'x' || p[1] == 'X')
	{   p++;
	    hex = 1;
	}
	else
	    hex = 0;
	if (p[1] == ';')
	    goto Linvalid;	// no digits at all; jumps into the switch below
	while (1)
	{
	    p++;
	    switch (*p)
	    {
		case 0:
		case 0x1a:
		    error("end of file before end of character entity");
		    goto Lignore;

		case '\n':
		case '\r':
		case '<':	// tag start
		    // Termination is assumed
		    // NOTE(review): if no digit was seen before this point,
		    // c is still 0 and 0 is returned.
		    break;

		case ';':
		    // Termination is explicit
		    p++;
		    break;

		case '0': case '1': case '2': case '3': case '4':
		case '5': case '6': case '7': case '8': case '9':
		    v = *p - '0';
		    goto Lvalue;

		case 'a': case 'b': case 'c':
		case 'd': case 'e': case 'f':
		    if (!hex)
			goto Linvalid;
		    v = (*p - 'a') + 10;
		    goto Lvalue;

		case 'A': case 'B': case 'C':
		case 'D': case 'E': case 'F':
		    if (!hex)
			goto Linvalid;
		    v = (*p - 'A') + 10;
		    goto Lvalue;

		Lvalue:
		    // Append the digit; checking the range after every digit
		    // also keeps c from growing without bound.
		    if (hex)
			c = (c << 4) + v;
		    else
			c = (c * 10) + v;
		    if (c > 0x10FFFF)
		    {
			error("character entity out of range");
			goto Lignore;
		    }
		    continue;

		default:
		Linvalid:
		    error("invalid numeric character reference");
		    goto Lignore;
	    }
	    break;
	}
    }
    else
    {
	// It's a named entity; gather all characters until ;
	unsigned char *idstart = p + 1;

	while (1)
	{
	    p++;
	    switch (*p)
	    {
		case 0:
		case 0x1a:
		    error("end of file before end of character entity");
		    // NOTE(review): unlike the numeric case this does not go
		    // to Lignore; c is still 0, so 0 is returned with p left
		    // on the end-of-file character.
		    break;

		case '\n':
		case '\r':
		case '<':	// tag start
		    // Termination is assumed
		    c = HtmlNamedEntity(idstart, p - idstart);
		    if (c == -1)
			goto Lignore;
		    break;

		case ';':
		    // Termination is explicit
		    c = HtmlNamedEntity(idstart, p - idstart);
		    if (c == -1)
			goto Lignore;
		    p++;
		    break;

		default:
		    continue;
	    }
	    break;
	}
    }

    // Kludge to convert non-breaking space to ascii space
    if (c == 160)
	c = ' ';

    return c;

Lignore:
    // Rejected: resume scanning right after the '&' and hand the '&' itself
    // back to the caller.
    //printf("Lignore\n");
    p = pstart + 1;
    return '&';
}

/**
 * Identify a line separator at the start of UTF-8 encoded text.
 * Recognized are "\n", "\r", "\r\n", U+2028 (line separator),
 * U+2029 (paragraph separator) and U+0085 (next line).
 * Returns:
 *	0 if p is not on a line separator
 *	>0 the length of the separator in bytes
 * Later bytes are only read after the earlier ones matched, so a
 * 0 terminator is never read past.
 */
static int isLineSeparator(const unsigned char* p)
{
    switch (p[0])
    {
	case '\n':
	    return 1;

	case '\r':			// alone, or first half of "\r\n"
	    if (p[1] == '\n')
		return 2;
	    return 1;

	case 0xE2:			// E2 80 A8 / E2 80 A9
	    if (p[1] == 0x80 && (p[2] == 0xA8 || p[2] == 0xA9))
		return 3;
	    return 0;

	case 0xC2:			// C2 85
	    if (p[1] == 0x85)
		return 2;
	    return 0;

	default:
	    return 0;
    }
}