mirror of
https://github.com/ldc-developers/ldc.git
synced 2025-05-08 03:46:02 +03:00
718 lines
13 KiB
C
718 lines
13 KiB
C
|
|
// Copyright (c) 1999-2006 by Digital Mars
|
|
// All Rights Reserved
|
|
// written by Walter Bright
|
|
// http://www.digitalmars.com
|
|
// License for redistribution is by either the Artistic License
|
|
// in artistic.txt, or the GNU General Public License in gnu.txt.
|
|
// See the included readme.txt for details.
|
|
|
|
|
|
/* HTML parser
|
|
*/
|
|
|
|
#include <stdio.h>
|
|
#include <string.h>
|
|
#include <ctype.h>
|
|
#include <stdarg.h>
|
|
#include <errno.h>
|
|
#include <wchar.h>
|
|
|
|
#include "mars.h"
|
|
#include "html.h"
|
|
|
|
#include <assert.h>
|
|
#include "root.h"
|
|
|
|
extern int HtmlNamedEntity(unsigned char *p, int length);
|
|
|
|
static int isLineSeparator(const unsigned char* p);
|
|
|
|
/**********************************
|
|
* Determine if beginning of tag identifier
|
|
* or a continuation of a tag identifier.
|
|
*/
|
|
|
|
/* Returns 1 when c can begin a tag identifier (a letter or '_'), else 0. */
inline int istagstart(int c)
{
    if (c == '_')
        return 1;
    return isalpha(c) ? 1 : 0;
}
|
|
|
|
/* Returns 1 when c can continue a tag identifier (letter, digit or '_'), else 0. */
inline int istag(int c)
{
    if (c == '_')
        return 1;
    return isalnum(c) ? 1 : 0;
}
|
|
|
|
/**********************************************
|
|
*/
|
|
|
|
/* Set up a scanner over the `length` bytes starting at `base`.
 * `sourcename` is the name printed in diagnostics.
 */
Html::Html(const char *sourcename, unsigned char *base, unsigned length)
{
    //printf("Html::Html()\n");

    // Identification used by error()
    this->sourcename = sourcename;
    linnum = 1;

    // Input window: [base, end), with the cursor at the start
    this->base = base;
    end = base + length;
    p = base;

    // No output buffer yet; extractCode() installs one
    dbuf = NULL;
    inCode = 0;
}
|
|
|
|
/**********************************************
|
|
* Report an HTML error (printed unless gagged) and count it in global.errors; does not quit.
|
|
*/
|
|
|
|
void Html::error(const char *format, ...)
|
|
{
|
|
if (!global.gag)
|
|
{
|
|
printf("%s(%d) : HTML Error: ", sourcename, linnum);
|
|
|
|
va_list ap;
|
|
va_start(ap, format);
|
|
vprintf(format, ap);
|
|
va_end(ap);
|
|
|
|
printf("\n");
|
|
fflush(stdout);
|
|
}
|
|
|
|
global.errors++;
|
|
}
|
|
|
|
/**********************************************
|
|
* Extract all the code from an HTML file,
|
|
* concatenate it all together, and store in buf.
|
|
*/
|
|
|
|
/* Walk the whole HTML input and append the extracted code to buf.
 *
 * - Comments, CDATA sections and tags are handed to their scanners
 *   (skipTag() is what raises/lowers inCode).
 * - Line ends are always copied so that line numbers in the output match
 *   the input; other characters are copied only while inCode is nonzero.
 * - While inCode, "&..." entities are decoded and written as UTF-8.
 * - Scanning stops at a 0 or 0x1a byte; a terminating 0 is then appended.
 */
void Html::extractCode(OutBuffer *buf)
{
    //printf("Html::extractCode()\n");
    dbuf = buf;                 // the scanner helpers write through dbuf
    buf->reserve(end - p);
    inCode = 0;

    for (;;)
    {
        //printf("p = %p, *p = x%x\n", p, *p);
        const unsigned char ch = *p;

        if (ch == 0 || ch == 0x1a)
            break;              // end of file

        if (ch == '<')
        {
            // isCommentStart()/isCDATAStart() move p themselves on a match
            if (p[1] == '!' && isCommentStart())
            {
                scanComment();
                continue;
            }
            if (p[1] == '!' && isCDATAStart())
            {
                scanCDATA();
                continue;
            }
            if (p[1] == '/' && istagstart(*skipWhite(p + 2)))
            {
                skipTag();
                continue;
            }
            if (istagstart(*skipWhite(p + 1)))
            {
                skipTag();
                continue;
            }
            // Not markup: treat the '<' as an ordinary character below
        }
        else if (ch == '&')
        {
            if (inCode)
            {
                // Translate character entity for the D parser
                int decoded = charEntity();
                buf->writeUTF8(decoded);
            }
            else
                p++;
            continue;
        }
        else if (ch == '\n' || (ch == '\r' && p[1] != '\n'))
        {
            // A "\r\n" pair is counted once, at its '\n'; the '\r' is
            // handled as an ordinary character.
            linnum++;
            buf->writeByte(*p);
            p++;
            continue;
        }

        // Ordinary character: kept only inside code sections
        if (inCode)
            buf->writeByte(*p);
        p++;
    }

    buf->writeByte(0);          // ending sentinel
    //printf("D code is: '%s'\n", (char *)buf->data);
}
|
|
|
|
/***********************************************
|
|
* Scan to end of <> tag.
|
|
* Look for <code> and </code> tags to start/stop D processing.
|
|
* Input:
|
|
* p is on opening '<' of tag; it's already verified that
|
|
* it's a tag by lookahead
|
|
* Output:
|
|
* p is past closing '>' of tag
|
|
*/
|
|
|
|
void Html::skipTag()
|
|
{
|
|
enum TagState // what parsing state we're in
|
|
{
|
|
TStagstart, // start of tag name
|
|
TStag, // in a tag name
|
|
TSrest, // following tag name
|
|
};
|
|
enum TagState state = TStagstart;
|
|
int inot;
|
|
unsigned char *tagstart = NULL;
|
|
int taglen = 0;
|
|
|
|
p++;
|
|
inot = 0;
|
|
if (*p == '/')
|
|
{ inot = 1;
|
|
p++;
|
|
}
|
|
while (1)
|
|
{
|
|
switch (*p)
|
|
{
|
|
case '>': // found end of tag
|
|
p++;
|
|
break;
|
|
|
|
case '"':
|
|
case '\'':
|
|
state = TSrest;
|
|
skipString();
|
|
continue;
|
|
|
|
case '<':
|
|
if (p[1] == '!' && isCommentStart())
|
|
{ // Comments start with <!--
|
|
scanComment();
|
|
}
|
|
else if (p[1] == '/' && istagstart(*skipWhite(p + 2)))
|
|
{ error("nested tag");
|
|
skipTag();
|
|
}
|
|
else if (istagstart(*skipWhite(p + 1)))
|
|
{ error("nested tag");
|
|
skipTag();
|
|
}
|
|
// Treat comments as if they were whitespace
|
|
state = TSrest;
|
|
continue;
|
|
|
|
case 0:
|
|
case 0x1a:
|
|
error("end of file before end of tag");
|
|
break; // end of file
|
|
|
|
case '\r':
|
|
if (p[1] == '\n')
|
|
goto Ldefault;
|
|
case '\n':
|
|
linnum++;
|
|
// Always extract new lines, so that code lexer counts the
|
|
// lines right.
|
|
dbuf->writeByte(*p);
|
|
state = TSrest; // end of tag
|
|
p++;
|
|
continue;
|
|
|
|
case ' ':
|
|
case '\t':
|
|
case '\f':
|
|
case '\v':
|
|
if (state == TStagstart)
|
|
{ p++;
|
|
continue;
|
|
}
|
|
default:
|
|
Ldefault:
|
|
switch (state)
|
|
{
|
|
case TStagstart: // start of tag name
|
|
assert(istagstart(*p));
|
|
state = TStag;
|
|
tagstart = p;
|
|
taglen = 0;
|
|
break;
|
|
|
|
case TStag:
|
|
if (istag(*p))
|
|
{ // Continuing tag name
|
|
taglen++;
|
|
}
|
|
else
|
|
{ // End of tag name
|
|
state = TSrest;
|
|
}
|
|
break;
|
|
|
|
case TSrest:
|
|
break;
|
|
}
|
|
p++;
|
|
continue;
|
|
}
|
|
break;
|
|
}
|
|
|
|
// See if we parsed a <code> or </code> tag
|
|
if (taglen && memicmp((char *) tagstart, (char *) "CODE", taglen) == 0
|
|
&& *(p - 2) != '/') // ignore "<code />" (XHTML)
|
|
{
|
|
if (inot)
|
|
{ inCode--;
|
|
if (inCode < 0)
|
|
inCode = 0; // ignore extra </code>'s
|
|
}
|
|
else
|
|
inCode++;
|
|
}
|
|
}
|
|
|
|
/***********************************************
|
|
* Scan to end of attribute string.
|
|
*/
|
|
|
|
void Html::skipString()
|
|
{
|
|
int tc = *p;
|
|
|
|
while (1)
|
|
{
|
|
p++;
|
|
switch (*p)
|
|
{
|
|
case '"':
|
|
case '\'':
|
|
if (*p == tc)
|
|
{ p++;
|
|
break;
|
|
}
|
|
continue;
|
|
|
|
case '\r':
|
|
if (p[1] == '\n')
|
|
goto Ldefault;
|
|
case '\n':
|
|
linnum++;
|
|
// Always extract new lines, so that D lexer counts the
|
|
// lines right.
|
|
dbuf->writeByte(*p);
|
|
continue;
|
|
|
|
case 0:
|
|
case 0x1a:
|
|
Leof:
|
|
error("end of file before closing %c of string", tc);
|
|
break;
|
|
|
|
default:
|
|
Ldefault:
|
|
continue;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
/*********************************
|
|
* If p points to any white space, skip it
|
|
* and return pointer just past it.
|
|
*/
|
|
|
|
/* Return a pointer to the first character at or after q that is not
 * one of: space, tab, form feed, vertical tab, CR, LF.
 * Does not modify the scanner state.
 */
unsigned char *Html::skipWhite(unsigned char *q)
{
    while (*q == ' '  || *q == '\t' || *q == '\f' ||
           *q == '\v' || *q == '\r' || *q == '\n')
    {
        q++;
    }
    return q;
}
|
|
|
|
/***************************************************
|
|
* Scan to end of comment.
|
|
* Comments are defined any of a number of ways.
|
|
* IE 5.0: <!-- followed by >
|
|
* "HTML The Definitive Guide": <!-- text with at least one space in it -->
|
|
* Netscape: <!-- --> comments nest
|
|
* w3c: whitespace can appear between -- and > of comment close
|
|
*/
|
|
|
|
void Html::scanComment()
|
|
{
|
|
// Most of the complexity is dealing with the case that
|
|
// an arbitrary amount of whitespace can appear between
|
|
// the -- and the > of a comment close.
|
|
int scangt = 0;
|
|
|
|
//printf("scanComment()\n");
|
|
if (*p == '\n')
|
|
{ linnum++;
|
|
// Always extract new lines, so that D lexer counts the
|
|
// lines right.
|
|
dbuf->writeByte(*p);
|
|
}
|
|
while (1)
|
|
{
|
|
//scangt = 1; // IE 5.0 compatibility
|
|
p++;
|
|
switch (*p)
|
|
{
|
|
case '-':
|
|
if (p[1] == '-')
|
|
{
|
|
if (p[2] == '>') // optimize for most common case
|
|
{
|
|
p += 3;
|
|
break;
|
|
}
|
|
p++;
|
|
scangt = 1;
|
|
}
|
|
else
|
|
scangt = 0;
|
|
continue;
|
|
|
|
case '>':
|
|
if (scangt)
|
|
{ // found -->
|
|
p++;
|
|
break;
|
|
}
|
|
continue;
|
|
|
|
case ' ':
|
|
case '\t':
|
|
case '\f':
|
|
case '\v':
|
|
// skip white space
|
|
continue;
|
|
|
|
case '\r':
|
|
if (p[1] == '\n')
|
|
goto Ldefault;
|
|
case '\n':
|
|
linnum++; // remember to count lines
|
|
// Always extract new lines, so that D lexer counts the
|
|
// lines right.
|
|
dbuf->writeByte(*p);
|
|
continue;
|
|
|
|
case 0:
|
|
case 0x1a:
|
|
error("end of file before closing --> of comment");
|
|
break;
|
|
|
|
default:
|
|
Ldefault:
|
|
scangt = 0; // it's not -->
|
|
continue;
|
|
}
|
|
break;
|
|
}
|
|
//printf("*p = '%c'\n", *p);
|
|
}
|
|
|
|
/********************************************
|
|
* Determine if we are at the start of a comment.
|
|
* Input:
|
|
* p is on the opening '<'
|
|
* Returns:
|
|
* 0 if not start of a comment
|
|
* 1 if start of a comment, p is adjusted to point past --
|
|
*/
|
|
|
|
int Html::isCommentStart()
|
|
#ifdef __DMC__
|
|
__out(result)
|
|
{
|
|
if (result == 0)
|
|
;
|
|
else if (result == 1)
|
|
{
|
|
assert(p[-2] == '-' && p[-1] == '-');
|
|
}
|
|
else
|
|
assert(0);
|
|
}
|
|
__body
|
|
#endif /* __DMC__ */
|
|
{ unsigned char *s;
|
|
|
|
if (p[0] == '<' && p[1] == '!')
|
|
{
|
|
for (s = p + 2; 1; s++)
|
|
{
|
|
switch (*s)
|
|
{
|
|
case ' ':
|
|
case '\t':
|
|
case '\r':
|
|
case '\f':
|
|
case '\v':
|
|
// skip white space, even though spec says no
|
|
// white space is allowed
|
|
continue;
|
|
|
|
case '-':
|
|
if (s[1] == '-')
|
|
{
|
|
p = s + 2;
|
|
return 1;
|
|
}
|
|
goto No;
|
|
|
|
default:
|
|
goto No;
|
|
}
|
|
}
|
|
}
|
|
No:
|
|
return 0;
|
|
}
|
|
|
|
int Html::isCDATAStart()
|
|
{
|
|
const char * CDATA_START_MARKER = "<![CDATA[";
|
|
size_t len = strlen(CDATA_START_MARKER);
|
|
|
|
if (strncmp((char*)p, CDATA_START_MARKER, len) == 0)
|
|
{
|
|
p += len;
|
|
return 1;
|
|
}
|
|
else
|
|
{
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
void Html::scanCDATA()
|
|
{
|
|
while(*p && *p != 0x1A)
|
|
{
|
|
int lineSepLength = isLineSeparator(p);
|
|
if (lineSepLength>0)
|
|
{
|
|
/* Always extract new lines, so that D lexer counts the lines
|
|
* right.
|
|
*/
|
|
linnum++;
|
|
dbuf->writeUTF8('\n');
|
|
p += lineSepLength;
|
|
continue;
|
|
}
|
|
else if (p[0] == ']' && p[1] == ']' && p[2] == '>')
|
|
{
|
|
/* end of CDATA section */
|
|
p += 3;
|
|
return;
|
|
}
|
|
else if (inCode)
|
|
{
|
|
/* this CDATA section contains D code */
|
|
dbuf->writeByte(*p);
|
|
}
|
|
|
|
p++;
|
|
}
|
|
}
|
|
|
|
/********************************************
|
|
* Convert an HTML character entity into a character.
|
|
* Forms are:
|
|
* &name; named entity
|
|
* &#ddd; decimal
|
|
* &#xhhhh; hex
|
|
* Input:
|
|
* p is on the &
|
|
*/
|
|
|
|
int Html::charEntity()
|
|
{ int c = 0;
|
|
int v;
|
|
int hex;
|
|
unsigned char *pstart = p;
|
|
|
|
//printf("Html::charEntity('%c')\n", *p);
|
|
if (p[1] == '#')
|
|
{
|
|
p++;
|
|
if (p[1] == 'x' || p[1] == 'X')
|
|
{ p++;
|
|
hex = 1;
|
|
}
|
|
else
|
|
hex = 0;
|
|
if (p[1] == ';')
|
|
goto Linvalid;
|
|
while (1)
|
|
{
|
|
p++;
|
|
switch (*p)
|
|
{
|
|
case 0:
|
|
case 0x1a:
|
|
error("end of file before end of character entity");
|
|
goto Lignore;
|
|
|
|
case '\n':
|
|
case '\r':
|
|
case '<': // tag start
|
|
// Termination is assumed
|
|
break;
|
|
|
|
case ';':
|
|
// Termination is explicit
|
|
p++;
|
|
break;
|
|
|
|
case '0': case '1': case '2': case '3': case '4':
|
|
case '5': case '6': case '7': case '8': case '9':
|
|
v = *p - '0';
|
|
goto Lvalue;
|
|
|
|
case 'a': case 'b': case 'c':
|
|
case 'd': case 'e': case 'f':
|
|
if (!hex)
|
|
goto Linvalid;
|
|
v = (*p - 'a') + 10;
|
|
goto Lvalue;
|
|
|
|
case 'A': case 'B': case 'C':
|
|
case 'D': case 'E': case 'F':
|
|
if (!hex)
|
|
goto Linvalid;
|
|
v = (*p - 'A') + 10;
|
|
goto Lvalue;
|
|
|
|
Lvalue:
|
|
if (hex)
|
|
c = (c << 4) + v;
|
|
else
|
|
c = (c * 10) + v;
|
|
if (c > 0x10FFFF)
|
|
{
|
|
error("character entity out of range");
|
|
goto Lignore;
|
|
}
|
|
continue;
|
|
|
|
default:
|
|
Linvalid:
|
|
error("invalid numeric character reference");
|
|
goto Lignore;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
// It's a named entity; gather all characters until ;
|
|
unsigned char *idstart = p + 1;
|
|
|
|
while (1)
|
|
{
|
|
p++;
|
|
switch (*p)
|
|
{
|
|
case 0:
|
|
case 0x1a:
|
|
error("end of file before end of character entity");
|
|
break;
|
|
|
|
case '\n':
|
|
case '\r':
|
|
case '<': // tag start
|
|
// Termination is assumed
|
|
c = HtmlNamedEntity(idstart, p - idstart);
|
|
if (c == -1)
|
|
goto Lignore;
|
|
break;
|
|
|
|
case ';':
|
|
// Termination is explicit
|
|
c = HtmlNamedEntity(idstart, p - idstart);
|
|
if (c == -1)
|
|
goto Lignore;
|
|
p++;
|
|
break;
|
|
|
|
default:
|
|
continue;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
// Kludge to convert non-breaking space to ascii space
|
|
if (c == 160)
|
|
c = ' ';
|
|
|
|
return c;
|
|
|
|
Lignore:
|
|
//printf("Lignore\n");
|
|
p = pstart + 1;
|
|
return '&';
|
|
}
|
|
|
|
/**
|
|
* identify DOS, Linux, Mac, Next and Unicode line endings
|
|
* 0 if this is no line separator
|
|
* >0 the length of the separator
|
|
* Note: input has to be UTF-8
|
|
*/
|
|
/* Classify the bytes at p (UTF-8 input) as a line separator.
 * Returns the separator's length in bytes, or 0 if p is not on one:
 *      "\n"                     1
 *      "\r\n"                   2
 *      "\r" alone               1
 *      E2 80 A8 / E2 80 A9      3   (Unicode line / paragraph separator)
 *      C2 85                    2   (NEL)
 */
static int isLineSeparator(const unsigned char* p)
{
    switch (p[0])
    {
        case '\n':
            return 1;

        case '\r':
            return (p[1] == '\n') ? 2 : 1;

        case 0xE2:
            if (p[1] == 0x80 && (p[2] == 0xA8 || p[2] == 0xA9))
                return 3;
            return 0;

        case 0xC2:
            if (p[1] == 0x85)
                return 2;
            return 0;

        default:
            return 0;
    }
}
|
|
|