// Copyright (c) 1999-2006 by Digital Mars // All Rights Reserved // written by Walter Bright // http://www.digitalmars.com // License for redistribution is by either the Artistic License // in artistic.txt, or the GNU General Public License in gnu.txt. // See the included readme.txt for details. /* HTML parser */ #include #include #include #include #include #include #include "mars.h" #include "html.h" #include #include "root.h" extern int HtmlNamedEntity(unsigned char *p, int length); static int isLineSeparator(const unsigned char* p); /********************************** * Determine if beginning of tag identifier * or a continuation of a tag identifier. */ inline int istagstart(int c) { return (isalpha(c) || c == '_'); } inline int istag(int c) { return (isalnum(c) || c == '_'); } /********************************************** */ Html::Html(const char *sourcename, unsigned char *base, unsigned length) { //printf("Html::Html()\n"); this->sourcename = sourcename; this->base = base; p = base; end = base + length; linnum = 1; dbuf = NULL; inCode = 0; } /********************************************** * Print error & quit. */ void Html::error(const char *format, ...) { if (!global.gag) { printf("%s(%d) : HTML Error: ", sourcename, linnum); va_list ap; va_start(ap, format); vprintf(format, ap); va_end(ap); printf("\n"); fflush(stdout); } global.errors++; } /********************************************** * Extract all the code from an HTML file, * concatenate it all together, and store in buf. */ void Html::extractCode(OutBuffer *buf) { //printf("Html::extractCode()\n"); dbuf = buf; // save for other routines buf->reserve(end - p); inCode = 0; while (1) { //printf("p = %p, *p = x%x\n", p, *p); switch (*p) { #if 0 // strings are not recognized outside of tags case '"': case '\'': skipString(); continue; #endif case '<': if (p[1] == '!' 
&& isCommentStart()) { // Comments start with * Netscape: comments nest * w3c: whitespace can appear between -- and > of comment close */ void Html::scanComment() { // Most of the complexity is dealing with the case that // an arbitrary amount of whitespace can appear between // the -- and the > of a comment close. int scangt = 0; //printf("scanComment()\n"); if (*p == '\n') { linnum++; // Always extract new lines, so that D lexer counts the // lines right. dbuf->writeByte(*p); } while (1) { //scangt = 1; // IE 5.0 compatibility p++; switch (*p) { case '-': if (p[1] == '-') { if (p[2] == '>') // optimize for most common case { p += 3; break; } p++; scangt = 1; } else scangt = 0; continue; case '>': if (scangt) { // found --> p++; break; } continue; case ' ': case '\t': case '\f': case '\v': // skip white space continue; case '\r': if (p[1] == '\n') goto Ldefault; case '\n': linnum++; // remember to count lines // Always extract new lines, so that D lexer counts the // lines right. dbuf->writeByte(*p); continue; case 0: case 0x1a: error("end of file before closing --> of comment"); break; default: Ldefault: scangt = 0; // it's not --> continue; } break; } //printf("*p = '%c'\n", *p); } /******************************************** * Determine if we are at the start of a comment. 
 * Input:
 *      p is on the opening '<'
 * Returns:
 *      0 if not start of a comment
 *      1 if start of a comment, p is adjusted to point past --
 * A comment start is "<!" followed by "--". Blanks, tabs, carriage
 * returns, form feeds and vertical tabs between the "<!" and the "--"
 * are tolerated. p is left unchanged when 0 is returned.
 */

int Html::isCommentStart()
#ifdef __DMC__
    // Digital Mars contract: the result is 0 or 1, and when it is 1
    // the two bytes just before p are the "--".
    __out(result)
    {
        if (result == 0)
            ;
        else if (result == 1)
        {
            assert(p[-2] == '-' && p[-1] == '-');
        }
        else
            assert(0);
    }
    __body
#endif /* __DMC__ */
    {   unsigned char *s;       // look-ahead cursor; p only moves on success

        if (p[0] == '<' && p[1] == '!')
        {
            for (s = p + 2; 1; s++)
            {
                switch (*s)
                {
                    case ' ':
                    case '\t':
                    case '\r':
                    case '\f':
                    case '\v':
                        // skip white space, even though spec says no
                        // white space is allowed
                        continue;

                    case '-':
                        if (s[1] == '-')
                        {
                            // commit: step over the "--"
                            p = s + 2;
                            return 1;
                        }
                        goto No;

                    default:
                        // anything else (a '\n' included) is not a comment start
                        goto No;
                }
            }
        }
    No:
        return 0;
    }

/* NOTE(review): the next line is damaged and is kept verbatim. It starts as
 * Html::isCDATAStart() and, in the middle of the string literal that
 * initializes CDATA_START_MARKER, continues with what looks like the tail of
 * a CDATA scanning loop (it counts line separators, stops at "]]>" and
 * copies bytes to dbuf while inCode is set). The text in between is missing
 * from this copy of the file and has to be restored from a pristine source.
 */
int Html::isCDATAStart() { const char * CDATA_START_MARKER = "0) { /* Always extract new lines, so that D lexer counts the lines * right. */ linnum++; dbuf->writeUTF8('\n'); p += lineSepLength; continue; } else if (p[0] == ']' && p[1] == ']' && p[2] == '>') { /* end of CDATA section */ p += 3; return; } else if (inCode) { /* this CDATA section contains D code */ dbuf->writeByte(*p); } p++; } }

/********************************************
 * Convert an HTML character entity into a character.
* Forms are: * &name; named entity * &#ddd; decimal * &#xhhhh; hex * Input: * p is on the & */ int Html::charEntity() { int c = 0; int v; int hex; unsigned char *pstart = p; //printf("Html::charEntity('%c')\n", *p); if (p[1] == '#') { p++; if (p[1] == 'x' || p[1] == 'X') { p++; hex = 1; } else hex = 0; if (p[1] == ';') goto Linvalid; while (1) { p++; switch (*p) { case 0: case 0x1a: error("end of file before end of character entity"); goto Lignore; case '\n': case '\r': case '<': // tag start // Termination is assumed break; case ';': // Termination is explicit p++; break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': v = *p - '0'; goto Lvalue; case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': if (!hex) goto Linvalid; v = (*p - 'a') + 10; goto Lvalue; case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': if (!hex) goto Linvalid; v = (*p - 'A') + 10; goto Lvalue; Lvalue: if (hex) c = (c << 4) + v; else c = (c * 10) + v; if (c > 0x10FFFF) { error("character entity out of range"); goto Lignore; } continue; default: Linvalid: error("invalid numeric character reference"); goto Lignore; } break; } } else { // It's a named entity; gather all characters until ; unsigned char *idstart = p + 1; while (1) { p++; switch (*p) { case 0: case 0x1a: error("end of file before end of character entity"); break; case '\n': case '\r': case '<': // tag start // Termination is assumed c = HtmlNamedEntity(idstart, p - idstart); if (c == -1) goto Lignore; break; case ';': // Termination is explicit c = HtmlNamedEntity(idstart, p - idstart); if (c == -1) goto Lignore; p++; break; default: continue; } break; } } // Kludge to convert non-breaking space to ascii space if (c == 160) c = ' '; return c; Lignore: //printf("Lignore\n"); p = pstart + 1; return '&'; } /** * identify DOS, Linux, Mac, Next and Unicode line endings * 0 if this is no line separator * >0 the length of the separator * Note: input has to be UTF-8 */ 
static int isLineSeparator(const unsigned char* p)
{
    // Dispatch on the first byte. Later bytes are only inspected once the
    // bytes before them have matched.
    switch (p[0])
    {
        case '\n':
            // Linux: LF
            return 1;

        case '\r':
            // Dos: CR LF, Mac: lone CR
            if (p[1] == '\n')
                return 2;
            return 1;

        case 0xE2:
            // Unicode line separator (E2 80 A8) or
            // paragraph separator (E2 80 A9)
            if (p[1] != 0x80)
                return 0;
            if (p[2] == 0xA8 || p[2] == 0xA9)
                return 3;
            return 0;

        case 0xC2:
            // Next: NEL (C2 85)
            return (p[1] == 0x85) ? 2 : 0;

        default:
            // not a line separator
            return 0;
    }
}