+// Copyright (C) 2006 Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+/**
+ * @fileoverview
+ * some functions for browser-side pretty printing of code contained in html.
+ *
+ * The lexer should work on a number of languages including C and friends,
+ * Java, Python, Bash, SQL, HTML, XML, CSS, Javascript, and Makefiles.
+ * It works passably on Ruby, PHP and Awk and a decent subset of Perl, but,
+ * because of commenting conventions, doesn't work on Smalltalk, Lisp-like, or
+ * CAML-like languages.
+ *
+ * If there's a language not mentioned here, then I don't know it, and don't
+ * know whether it works. If it has a C-like, Bash-like, or XML-like syntax
+ * then it should work passably.
+ *
+ * Usage:
+ * 1) include this source file in an html page via
+ *    <script type="text/javascript" src="/path/to/prettify.js"></script>
+ * 2) define style rules. See the example page for examples.
+ * 3) mark the <pre> and <code> tags in your source with class=prettyprint.
+ * You can also use the (html deprecated) <xmp> tag, but the pretty printer
+ * needs to do more substantial DOM manipulations to support that, so some
+ * css styles may not be preserved.
+ * That's it. I wanted to keep the API as simple as possible, so there's no
+ * need to specify which language the code is in.
+ *
+ * Change log:
+ * cbeust, 2006/08/22
+ * Java annotations (start with "@") are now captured as literals ("lit")
+ */
/**
 * Set of every word treated as a keyword in any of the target languages.
 * Each key is a keyword and each value is `true`; look a word up with
 * `PR_keywords[word]`.
 */
var PR_keywords = {};
/** initialize the keyword list for our target languages. */
(function () {
  // Each list is a single space-separated string; the lists overlap and are
  // simply merged into PR_keywords below.
  var CPP_KEYWORDS = (
    "bool break case catch char class const const_cast continue default " +
    "delete deprecated dllexport dllimport do double dynamic_cast else enum " +
    "explicit extern false float for friend goto if inline int long mutable " +
    "naked namespace new noinline noreturn nothrow novtable operator private " +
    "property protected public register reinterpret_cast return selectany " +
    "short signed sizeof static static_cast struct switch template this " +
    "thread throw true try typedef typeid typename union unsigned using " +
    // "declaration, using directive" used to follow "using" here. It is prose
    // from a reference table, not keywords, and registered the bogus entries
    // "declaration," and "directive".
    "uuid virtual void volatile while typeof");
  var JAVA_KEYWORDS = (
    "abstract default goto package synchronized boolean do if private this " +
    "break double implements protected throw byte else import public throws " +
    "case enum instanceof return transient catch extends int short try char " +
    "final interface static void class finally long strictfp volatile const " +
    "float native super while continue for new switch");
  var PYTHON_KEYWORDS = (
    "and assert break class continue def del elif else except exec finally " +
    "for from global if import in is lambda not or pass print raise return " +
    "try while yield False True None");
  var JSCRIPT_KEYWORDS = (
    "abstract boolean break byte case catch char class const continue " +
    "debugger default delete do double else enum export extends false final " +
    "finally float for function goto if implements import in instanceof int " +
    "interface long native new null package private protected public return " +
    "short static super switch synchronized this throw throws transient " +
    "true try typeof var void volatile while with NaN Infinity");
  var PERL_KEYWORDS = (
    "foreach require sub unless until use elsif BEGIN END");
  var SH_KEYWORDS = (
    "if then do done else fi end");
  var RUBY_KEYWORDS = (
    "if then elsif else end begin do rescue ensure while for class module " +
    "def yield raise until unless and or not when case super undef break " +
    "next redo retry in return alias defined");
  var KEYWORDS = [
    CPP_KEYWORDS, JAVA_KEYWORDS, PYTHON_KEYWORDS, JSCRIPT_KEYWORDS,
    PERL_KEYWORDS, SH_KEYWORDS, RUBY_KEYWORDS
  ];
  for (var k = 0; k < KEYWORDS.length; k++) {
    var kw = KEYWORDS[k].split(' ');
    for (var i = 0; i < kw.length; i++) {
      // skip empty strings produced by doubled spaces
      if (kw[i]) { PR_keywords[kw[i]] = true; }
    }
  }
})();
// Token style names. Each value is the css class name used for that kind of
// token.
/** token style for a string literal */
var PR_STRING = 'str';
/** token style for a keyword */
var PR_KEYWORD = 'kwd';
/** token style for a comment */
var PR_COMMENT = 'com';
/** token style for a type */
var PR_TYPE = 'typ';
/** token style for a literal value. e.g. 1, null, true. */
var PR_LITERAL = 'lit';
/** token style for a punctuation string. */
var PR_PUNCTUATION = 'pun';
/** token style for plain text that has no more specific style. */
var PR_PLAIN = 'pln';
/** token style for an sgml tag. */
var PR_TAG = 'tag';
/** token style for a markup declaration such as a DOCTYPE. */
var PR_DECLARATION = 'dec';
/** token style for embedded source. */
var PR_SOURCE = 'src';
/** token style for an sgml attribute name. */
var PR_ATTRIB_NAME = 'atn';
/** token style for an sgml attribute value. */
var PR_ATTRIB_VALUE = 'atv';
/** the position of the end of a token. A division of a string into
 * n tokens can be represented as a series of n - 1 token ends, as long as
 * runs of whitespace warrant their own token.
 * @constructor
 * @param end the index just past the token; must be a number.
 * @param style the style of the token that ends here; may be null but not
 *     undefined.
 * @throws Error if style is undefined or end is not a number.
 * @private
 */
function PR_TokenEnd(end, style) {
  if (undefined === style) { throw new Error('BAD'); }
  if ('number' !== typeof end) { throw new Error('BAD'); }
  this.end = end;
  this.style = style;
}
/** debug representation: "[PR_TokenEnd <end>:<style>]", the ":<style>" part
 * being omitted when the style is falsy.
 */
PR_TokenEnd.prototype.toString = function () {
  return '[PR_TokenEnd ' + this.end +
    (this.style ? ':' + this.style : '') + ']';
};
/** a chunk of text with a style. These are used to represent both the output
 * from the lexing functions as well as intermediate results.
 * @constructor
 * @param token the token text
 * @param style one of the token style names (e.g. PR_PLAIN), or null
 *     for a styleless token, such as an embedded html tag.
 * @throws Error if style is undefined.
 * @private
 */
function PR_Token(token, style) {
  if (undefined === style) { throw new Error('BAD'); }
  this.token = token;
  this.style = style;
}
/** debug representation: "[PR_Token <text>:<style>]", the ":<style>" part
 * being omitted when the style is falsy.
 */
PR_Token.prototype.toString = function () {
  return '[PR_Token ' + this.token + (this.style ? ':' + this.style : '') + ']';
};
/** a helper class that decodes common html entities used to escape source and
 * markup punctuation characters in html.
 * After each call to decode, `ch` holds the decoded character and `next`
 * holds the index of the first character after the consumed input.
 * @constructor
 * @private
 */
function PR_DecodeHelper() {
  this.next = 0;
  this.ch = '\0';
}
/** decodes the character or entity that starts at index i of s.
 * Recognizes &lt; &gt; &quot; &apos; and &amp; (case-insensitively). Any
 * other '&' is returned as a literal '&' that consumes one character.
 * @param s the html text.
 * @param i the index in s at which to decode.
 * @return the decoded character, also stored in this.ch.
 */
PR_DecodeHelper.prototype.decode = function (s, i) {
  var next = i + 1;
  var ch = s.charAt(i);
  if ('&' === ch) {
    var semi = s.indexOf(';', next);
    // The longest names handled here (quot, apos) are 4 characters, so the
    // ';' may sit up to 4 places after the '&'. The bound used to be a
    // strict '<', which made the quot and apos branches unreachable.
    if (semi >= 0 && semi <= next + 4) {
      var entityName = s.substring(next, semi).toLowerCase();
      next = semi + 1;
      switch (entityName) {
        case 'lt': ch = '<'; break;
        case 'gt': ch = '>'; break;
        case 'quot': ch = '"'; break;
        case 'apos': ch = '\''; break;
        case 'amp': ch = '&'; break;
        default:
          // not an entity we know: consume only the '&'
          next = i + 1;
          break;
      }
    }
  }
  this.next = next;
  this.ch = ch;
  return ch;
};
+// some string utilities
/** true iff ch is an ASCII letter, a-z or A-Z.
 * @param ch a single-character string.
 */
function PR_isWordChar(ch) {
  return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
}
/** true iff ch can start an identifier: an ASCII letter, '_', '$' or '@'.
 * @param ch a single-character string.
 */
function PR_isIdentifierStart(ch) {
  return PR_isWordChar(ch) || ch === '_' || ch === '$' || ch === '@';
}
/** true iff ch can appear inside an identifier: anything that can start one,
 * or an ASCII digit.
 * @param ch a single-character string.
 */
function PR_isIdentifierPart(ch) {
  return PR_isIdentifierStart(ch) || PR_isDigitChar(ch);
}
/** true iff ch is a tab, space, carriage return or newline.
 * NOTE: this is a substring test, so the empty string (what charAt returns
 * past the end of a string) also yields true; callers must bound their loops
 * by index rather than rely on this to stop at end of input.
 * @param ch a single-character string.
 */
function PR_isSpaceChar(ch) {
  return "\t \r\n".indexOf(ch) >= 0;
}
/** true iff ch is an ASCII digit, 0-9.
 * @param ch a single-character string.
 */
function PR_isDigitChar(ch) {
  return ch >= '0' && ch <= '9';
}
/** strips leading and trailing whitespace, as defined by PR_isSpaceChar
 * (tab, space, CR, LF only), from s.
 * @param s a String.
 * @return s without the leading and trailing run of whitespace; the empty
 *     string when s is empty or all whitespace.
 */
function PR_trim(s) {
  var first = 0;
  var last = s.length - 1;
  // Both loops are bounded by index, so they stop at the ends of s.
  while (first <= last && PR_isSpaceChar(s.charAt(first))) { ++first; }
  while (last > first && PR_isSpaceChar(s.charAt(last))) { --last; }
  return s.substring(first, last + 1);
}
/** true iff s begins with prefix. Every string begins with the empty prefix.
 * @param s a String.
 * @param prefix a String.
 */
function PR_startsWith(s, prefix) {
  return s.length >= prefix.length &&
    prefix === s.substring(0, prefix.length);
}
/** true iff s ends with suffix. Every string ends with the empty suffix.
 * @param s a String.
 * @param suffix a String.
 */
function PR_endsWith(s, suffix) {
  return s.length >= suffix.length &&
    suffix === s.substring(s.length - suffix.length, s.length);
}
/** true iff prefix matches the first prefix.length characters in chars[0:len].
 * @param chars an indexable sequence of single characters.
 * @param len the number of valid entries at the start of chars.
 * @param prefix the String to look for.
 * @return false when fewer than prefix.length entries are valid.
 * @private
 */
function PR_prefixMatch(chars, len, prefix) {
  var n = prefix.length;
  if (len < n) { return false; }
  for (var i = 0; i < n; ++i) {
    if (prefix.charAt(i) !== chars[i]) { return false; }
  }
  return true;
}
/** like textToHtml but escapes double quotes to be attribute safe.
 * @param str the raw attribute value.
 * @return str with & < > " and non-breaking spaces replaced by entities.
 */
function PR_attribToHtml(str) {
  // '&' must be escaped first so the entities produced below are not
  // escaped a second time.
  return str.replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/\"/g, '&quot;')
    // global flag: every non-breaking space is replaced, not just the first
    .replace(/\xa0/g, '&nbsp;');
}
/** escapes html special characters to html.
 * @param str plain text.
 * @return str with & < > and non-breaking spaces replaced by entities.
 *     Double quotes are left alone; use PR_attribToHtml for attribute values.
 */
function PR_textToHtml(str) {
  // '&' must be escaped first so the entities produced below are not
  // escaped a second time.
  return str.replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/\xa0/g, '&nbsp;');
}
/** is the given node's innerHTML normally unescaped?
 * @param node a DOM element.
 * @return true only for XMP elements.
 */
function PR_isRawContent(node) {
  return 'XMP' === node.tagName;
}
+var PR_innerHtmlWorks = null; // null until PR_getInnerHtml runs its one-time probe
+function PR_getInnerHtml(node) {
+ // innerHTML is hopelessly broken in Safari 2.0.4 when the content is an html
+ // description of well formed XML and the containing tag is a PRE tag, so that
+ // case is probed once (result cached in PR_innerHtmlWorks) and innerHTML is emulated. NOTE(review): the text from the probe's regex to the start of the child loop is missing in this view.
+ if (null == PR_innerHtmlWorks) {
+ var testNode = document.createElement('PRE');
+ testNode.appendChild(
+ document.createTextNode('\n'));
+ PR_innerHtmlWorks = !/');
+ for (var child = node.firstChild; child; child = child.nextSibling) {
+ PR_normalizedHtml(child, out);
+ }
+ if (node.firstChild || !/^(?:br|link|img)$/.test(name)) {
+ out.push('<\/', name, '>');
+ }
+ break;
+ case 2: // an attribute node: its value is escaped with PR_attribToHtml and double-quoted
+ out.push(, '="', PR_attribToHtml(node.value), '"');
+ break;
+ case 3: case 4: // text and CDATA nodes: the node value is escaped with PR_textToHtml
+ out.push(PR_textToHtml(node.nodeValue));
+ break;
+ }
/** split markup into chunks of html tags (style null) and
 * plain text (style {@link #PR_PLAIN}).
 *
 * A tag is '<', optionally '/', then an ASCII letter, and runs up to the next
 * '>'. A '<' that is not followed that way stays part of the plain text, and
 * so does a tag that is never closed.
 *
 * @param s a String of html.
 * @return an Array of PR_Tokens of style PR_PLAIN and null. No token has
 *     empty text; the result is empty when s is empty.
 * @private
 */
function PR_chunkify(s) {
  var chunks = [];
  // FSM states: 0 = in text, 1 = just saw '<', 2 = saw '</', 3 = inside a tag
  var state = 0;
  var start = 0;   // start of the text not yet emitted
  var pos = -1;    // index of the '<' of the tag being scanned
  for (var i = 0, n = s.length; i < n; ++i) {
    var ch = s.charAt(i);
    switch (state) {
      case 0:
        if ('<' === ch) { state = 1; }
        break;
      case 1:
        // the previous character was the '<'
        pos = i - 1;
        if ('/' === ch) { state = 2; }
        else if (PR_isWordChar(ch)) { state = 3; }
        else if ('<' === ch) { state = 1; }
        else { state = 0; }
        break;
      case 2:
        if (PR_isWordChar(ch)) { state = 3; }
        else if ('<' === ch) { state = 1; }
        else { state = 0; }
        break;
      case 3:
        if ('>' === ch) {
          // flush any text before the tag, then the tag itself
          if (pos > start) {
            chunks.push(new PR_Token(s.substring(start, pos), PR_PLAIN));
          }
          chunks.push(new PR_Token(s.substring(pos, i + 1), null));
          start = i + 1;
          pos = -1;
          state = 0;
        }
        break;
    }
  }
  // trailing text, including any unterminated tag
  if (s.length > start) {
    chunks.push(new PR_Token(s.substring(start, s.length), PR_PLAIN));
  }
  return chunks;
}
/** splits chunks around entities.
 *
 * Chunks whose style is not PR_PLAIN are passed through untouched. In a plain
 * chunk, each run that starts with '&' followed by '#' or an ASCII letter and
 * ends at the next ';' becomes its own token with a null style; the text
 * around it keeps the chunk's style. A plain chunk that contains no entity is
 * passed through as the same object.
 *
 * @param chunks an Array of PR_Tokens.
 * @return a new Array of PR_Tokens.
 * @private
 */
function PR_splitEntities(chunks) {
  var chunksOut = [];
  for (var ci = 0, nc = chunks.length; ci < nc; ++ci) {
    var chunk = chunks[ci];
    if (PR_PLAIN !== chunk.style) {
      chunksOut.push(chunk);
      continue;
    }
    var s = chunk.token;
    var pos = 0;      // start of the text of this chunk not yet emitted
    var start = -1;   // index of the '&' of the entity being scanned
    // FSM states: 0 = in text, 1 = just saw '&', 2 = inside an entity.
    // Reset for every chunk: an entity cannot span chunks, and a state
    // carried over from a chunk ending in '&' would make the start of the
    // next chunk look like an entity.
    var state = 0;
    for (var i = 0; i < s.length; ++i) {
      var ch = s.charAt(i);
      switch (state) {
        case 0:
          if ('&' === ch) { state = 1; }
          break;
        case 1:
          if ('#' === ch || PR_isWordChar(ch)) {
            start = i - 1;
            state = 2;
          } else if ('&' !== ch) {
            // a second '&' may itself start an entity, so only other
            // characters send us back to plain text
            state = 0;
          }
          break;
        case 2:
          if (';' === ch) {
            if (start > pos) {
              chunksOut.push(
                new PR_Token(s.substring(pos, start), chunk.style));
            }
            chunksOut.push(new PR_Token(s.substring(start, i + 1), null));
            pos = i + 1;
            state = 0;
          }
          break;
      }
    }
    if (s.length > pos) {
      // reuse the original chunk object when nothing was split off it
      chunksOut.push(pos ?
        new PR_Token(s.substring(pos, s.length), chunk.style) :
        chunk);
    }
  }
  return chunksOut;
}
/** walk the tokenEnds list and the chunk list in parallel to generate a list
 * of split tokens.
 *
 * Token ends are positions in "absolute space", i.e. in the string formed by
 * concatenating the text of all chunks. Output tokens never span a chunk
 * boundary. A piece that runs to the end of a chunk keeps a null style if
 * its chunk had one, and otherwise takes the style of the token end that
 * covers it.
 *
 * @param chunks an Array of PR_Tokens.
 * @param tokenEnds an Array of PR_TokenEnds in increasing order of end.
 * @return an Array of PR_Tokens. Token ends that lie beyond the end of the
 *     chunk text are ignored.
 * @private
 */
function PR_splitChunks(chunks, tokenEnds) {
  var tokens = [];  // the output
  var ci = 0;  // index into chunks
  // position of beginning of amount written so far in absolute space.
  var posAbs = 0;
  // position of amount written so far in chunk space
  var posChunk = 0;
  // current chunk; starts as an empty sentinel so the first iteration
  // immediately advances to chunks[0]
  var chunk = new PR_Token('', null);
  for (var ei = 0, ne = tokenEnds.length; ei < ne; ++ei) {
    var tokenEnd = tokenEnds[ei];
    var end = tokenEnd.end;
    var tokLen = end - posAbs;
    var remainingInChunk = chunk.token.length - posChunk;
    // consume whole chunk remainders while the token reaches past them
    while (remainingInChunk <= tokLen) {
      if (remainingInChunk > 0) {
        tokens.push(
          new PR_Token(chunk.token.substring(posChunk, chunk.token.length),
                       null == chunk.style ? null : tokenEnd.style));
      }
      posAbs += remainingInChunk;
      posChunk = 0;
      // All text has been emitted. Stopping here prevents the loop from
      // spinning forever on a zero-length remainder and from re-emitting the
      // last chunk for token ends that lie past the end of the text.
      if (ci >= chunks.length) { return tokens; }
      chunk = chunks[ci++];
      tokLen = end - posAbs;
      remainingInChunk = chunk.token.length - posChunk;
    }
    // the token ends strictly inside the current chunk
    // NOTE(review): unlike the branch above, this piece takes tokenEnd.style
    // even when chunk.style is null -- confirm that is intended.
    if (tokLen) {
      tokens.push(
        new PR_Token(chunk.token.substring(posChunk, posChunk + tokLen),
                     tokenEnd.style));
      posAbs += tokLen;
      posChunk += tokLen;
    }
  }
  return tokens;
}
+/** splits markup tokens into declarations, tags, and source chunks.
+ * @private
+ */
+function PR_splitMarkup(chunks) {
+ // A state machine to split out declarations, tags, etc.
+ // This state machine deals with absolute space in the text, indexed by k,
+ // and position in the current chunk, indexed by pos and tokenStart to
+ // generate a list of the ends of tokens.
+ // Absolute space is calculated by considering the chunks as appended into
+ // one big string, as they were before being split.
+ // Known failure cases
+ // Server side scripting sections such as ...?> in attributes.
+ // i.e.
+ // Handling this would require a stack, and we don't use PHP.
+ // The output: a list of pairs of PR_TokenEnd instances
+ var tokenEnds = new Array();
+ var state = 0; // FSM state variable
+ var k = 0; // position in absolute space of the start of the current chunk
+ var tokenStart = -1; // the start of the current token
+ // Try to find a closing tag for any open