00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00018
00019 #ifndef NET_INSTAWEB_HTMLPARSE_HTML_LEXER_H_
00020 #define NET_INSTAWEB_HTMLPARSE_HTML_LEXER_H_
00021
00022 #include <vector>
00023 #include "net/instaweb/htmlparse/public/html_name.h"
00024 #include "net/instaweb/htmlparse/public/html_parser_types.h"
00025 #include "net/instaweb/htmlparse/public/doctype.h"
00026 #include "net/instaweb/htmlparse/public/html_element.h"
00027 #include "net/instaweb/http/public/content_type.h"
00028 #include "net/instaweb/util/public/basictypes.h"
00029 #include "net/instaweb/util/public/printf_format.h"
00030 #include "net/instaweb/util/public/string.h"
00031 #include "net/instaweb/util/public/string_util.h"
00032
00033 namespace net_instaweb {
00034
00035 class HtmlParse;
00036
// HtmlLexer: character-at-a-time HTML tokenizer that works on behalf of an
// HtmlParse instance (the one handed to the constructor).
//
// The public entry points suggest the call sequence StartParse(), then
// Parse() once per chunk of input, then FinishParse().  Only declarations are
// visible in this header; behavioral notes below that go beyond the
// signatures are marked as assumptions to be confirmed against the
// implementation file.
00045 class HtmlLexer {
00046 public:
      // Binds the lexer to html_parse.  Explicit, so an HtmlParse* never
      // converts to an HtmlLexer implicitly.
      // NOTE(review): presumably stored in html_parse_ and not owned -- confirm.
00047 explicit HtmlLexer(HtmlParse* html_parse);
00048 ~HtmlLexer();
00049
      // Begins a parse session for a document identified by id and described by
      // content_type.
      // NOTE(review): id_ / content_type_ look like the copies kept for the
      // session -- confirm.
00051 void StartParse(const StringPiece& id, const ContentType& content_type);
00052
      // Feeds size bytes starting at text to the lexer.  The buffer is passed
      // with an explicit length, so it does not need to be NUL-terminated.
00055 void Parse(const char* text, int size);
00056
      // Ends the parse session started by StartParse().
00058 void FinishParse();
00059
      // Tag classification predicates, keyed by the tag's HtmlName::Keyword.
      // All three are const and therefore do not modify lexer state.
      // NOTE(review): the exact tag sets are defined in the implementation.
00061 bool IsImplicitlyClosedTag(HtmlName::Keyword keyword) const;
00062
      // Presumably true for tags that may be written in brief form (<tag/>).
00064 bool TagAllowsBriefTermination(HtmlName::Keyword keyword) const;
00065
      // Presumably true for tags whose closing tag may be omitted.
00067 bool IsOptionallyClosedTag(HtmlName::Keyword keyword) const;
00068
      // Debugging aid; by its name, prints the element stack.  Output
      // destination is not visible here.
00070 void DebugPrintStack();
00071
      // Returns the current parent element.
      // NOTE(review): presumably the innermost entry of element_stack_, and
      // possibly NULL when the stack is empty -- confirm before dereferencing.
00073 HtmlElement* Parent() const;
00074
      // Read-only access to the lexer's DocType member.  The reference is to a
      // member, so it is valid only while this HtmlLexer is alive.
00077 const DocType& doctype() const { return doctype_; }
00078
00079 private:
      // Per-state character handlers: each consumes one input character c.
      // Every enumerator of State (below) has a like-named handler here except
      // TAG_ATTR_NAME_SPACE, which has none declared.
      // NOTE(review): declared inline but not defined in this header, so the
      // definitions must be visible in whichever translation unit calls them.
00081 inline void EvalStart(char c);
00082 inline void EvalTag(char c);
00083 inline void EvalTagOpen(char c);
00084 inline void EvalTagClose(char c);
00085 inline void EvalTagCloseTerminate(char c);
00086 inline void EvalTagBriefClose(char c);
00087 inline void EvalTagBriefCloseAttr(char c);
00088 inline void EvalCommentStart1(char c);
00089 inline void EvalCommentStart2(char c);
00090 inline void EvalCommentBody(char c);
00091 inline void EvalCommentEnd1(char c);
00092 inline void EvalCommentEnd2(char c);
00093 inline void EvalCdataStart1(char c);
00094 inline void EvalCdataStart2(char c);
00095 inline void EvalCdataStart3(char c);
00096 inline void EvalCdataStart4(char c);
00097 inline void EvalCdataStart5(char c);
00098 inline void EvalCdataStart6(char c);
00099 inline void EvalCdataBody(char c);
00100 inline void EvalCdataEnd1(char c);
00101 inline void EvalCdataEnd2(char c);
00102 inline void EvalAttribute(char c);
00103 inline void EvalAttrName(char c);
00104 inline void EvalAttrEq(char c);
00105 inline void EvalAttrVal(char c);
00106 inline void EvalAttrValSq(char c);
00107 inline void EvalAttrValDq(char c);
00108 inline void EvalLiteralTag(char c);
00109 inline void EvalDirective(char c);
00110
      // Element / attribute construction helpers.
      // NOTE(review): presumably MakeElement() populates element_ from token_,
      // and the attribute helpers consume attr_name_ / attr_value_ -- confirm.
00113 void MakeElement();
00114
00115 void MakeAttribute(bool has_value);
00116 void FinishAttribute(char c, bool has_value, bool brief_close);
00117
      // Emit* helpers: one per kind of lexical construct the lexer recognizes.
      // NOTE(review): where the emitted items go (presumably html_parse_) is
      // not visible in this header.
00118 void EmitCdata();
00119 void EmitComment();
00120 void EmitLiteral();
00121 void EmitTagOpen(bool allow_implicit_close);
00122 void EmitTagClose(HtmlElement::CloseStyle close_style);
00123 void EmitTagBriefClose();
00124 void EmitDirective();
00125 void Restart(char c);
00126
      // Reports a syntax problem using a printf-style format string.
      // NOTE(review): INSTAWEB_PRINTF_FORMAT presumably expands to a compiler
      // printf-format attribute; the (2, 3) indices would then count the
      // implicit `this` as argument 1, making `format` 2 and the varargs 3.
00128 void SyntaxError(const char* format, ...) INSTAWEB_PRINTF_FORMAT(2, 3);
00129
      // Element-stack manipulation.
      // NOTE(review): by its name, PopElementMatchingTag searches the stack for
      // an element named tag and pops down to it; whether it can return NULL
      // on no match, and what happens to skipped elements, must be checked in
      // the implementation.
00140 HtmlElement* PopElementMatchingTag(const StringPiece& tag);
00141
00142 HtmlElement* PopElement();
      // Closes element with the given close_style, attributing it to
      // line_number.  (The parameter name was previously misspelled
      // "line_nubmer"; a declaration's parameter name does not affect callers.)
00143 void CloseElement(HtmlElement* element, HtmlElement::CloseStyle close_style,
00144 int line_number);
00145
      // True when the high bit (0x80) of c is set, i.e. c is not 7-bit ASCII.
      // Masking with 0x80 gives the same answer whether char is signed or
      // unsigned on the target platform.
00149 static inline bool IsI18nChar(char c) {return (((c) & 0x80) != 0); }
00150
      // Character-class predicates for tag and attribute names.  Static: they
      // depend only on c, not on lexer state.
00152 static inline bool IsLegalTagFirstChar(char c);
00154 static inline bool IsLegalTagChar(char c);
00155
00157 static inline bool IsLegalAttrNameChar(char c);
00158
      // Lexer states.  Enumerator values are implicit, so they follow
      // declaration order; do not reorder if anything depends on the numeric
      // values.  The numbered *_START<n> / *_END<n> states step through the
      // successive characters of a multi-character delimiter (presumably
      // "<!--", "-->", "<![CDATA[" and "]]>" -- confirm).
00165 enum State {
00166 START,
00167 TAG,
00168 TAG_CLOSE,
00169 TAG_CLOSE_TERMINATE,
00170 TAG_OPEN,
00171 TAG_BRIEF_CLOSE,
00172 TAG_BRIEF_CLOSE_ATTR,
00173 COMMENT_START1,
00174 COMMENT_START2,
00175 COMMENT_BODY,
00176 COMMENT_END1,
00177 COMMENT_END2,
00178 CDATA_START1,
00179 CDATA_START2,
00180 CDATA_START3,
00181 CDATA_START4,
00182 CDATA_START5,
00183 CDATA_START6,
00184 CDATA_BODY,
00185 CDATA_END1,
00186 CDATA_END2,
00187 TAG_ATTRIBUTE,
00188 TAG_ATTR_NAME,
00189 TAG_ATTR_NAME_SPACE,
00190 TAG_ATTR_EQ,
00191 TAG_ATTR_VAL,
00192 TAG_ATTR_VALDQ,
00193 TAG_ATTR_VALSQ,
00194 LITERAL_TAG,
00195 DIRECTIVE
00196 };
00197
      // Data members.  Their initialization and exact use live in the
      // implementation file; the descriptions here are inferred from names and
      // types only and should be confirmed there.
      // html_parse_: the HtmlParse this lexer works for (raw pointer).
      // state_: current State of the state machine.
      // token_, literal_, attr_name_, attr_value_: text accumulated for the
      // construct currently being lexed.
      // attr_quote_, has_attr_value_: quoting / presence of the attribute value
      // currently being lexed.
      // element_: element currently under construction (raw pointer).
      // line_, tag_start_line_: line counters, presumably for diagnostics.
      // id_, content_type_: presumably the StartParse() arguments.
      // literal_close_: presumably the text that ends a LITERAL_TAG section.
00198 HtmlParse* html_parse_;
00199 State state_;
00200 GoogleString token_;
00201 GoogleString literal_;
00202 GoogleString attr_name_;
00203 GoogleString attr_value_;
00204 HtmlElement::QuoteStyle attr_quote_;
00205 bool has_attr_value_;
00206 HtmlElement* element_;
00207 int line_;
00208 int tag_start_line_;
00209 GoogleString id_;
00210 GoogleString literal_close_;
00211
00212 ContentType content_type_;
00213 DocType doctype_;
00214
      // Open elements, stored as raw pointers.
      // NOTE(review): presumably outermost first / innermost last -- confirm.
00215 std::vector<HtmlElement*> element_stack_;
00216
      // NOTE(review): project macro; by its name it disables copy construction
      // and copy assignment for HtmlLexer.
00217 DISALLOW_COPY_AND_ASSIGN(HtmlLexer);
00218 };
00219
00220 }
00221
00222 #endif  // NET_INSTAWEB_HTMLPARSE_HTML_LEXER_H_