Page Speed Optimization Libraries
1.3.25.1
|
00001 /* 00002 * Copyright 2010 Google Inc. 00003 * 00004 * Licensed under the Apache License, Version 2.0 (the "License"); 00005 * you may not use this file except in compliance with the License. 00006 * You may obtain a copy of the License at 00007 * 00008 * http:///www.apache.org/licenses/LICENSE-2.0 00009 * 00010 * Unless required by applicable law or agreed to in writing, software 00011 * distributed under the License is distributed on an "AS IS" BASIS, 00012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00013 * See the License for the specific language governing permissions and 00014 * limitations under the License. 00015 */ 00016 00018 00019 #ifndef NET_INSTAWEB_HTMLPARSE_HTML_LEXER_H_ 00020 #define NET_INSTAWEB_HTMLPARSE_HTML_LEXER_H_ 00021 00022 #include <vector> 00023 #include "net/instaweb/htmlparse/public/html_name.h" 00024 #include "net/instaweb/htmlparse/public/doctype.h" 00025 #include "net/instaweb/htmlparse/public/html_element.h" 00026 #include "net/instaweb/http/public/content_type.h" 00027 #include "net/instaweb/util/public/basictypes.h" 00028 #include "net/instaweb/util/public/printf_format.h" 00029 #include "net/instaweb/util/public/string.h" 00030 #include "net/instaweb/util/public/string_util.h" 00031 00032 namespace net_instaweb { 00033 00034 class HtmlParse; 00035 00044 class HtmlLexer { 00045 public: 00046 explicit HtmlLexer(HtmlParse* html_parse); 00047 ~HtmlLexer(); 00048 00050 void StartParse(const StringPiece& id, const ContentType& content_type); 00051 00054 void Parse(const char* text, int size); 00055 00057 void FinishParse(); 00058 00060 bool IsImplicitlyClosedTag(HtmlName::Keyword keyword) const; 00061 00063 bool TagAllowsBriefTermination(HtmlName::Keyword keyword) const; 00064 00066 bool IsOptionallyClosedTag(HtmlName::Keyword keyword) const; 00067 00069 void DebugPrintStack(); 00070 00073 HtmlElement* Parent() const; 00074 00077 const DocType& doctype() const { return doctype_; } 00078 00080 void set_size_limit(int64 x) { size_limit_ = x; } 00081 00084 bool size_limit_exceeded() const { return size_limit_exceeded_; } 00085 00086 private: 00088 inline void EvalStart(char c); 00089 inline void EvalTag(char c); 00090 inline void EvalTagOpen(char c); 00091 inline void EvalTagClose(char c); 00092 inline void EvalTagCloseTerminate(char c); 00093 inline void EvalTagBriefClose(char c); 00094 inline void EvalTagBriefCloseAttr(char c); 00095 inline void EvalCommentStart1(char c); 00096 inline void EvalCommentStart2(char c); 00097 inline void EvalCommentBody(char c); 00098 inline void EvalCommentEnd1(char c); 00099 inline void EvalCommentEnd2(char c); 00100 inline void EvalCdataStart1(char c); 00101 inline void EvalCdataStart2(char c); 00102 inline void EvalCdataStart3(char c); 00103 inline void EvalCdataStart4(char c); 00104 inline void EvalCdataStart5(char c); 00105 inline void EvalCdataStart6(char c); 00106 inline void EvalCdataBody(char c); 00107 inline void EvalCdataEnd1(char c); 00108 inline void EvalCdataEnd2(char c); 00109 inline void EvalAttribute(char c); 00110 inline void EvalAttrName(char c); 00111 inline void EvalAttrEq(char c); 00112 inline void EvalAttrVal(char c); 00113 inline void EvalAttrValSq(char c); 00114 inline void EvalAttrValDq(char c); 00115 inline void EvalLiteralTag(char c); 00116 inline void EvalDirective(char c); 00117 00120 void MakeElement(); 00121 00122 void MakeAttribute(bool has_value); 00123 void FinishAttribute(char c, bool has_value, bool brief_close); 00124 00125 void EmitCdata(); 00126 void EmitComment(); 00127 void EmitLiteral(); 00128 void EmitTagOpen(bool allow_implicit_close); 00129 void EmitTagClose(HtmlElement::CloseStyle close_style); 00130 void EmitTagBriefClose(); 00131 void EmitDirective(); 00132 void Restart(char c); 00133 00135 void SyntaxError(const char* format, ...) INSTAWEB_PRINTF_FORMAT(2, 3); 00136 00147 HtmlElement* PopElementMatchingTag(const StringPiece& tag); 00148 00149 HtmlElement* PopElement(); 00150 void CloseElement(HtmlElement* element, HtmlElement::CloseStyle close_style); 00151 00155 static inline bool IsI18nChar(char c) {return (((c) & 0x80) != 0); } 00156 00158 static inline bool IsLegalTagFirstChar(char c); 00160 static inline bool IsLegalTagChar(char c); 00161 00163 static inline bool IsLegalAttrNameChar(char c); 00164 00171 enum State { 00172 START, 00173 TAG, 00174 TAG_CLOSE, 00175 TAG_CLOSE_TERMINATE, 00176 TAG_OPEN, 00177 TAG_BRIEF_CLOSE, 00178 TAG_BRIEF_CLOSE_ATTR, 00179 COMMENT_START1, 00180 COMMENT_START2, 00181 COMMENT_BODY, 00182 COMMENT_END1, 00183 COMMENT_END2, 00184 CDATA_START1, 00185 CDATA_START2, 00186 CDATA_START3, 00187 CDATA_START4, 00188 CDATA_START5, 00189 CDATA_START6, 00190 CDATA_BODY, 00191 CDATA_END1, 00192 CDATA_END2, 00193 TAG_ATTRIBUTE, 00194 TAG_ATTR_NAME, 00195 TAG_ATTR_NAME_SPACE, 00196 TAG_ATTR_EQ, 00197 TAG_ATTR_VAL, 00198 TAG_ATTR_VALDQ, 00199 TAG_ATTR_VALSQ, 00200 LITERAL_TAG, 00201 DIRECTIVE 00202 }; 00203 00204 HtmlParse* html_parse_; 00205 State state_; 00206 GoogleString token_; 00207 GoogleString literal_; 00208 GoogleString attr_name_; 00209 GoogleString attr_value_; 00210 HtmlElement::QuoteStyle attr_quote_; 00211 bool has_attr_value_; 00212 HtmlElement* element_; 00213 int line_; 00214 int tag_start_line_; 00215 GoogleString id_; 00216 GoogleString literal_close_; 00217 00218 ContentType content_type_; 00219 DocType doctype_; 00220 00221 std::vector<HtmlElement*> element_stack_; 00222 00225 bool size_limit_exceeded_; 00228 bool skip_parsing_; 00229 int64 num_bytes_parsed_; 00230 int64 size_limit_; 00231 00232 DISALLOW_COPY_AND_ASSIGN(HtmlLexer); 00233 }; 00234 00235 } 00236 00237 #endif ///< NET_INSTAWEB_HTMLPARSE_HTML_LEXER_H_