Page Speed Optimization Libraries
1.4.26.1
|
00001 /* 00002 * Copyright 2010 Google Inc. 00003 * 00004 * Licensed under the Apache License, Version 2.0 (the "License"); 00005 * you may not use this file except in compliance with the License. 00006 * You may obtain a copy of the License at 00007 * 00008 * http:///www.apache.org/licenses/LICENSE-2.0 00009 * 00010 * Unless required by applicable law or agreed to in writing, software 00011 * distributed under the License is distributed on an "AS IS" BASIS, 00012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00013 * See the License for the specific language governing permissions and 00014 * limitations under the License. 00015 */ 00016 00018 00019 #ifndef NET_INSTAWEB_HTMLPARSE_HTML_LEXER_H_ 00020 #define NET_INSTAWEB_HTMLPARSE_HTML_LEXER_H_ 00021 00022 #include <vector> 00023 #include "net/instaweb/htmlparse/public/html_name.h" 00024 #include "net/instaweb/htmlparse/public/doctype.h" 00025 #include "net/instaweb/htmlparse/public/html_element.h" 00026 #include "net/instaweb/http/public/content_type.h" 00027 #include "net/instaweb/util/public/basictypes.h" 00028 #include "net/instaweb/util/public/printf_format.h" 00029 #include "net/instaweb/util/public/string.h" 00030 #include "net/instaweb/util/public/string_util.h" 00031 00032 namespace net_instaweb { 00033 00034 class HtmlParse; 00035 00044 class HtmlLexer { 00045 public: 00046 explicit HtmlLexer(HtmlParse* html_parse); 00047 ~HtmlLexer(); 00048 00050 void StartParse(const StringPiece& id, const ContentType& content_type); 00051 00054 void Parse(const char* text, int size); 00055 00057 void FinishParse(); 00058 00060 bool IsImplicitlyClosedTag(HtmlName::Keyword keyword) const; 00061 00065 static bool IsLiteralTag(HtmlName::Keyword keyword); 00066 00071 static bool IsSometimesLiteralTag(HtmlName::Keyword keyword); 00072 00074 bool TagAllowsBriefTermination(HtmlName::Keyword keyword) const; 00075 00077 bool IsOptionallyClosedTag(HtmlName::Keyword keyword) const; 00078 00080 void DebugPrintStack(); 00081 00084 HtmlElement* Parent() const; 00085 00088 const DocType& doctype() const { return doctype_; } 00089 00091 void set_size_limit(int64 x) { size_limit_ = x; } 00092 00095 bool size_limit_exceeded() const { return size_limit_exceeded_; } 00096 00097 private: 00099 inline void EvalStart(char c); 00100 inline void EvalTag(char c); 00101 inline void EvalTagOpen(char c); 00102 inline void EvalTagClose(char c); 00103 inline void EvalTagCloseTerminate(char c); 00104 inline void EvalTagBriefClose(char c); 00105 inline void EvalTagBriefCloseAttr(char c); 00106 inline void EvalCommentStart1(char c); 00107 inline void EvalCommentStart2(char c); 00108 inline void EvalCommentBody(char c); 00109 inline void EvalCommentEnd1(char c); 00110 inline void EvalCommentEnd2(char c); 00111 inline void EvalCdataStart1(char c); 00112 inline void EvalCdataStart2(char c); 00113 inline void EvalCdataStart3(char c); 00114 inline void EvalCdataStart4(char c); 00115 inline void EvalCdataStart5(char c); 00116 inline void EvalCdataStart6(char c); 00117 inline void EvalCdataBody(char c); 00118 inline void EvalCdataEnd1(char c); 00119 inline void EvalCdataEnd2(char c); 00120 inline void EvalAttribute(char c); 00121 inline void EvalAttrName(char c); 00122 inline void EvalAttrEq(char c); 00123 inline void EvalAttrVal(char c); 00124 inline void EvalAttrValSq(char c); 00125 inline void EvalAttrValDq(char c); 00126 inline void EvalLiteralTag(char c); 00127 inline void EvalDirective(char c); 00128 00131 void MakeElement(); 00132 00133 void MakeAttribute(bool has_value); 00134 void FinishAttribute(char c, bool has_value, bool brief_close); 00135 00136 void EmitCdata(); 00137 void EmitComment(); 00138 void EmitLiteral(); 00139 void EmitTagOpen(bool allow_implicit_close); 00140 void EmitTagClose(HtmlElement::CloseStyle close_style); 00141 void EmitTagBriefClose(); 00142 void EmitDirective(); 00143 void Restart(char c); 00144 00146 void SyntaxError(const char* format, ...) INSTAWEB_PRINTF_FORMAT(2, 3); 00147 00158 HtmlElement* PopElementMatchingTag(const StringPiece& tag); 00159 00160 HtmlElement* PopElement(); 00161 void CloseElement(HtmlElement* element, HtmlElement::CloseStyle close_style); 00162 00166 static inline bool IsI18nChar(char c) {return (((c) & 0x80) != 0); } 00167 00169 static inline bool IsLegalTagFirstChar(char c); 00171 static inline bool IsLegalTagChar(char c); 00172 00174 static inline bool IsLegalAttrNameChar(char c); 00175 00182 enum State { 00183 START, 00184 TAG, 00185 TAG_CLOSE, 00186 TAG_CLOSE_TERMINATE, 00187 TAG_OPEN, 00188 TAG_BRIEF_CLOSE, 00189 TAG_BRIEF_CLOSE_ATTR, 00190 COMMENT_START1, 00191 COMMENT_START2, 00192 COMMENT_BODY, 00193 COMMENT_END1, 00194 COMMENT_END2, 00195 CDATA_START1, 00196 CDATA_START2, 00197 CDATA_START3, 00198 CDATA_START4, 00199 CDATA_START5, 00200 CDATA_START6, 00201 CDATA_BODY, 00202 CDATA_END1, 00203 CDATA_END2, 00204 TAG_ATTRIBUTE, 00205 TAG_ATTR_NAME, 00206 TAG_ATTR_NAME_SPACE, 00207 TAG_ATTR_EQ, 00208 TAG_ATTR_VAL, 00209 TAG_ATTR_VALDQ, 00210 TAG_ATTR_VALSQ, 00211 LITERAL_TAG, 00212 DIRECTIVE 00213 }; 00214 00215 HtmlParse* html_parse_; 00216 State state_; 00217 GoogleString token_; 00218 GoogleString literal_; 00219 GoogleString attr_name_; 00220 GoogleString attr_value_; 00221 HtmlElement::QuoteStyle attr_quote_; 00222 bool has_attr_value_; 00223 HtmlElement* element_; 00224 int line_; 00225 int tag_start_line_; 00226 GoogleString id_; 00227 GoogleString literal_close_; 00228 00229 ContentType content_type_; 00230 DocType doctype_; 00231 00232 std::vector<HtmlElement*> element_stack_; 00233 00236 bool size_limit_exceeded_; 00239 bool skip_parsing_; 00240 int64 num_bytes_parsed_; 00241 int64 size_limit_; 00242 00243 DISALLOW_COPY_AND_ASSIGN(HtmlLexer); 00244 }; 00245 00246 } 00247 00248 #endif ///< NET_INSTAWEB_HTMLPARSE_HTML_LEXER_H_