Page Speed Optimization Libraries  1.4.26.1
net/instaweb/htmlparse/html_lexer.h
Go to the documentation of this file.
00001 /*
00002  * Copyright 2010 Google Inc.
00003  *
00004  * Licensed under the Apache License, Version 2.0 (the "License");
00005  * you may not use this file except in compliance with the License.
00006  * You may obtain a copy of the License at
00007  *
00008  *      http://www.apache.org/licenses/LICENSE-2.0
00009  *
00010  * Unless required by applicable law or agreed to in writing, software
00011  * distributed under the License is distributed on an "AS IS" BASIS,
00012  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00013  * See the License for the specific language governing permissions and
00014  * limitations under the License.
00015  */
00016 
00018 
00019 #ifndef NET_INSTAWEB_HTMLPARSE_HTML_LEXER_H_
00020 #define NET_INSTAWEB_HTMLPARSE_HTML_LEXER_H_
00021 
00022 #include <vector>
00023 #include "net/instaweb/htmlparse/public/html_name.h"
00024 #include "net/instaweb/htmlparse/public/doctype.h"
00025 #include "net/instaweb/htmlparse/public/html_element.h"
00026 #include "net/instaweb/http/public/content_type.h"
00027 #include "net/instaweb/util/public/basictypes.h"
00028 #include "net/instaweb/util/public/printf_format.h"
00029 #include "net/instaweb/util/public/string.h"
00030 #include "net/instaweb/util/public/string_util.h"
00031 
00032 namespace net_instaweb {
00033 
00034 class HtmlParse;
00035 
00044 class HtmlLexer {
        // Character-driven HTML lexer.  The lexing position is tracked by the
        // private State enum, and one Eval* handler is declared per state.
        //
        // Only declarations are visible in this header.  Any note below about
        // what an out-of-line method does is inferred from its name and
        // signature and is marked as such -- confirm against the .cc file.
00045  public:
        // Constructs a lexer associated with `html_parse`.
        // NOTE(review): presumably kept in html_parse_ and not owned by the
        // lexer -- confirm in the implementation.
00046   explicit HtmlLexer(HtmlParse* html_parse);
00047   ~HtmlLexer();
00048 
        // Presumably begins a new parse session for a document identified by
        // `id` with the given content type (cf. the id_ and content_type_
        // members) -- TODO confirm.
00050   void StartParse(const StringPiece& id, const ContentType& content_type);
00051 
        // Feeds `size` bytes starting at `text` to the lexer.
        // NOTE(review): presumably callable repeatedly between StartParse()
        // and FinishParse() to lex a document in chunks -- confirm.
00054   void Parse(const char* text, int size);
00055 
        // Presumably completes the session begun by StartParse() -- TODO
        // confirm what happens to any buffered, unterminated input.
00057   void FinishParse();
00058 
        // Keyword predicate; judging by the name, reports whether the tag is
        // closed implicitly (has no explicit end tag) -- confirm.
00060   bool IsImplicitlyClosedTag(HtmlName::Keyword keyword) const;
00061 
        // Keyword predicate; judging by the name and the LITERAL_TAG state,
        // reports whether the tag's contents are lexed as literal text rather
        // than as markup -- confirm.  Static: needs no lexer instance.
00065   static bool IsLiteralTag(HtmlName::Keyword keyword);
00066 
        // Keyword predicate; judging by the name, identifies tags that are
        // treated as literal only in some circumstances -- confirm which.
        // Static: needs no lexer instance.
00071   static bool IsSometimesLiteralTag(HtmlName::Keyword keyword);
00072 
        // Keyword predicate; judging by the name and the TAG_BRIEF_CLOSE
        // state, reports whether the tag may be written in the brief
        // self-closing form -- confirm.
00074   bool TagAllowsBriefTermination(HtmlName::Keyword keyword) const;
00075 
        // Keyword predicate; judging by the name, reports whether the tag's
        // end tag may be omitted -- confirm.
00077   bool IsOptionallyClosedTag(HtmlName::Keyword keyword) const;
00078 
        // Debugging aid; presumably prints element_stack_ -- destination of
        // the output is not visible here.
00080   void DebugPrintStack();
00081 
        // Returns an HtmlElement pointer; presumably the innermost open
        // element on element_stack_.  NOTE(review): the result for an empty
        // stack is not visible here -- callers should check the .cc before
        // dereferencing.
00084   HtmlElement* Parent() const;
00085 
        // Read-only access to the doctype_ member.
00088   const DocType& doctype() const { return doctype_; }
00089 
        // Stores `x` in size_limit_.  How the limit is enforced is not visible
        // in this header; presumably it bounds num_bytes_parsed_ -- confirm,
        // including how a zero or negative value is treated.
00091   void set_size_limit(int64 x) { size_limit_ = x; }
00092 
        // Returns the size_limit_exceeded_ flag.
00095   bool size_limit_exceeded() const { return size_limit_exceeded_; }
00096 
00097  private:
        // Per-state character handlers.  Their names mirror the State
        // enumerators declared further down; TAG_ATTR_NAME_SPACE is the one
        // state with no dedicated handler declared here.  Each takes the
        // character `c` to process (presumably the one most recently read).
00099   inline void EvalStart(char c);
00100   inline void EvalTag(char c);
00101   inline void EvalTagOpen(char c);
00102   inline void EvalTagClose(char c);
00103   inline void EvalTagCloseTerminate(char c);
00104   inline void EvalTagBriefClose(char c);
00105   inline void EvalTagBriefCloseAttr(char c);
00106   inline void EvalCommentStart1(char c);
00107   inline void EvalCommentStart2(char c);
00108   inline void EvalCommentBody(char c);
00109   inline void EvalCommentEnd1(char c);
00110   inline void EvalCommentEnd2(char c);
00111   inline void EvalCdataStart1(char c);
00112   inline void EvalCdataStart2(char c);
00113   inline void EvalCdataStart3(char c);
00114   inline void EvalCdataStart4(char c);
00115   inline void EvalCdataStart5(char c);
00116   inline void EvalCdataStart6(char c);
00117   inline void EvalCdataBody(char c);
00118   inline void EvalCdataEnd1(char c);
00119   inline void EvalCdataEnd2(char c);
00120   inline void EvalAttribute(char c);
00121   inline void EvalAttrName(char c);
00122   inline void EvalAttrEq(char c);
00123   inline void EvalAttrVal(char c);
00124   inline void EvalAttrValSq(char c);
00125   inline void EvalAttrValDq(char c);
00126   inline void EvalLiteralTag(char c);
00127   inline void EvalDirective(char c);
00128 
        // Presumably creates the element for the tag being lexed and records
        // it in element_ -- confirm.
00131   void MakeElement();
00132 
        // Attribute construction helpers.  `has_value` presumably separates an
        // attribute written with a value from a bare attribute name (cf. the
        // has_attr_value_ member); `brief_close` presumably signals that the
        // tag is ending in the brief self-closing form -- confirm.
00133   void MakeAttribute(bool has_value);
00134   void FinishAttribute(char c, bool has_value, bool brief_close);
00135 
        // Emit* helpers: presumably hand the completed token of the named kind
        // on to html_parse_ -- confirm.  Restart(c) presumably returns the
        // state machine to START, beginning again with `c` -- confirm.
00136   void EmitCdata();
00137   void EmitComment();
00138   void EmitLiteral();
00139   void EmitTagOpen(bool allow_implicit_close); 
00140   void EmitTagClose(HtmlElement::CloseStyle close_style);
00141   void EmitTagBriefClose();
00142   void EmitDirective();
00143   void Restart(char c);
00144 
        // Reports a syntax problem using a printf-style `format` string plus
        // variadic arguments.  INSTAWEB_PRINTF_FORMAT(2, 3) presumably enables
        // compiler format checking; the indices would count the implicit
        // `this` as argument 1 -- confirm against the macro definition.
00146   void SyntaxError(const char* format, ...) INSTAWEB_PRINTF_FORMAT(2, 3);
00147 
        // Presumably searches element_stack_ for an open element named `tag`
        // and pops it.  NOTE(review): the result when nothing matches, and
        // the fate of elements above the match, are not visible here.
00158   HtmlElement* PopElementMatchingTag(const StringPiece& tag);
00159 
        // Stack helpers; presumably PopElement() removes and returns the top
        // of element_stack_, and CloseElement() closes `element` with the
        // given close style -- confirm.
00160   HtmlElement* PopElement();
00161   void CloseElement(HtmlElement* element, HtmlElement::CloseStyle close_style);
00162 
        // True when the high bit (0x80) of `c` is set, i.e. `c` lies outside
        // the 7-bit ASCII range.  Depends only on its argument.
00166   static inline bool IsI18nChar(char c) {return (((c) & 0x80) != 0); }
00167 
        // Character-class predicates for tag names (first character versus
        // any character) and for attribute names.  The accepted character
        // sets are defined out of line and are not visible here.
00169   static inline bool IsLegalTagFirstChar(char c);
00171   static inline bool IsLegalTagChar(char c);
00172 
00174   static inline bool IsLegalAttrNameChar(char c);
00175 
        // Lexer states.  NOTE(review): the grouping below is read off the
        // enumerator names -- confirm against the Eval* implementations.
        //   START                   presumably plain text, outside any markup
        //   TAG ... TAG_BRIEF_CLOSE_ATTR
        //                           presumably positions within an open, close
        //                           or brief-close tag
        //   COMMENT_START1 ... COMMENT_END2
        //                           presumably opener, body and terminator of
        //                           a comment
        //   CDATA_START1 ... CDATA_END2
        //                           presumably opener (six steps), body and
        //                           terminator of a CDATA section
        //   TAG_ATTRIBUTE ... TAG_ATTR_VALSQ
        //                           presumably attribute name, '=' and value;
        //                           DQ/SQ presumably double-/single-quoted
        //   LITERAL_TAG             presumably inside a literal tag's contents
        //   DIRECTIVE               presumably inside a directive
        // Enumerator order fixes the underlying values; keep it stable unless
        // the implementation is known not to depend on it.
00182   enum State {
00183     START,
00184     TAG,                   
00185     TAG_CLOSE,             
00186     TAG_CLOSE_TERMINATE,   
00187     TAG_OPEN,              
00188     TAG_BRIEF_CLOSE,       
00189     TAG_BRIEF_CLOSE_ATTR,  
00190     COMMENT_START1,        
00191     COMMENT_START2,        
00192     COMMENT_BODY,          
00193     COMMENT_END1,          
00194     COMMENT_END2,          
00195     CDATA_START1,          
00196     CDATA_START2,          
00197     CDATA_START3,          
00198     CDATA_START4,          
00199     CDATA_START5,          
00200     CDATA_START6,          
00201     CDATA_BODY,            
00202     CDATA_END1,            
00203     CDATA_END2,            
00204     TAG_ATTRIBUTE,         
00205     TAG_ATTR_NAME,         
00206     TAG_ATTR_NAME_SPACE,   
00207     TAG_ATTR_EQ,           
00208     TAG_ATTR_VAL,          
00209     TAG_ATTR_VALDQ,        
00210     TAG_ATTR_VALSQ,        
00211     LITERAL_TAG,           
00212     DIRECTIVE              
00213   };
00214 
        // Data members.  None has an in-class initializer, so all initial
        // values must come from the constructor (defined out of line).
        // NOTE(review): the per-field notes are inferred from the field names
        // -- confirm against the implementation.
00215   HtmlParse* html_parse_;
00216   State state_;
00217   GoogleString token_; // presumably the tag name / token text being accumulated
00218   GoogleString literal_; // presumably raw input text accumulated for pass-through
00219   GoogleString attr_name_; // presumably the attribute name being accumulated
00220   GoogleString attr_value_; // presumably the attribute value being accumulated
00221   HtmlElement::QuoteStyle attr_quote_; // presumably the quoting of the current attribute value
00222   bool has_attr_value_; // presumably whether the current attribute carries a value
00223   HtmlElement* element_; // presumably the element currently being built
00224   int line_;
00225   int tag_start_line_; // presumably the value of line_ where the current tag began
00226   GoogleString id_;
00227   GoogleString literal_close_; // presumably the end tag that terminates the current literal section
00228 
00229   ContentType content_type_;
00230   DocType doctype_;
00231 
        // Holds raw HtmlElement pointers; ownership is not expressed by the
        // type and cannot be determined from this header.
00232   std::vector<HtmlElement*> element_stack_;
00233 
        // Flag returned by size_limit_exceeded().
00236   bool size_limit_exceeded_;
        // NOTE(review): presumably set when further input should be ignored
        // (for example once the size limit has been hit) -- confirm.
00239   bool skip_parsing_;
        // Byte counter and the limit stored by set_size_limit().
00240   int64 num_bytes_parsed_;
00241   int64 size_limit_;
00242 
        // Project macro; by its name it presumably disables copy construction
        // and copy assignment for HtmlLexer.
00243   DISALLOW_COPY_AND_ASSIGN(HtmlLexer);
00244 };
00245 
00246 }  
00247 
00248 #endif  // NET_INSTAWEB_HTMLPARSE_HTML_LEXER_H_
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines