Page Speed Optimization Libraries  1.3.25.1
net/instaweb/htmlparse/html_lexer.h
Go to the documentation of this file.
00001 /*
00002  * Copyright 2010 Google Inc.
00003  *
00004  * Licensed under the Apache License, Version 2.0 (the "License");
00005  * you may not use this file except in compliance with the License.
00006  * You may obtain a copy of the License at
00007  *
00008  *      http://www.apache.org/licenses/LICENSE-2.0
00009  *
00010  * Unless required by applicable law or agreed to in writing, software
00011  * distributed under the License is distributed on an "AS IS" BASIS,
00012  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00013  * See the License for the specific language governing permissions and
00014  * limitations under the License.
00015  */
00016 
00018 
00019 #ifndef NET_INSTAWEB_HTMLPARSE_HTML_LEXER_H_
00020 #define NET_INSTAWEB_HTMLPARSE_HTML_LEXER_H_
00021 
00022 #include <vector>
00023 #include "net/instaweb/htmlparse/public/html_name.h"
00024 #include "net/instaweb/htmlparse/public/doctype.h"
00025 #include "net/instaweb/htmlparse/public/html_element.h"
00026 #include "net/instaweb/http/public/content_type.h"
00027 #include "net/instaweb/util/public/basictypes.h"
00028 #include "net/instaweb/util/public/printf_format.h"
00029 #include "net/instaweb/util/public/string.h"
00030 #include "net/instaweb/util/public/string_util.h"
00031 
00032 namespace net_instaweb {
00033 
00034 class HtmlParse;
00035 
      // HTML lexer driven through StartParse() / Parse() / FinishParse().
      // Lexing state is kept in the private State enum and the member fields
      // declared at the bottom of the class; the private Eval* handlers each take
      // a single character.
      // NOTE(review): presumably lexed events are delivered to the HtmlParse given
      // to the constructor -- confirm against the implementation file.
00044 class HtmlLexer {
00045  public:
00046   explicit HtmlLexer(HtmlParse* html_parse);
00047   ~HtmlLexer();
00048 
        // Starts a parse for the document identified by `id` with the given
        // content type.
        // NOTE(review): id_ and content_type_ below presumably store these
        // arguments -- confirm.
00050   void StartParse(const StringPiece& id, const ContentType& content_type);
00051 
        // Feeds a buffer of input text to the lexer.
        // NOTE(review): `size` is presumably the length of `text` in bytes.
00054   void Parse(const char* text, int size);
00055 
        // Counterpart to StartParse().
        // NOTE(review): presumably flushes any pending token -- confirm.
00057   void FinishParse();
00058 
        // Tag-classification predicates keyed by HtmlName::Keyword. All three are
        // const and do not take an element, only the tag keyword.
00060   bool IsImplicitlyClosedTag(HtmlName::Keyword keyword) const;
00061 
00063   bool TagAllowsBriefTermination(HtmlName::Keyword keyword) const;
00064 
00066   bool IsOptionallyClosedTag(HtmlName::Keyword keyword) const;
00067 
        // Debugging aid.
        // NOTE(review): presumably prints element_stack_ -- confirm.
00069   void DebugPrintStack();
00070 
        // NOTE(review): presumably the innermost currently-open element (top of
        // element_stack_), or NULL when none is open -- confirm.
00073   HtmlElement* Parent() const;
00074 
        // Read-only access to the doctype_ member.
00077   const DocType& doctype() const { return doctype_; }
00078 
        // Stores `x` in size_limit_.
        // NOTE(review): units and the "no limit" value are not visible in this
        // header; presumably compared against num_bytes_parsed_.
00080   void set_size_limit(int64 x) { size_limit_ = x; }
00081 
        // Returns the current value of the size_limit_exceeded_ flag.
00084   bool size_limit_exceeded() const { return size_limit_exceeded_; }
00085 
00086  private:
        // Per-state character handlers. Their names correspond to values of the
        // State enum below (e.g. EvalCommentBody <-> COMMENT_BODY).
        // NOTE(review): TAG_ATTR_NAME_SPACE has no identically named handler
        // here -- check which Eval* function services it.
00088   inline void EvalStart(char c);
00089   inline void EvalTag(char c);
00090   inline void EvalTagOpen(char c);
00091   inline void EvalTagClose(char c);
00092   inline void EvalTagCloseTerminate(char c);
00093   inline void EvalTagBriefClose(char c);
00094   inline void EvalTagBriefCloseAttr(char c);
00095   inline void EvalCommentStart1(char c);
00096   inline void EvalCommentStart2(char c);
00097   inline void EvalCommentBody(char c);
00098   inline void EvalCommentEnd1(char c);
00099   inline void EvalCommentEnd2(char c);
00100   inline void EvalCdataStart1(char c);
00101   inline void EvalCdataStart2(char c);
00102   inline void EvalCdataStart3(char c);
00103   inline void EvalCdataStart4(char c);
00104   inline void EvalCdataStart5(char c);
00105   inline void EvalCdataStart6(char c);
00106   inline void EvalCdataBody(char c);
00107   inline void EvalCdataEnd1(char c);
00108   inline void EvalCdataEnd2(char c);
00109   inline void EvalAttribute(char c);
00110   inline void EvalAttrName(char c);
00111   inline void EvalAttrEq(char c);
00112   inline void EvalAttrVal(char c);
00113   inline void EvalAttrValSq(char c);
00114   inline void EvalAttrValDq(char c);
00115   inline void EvalLiteralTag(char c);
00116   inline void EvalDirective(char c);
00117 
        // Helpers for building the element/attribute currently being lexed.
        // NOTE(review): presumably these operate on element_, attr_name_ and
        // attr_value_ below -- confirm.
00120   void MakeElement();
00121 
00122   void MakeAttribute(bool has_value);
00123   void FinishAttribute(char c, bool has_value, bool brief_close);
00124 
        // Emit* helpers, one per kind of lexed construct (CDATA, comment, literal,
        // tag open/close, directive), plus Restart().
00125   void EmitCdata();
00126   void EmitComment();
00127   void EmitLiteral();
00128   void EmitTagOpen(bool allow_implicit_close); 
00129   void EmitTagClose(HtmlElement::CloseStyle close_style);
00130   void EmitTagBriefClose();
00131   void EmitDirective();
00132   void Restart(char c);
00133 
        // Takes a printf-style format string plus arguments. In
        // INSTAWEB_PRINTF_FORMAT(2, 3) the format is argument 2 and the variadic
        // arguments start at 3; the implicit `this` counts as argument 1.
        // NOTE(review): the macro is defined elsewhere (presumably
        // printf_format.h) and presumably enables compiler format checking.
00135   void SyntaxError(const char* format, ...) INSTAWEB_PRINTF_FORMAT(2, 3);
00136 
        // NOTE(review): presumably searches element_stack_ for an element whose
        // tag matches `tag`, removes and returns it, or returns NULL when there
        // is no match -- confirm.
00147   HtmlElement* PopElementMatchingTag(const StringPiece& tag);
00148 
00149   HtmlElement* PopElement();
00150   void CloseElement(HtmlElement* element, HtmlElement::CloseStyle close_style);
00151 
        // True when the high bit (0x80) of `c` is set, i.e. the byte is outside
        // the 7-bit ASCII range.
00155   static inline bool IsI18nChar(char c) {return (((c) & 0x80) != 0); }
00156 
        // Character-class predicates for tag names and attribute names; defined
        // outside this header.
00158   static inline bool IsLegalTagFirstChar(char c);
00160   static inline bool IsLegalTagChar(char c);
00161 
00163   static inline bool IsLegalAttrNameChar(char c);
00164 
        // Lexer states; the current one is held in state_.
        // NOTE(review): the per-state descriptions are missing from this listing.
        // The numbered suffixes (COMMENT_START1/2, CDATA_START1..6, *_END1/2)
        // presumably track how much of a multi-character delimiter has been
        // seen -- confirm against the implementation.
00171   enum State {
00172     START,
00173     TAG,                   
00174     TAG_CLOSE,             
00175     TAG_CLOSE_TERMINATE,   
00176     TAG_OPEN,              
00177     TAG_BRIEF_CLOSE,       
00178     TAG_BRIEF_CLOSE_ATTR,  
00179     COMMENT_START1,        
00180     COMMENT_START2,        
00181     COMMENT_BODY,          
00182     COMMENT_END1,          
00183     COMMENT_END2,          
00184     CDATA_START1,          
00185     CDATA_START2,          
00186     CDATA_START3,          
00187     CDATA_START4,          
00188     CDATA_START5,          
00189     CDATA_START6,          
00190     CDATA_BODY,            
00191     CDATA_END1,            
00192     CDATA_END2,            
00193     TAG_ATTRIBUTE,         
00194     TAG_ATTR_NAME,         
00195     TAG_ATTR_NAME_SPACE,   
00196     TAG_ATTR_EQ,           
00197     TAG_ATTR_VAL,          
00198     TAG_ATTR_VALDQ,        
00199     TAG_ATTR_VALSQ,        
00200     LITERAL_TAG,           
00201     DIRECTIVE              
00202   };
00203 
        // NOTE(review): html_parse_ presumably holds the constructor argument and
        // is not owned by the lexer -- confirm.
00204   HtmlParse* html_parse_;
00205   State state_;
        // Text accumulators for the construct currently being lexed.
        // NOTE(review): exact roles of token_ vs. literal_ are not visible here.
00206   GoogleString token_; 
00207   GoogleString literal_; 
00208   GoogleString attr_name_; 
00209   GoogleString attr_value_; 
00210   HtmlElement::QuoteStyle attr_quote_; 
00211   bool has_attr_value_; 
00212   HtmlElement* element_; 
        // NOTE(review): presumably the current input line number and the line on
        // which the tag being lexed started (for diagnostics) -- confirm.
00213   int line_;
00214   int tag_start_line_; 
00215   GoogleString id_;
00216   GoogleString literal_close_; 
00217 
00218   ContentType content_type_;
        // Exposed read-only through doctype().
00219   DocType doctype_;
00220 
        // NOTE(review): presumably the stack of currently open elements used by
        // Parent(), PopElement() and PopElementMatchingTag() -- confirm.
00221   std::vector<HtmlElement*> element_stack_;
00222 
        // Size-limit bookkeeping. size_limit_ is written by set_size_limit() and
        // size_limit_exceeded_ is read by size_limit_exceeded().
        // NOTE(review): skip_parsing_ and num_bytes_parsed_ are presumably updated
        // by Parse() -- confirm.
00225   bool size_limit_exceeded_;
00228   bool skip_parsing_;
00229   int64 num_bytes_parsed_;
00230   int64 size_limit_;
00231 
        // NOTE(review): macro defined elsewhere (presumably basictypes.h); by its
        // name it disables copy construction and assignment for this class.
00232   DISALLOW_COPY_AND_ASSIGN(HtmlLexer);
00233 };
00234 
00235 }  // namespace net_instaweb
00236 
00237 #endif  // NET_INSTAWEB_HTMLPARSE_HTML_LEXER_H_
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines