Page Speed Optimization Libraries  1.2.24.1
net/instaweb/htmlparse/public/html_parse.h
Go to the documentation of this file.
00001 /*
00002  * Copyright 2010 Google Inc.
00003  *
00004  * Licensed under the Apache License, Version 2.0 (the "License");
00005  * you may not use this file except in compliance with the License.
00006  * You may obtain a copy of the License at
00007  *
00008  *      http://www.apache.org/licenses/LICENSE-2.0
00009  *
00010  * Unless required by applicable law or agreed to in writing, software
00011  * distributed under the License is distributed on an "AS IS" BASIS,
00012  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00013  * See the License for the specific language governing permissions and
00014  * limitations under the License.
00015  */
00016 
00018 
00019 #ifndef NET_INSTAWEB_HTMLPARSE_PUBLIC_HTML_PARSE_H_
00020 #define NET_INSTAWEB_HTMLPARSE_PUBLIC_HTML_PARSE_H_
00021 
00022 #include <cstdarg>
00023 #include <cstddef>
00024 #include <list>
00025 #include <set>
00026 #include <vector>
00027 
00028 #include "net/instaweb/util/public/basictypes.h"
00029 #include "net/instaweb/htmlparse/public/html_element.h"
00030 #include "net/instaweb/htmlparse/public/html_name.h"
00031 #include "net/instaweb/htmlparse/public/html_node.h"
00032 #include "net/instaweb/http/public/content_type.h"
00033 #include "net/instaweb/util/public/arena.h"
00034 #include "net/instaweb/util/public/google_url.h"
00035 #include "net/instaweb/util/public/printf_format.h"
00036 #include "net/instaweb/util/public/string.h"
00037 #include "net/instaweb/util/public/string_util.h"
00038 #include "net/instaweb/util/public/symbol_table.h"
00039 
00040 namespace net_instaweb {
00041 
// Forward declarations.  In this header these types appear only behind a
// pointer or reference (e.g. HtmlLexer* lexer_, const DocType& doctype()),
// so their full definitions are not needed here.
00042 class DocType;
00043 class HtmlEvent;
00044 class HtmlFilter;
00045 class HtmlLexer;
00046 class MessageHandler;
00047 class Timer;
00048 
// Set of pointers to const HtmlEvent.  Within this header it is used as the
// |event_set| parameter of HtmlParse::SplitQueueOnFirstEventInSet().
00049 typedef std::set <const HtmlEvent*> ConstHtmlEventSet;
00050 
00054 class HtmlParse {
        // HtmlParse declares the entry points for feeding text to a parse
        // (StartParse*/ParseText/Flush/FinishParse), for registering HtmlFilter
        // objects, and for creating and rearranging HtmlNode objects.
        //
        // Only the inline bodies below are visible in this header.  Statements
        // about out-of-line members are marked as assumptions to be confirmed
        // against the implementation file.
00055  public:
        /// NOTE(review): |message_handler| is presumably what message_handler()
        /// later returns (message_handler_) -- confirm in the implementation.
00056   explicit HtmlParse(MessageHandler* message_handler);
00057   virtual ~HtmlParse();
00058 
00060 
        /// Registers |filter|.  NOTE(review): ownership of |filter| cannot be
        /// determined from this header -- confirm before deleting it yourself.
00063   void AddFilter(HtmlFilter* filter);
00064 
        /// Convenience overload: starts a parse of |url| with kContentTypeHtml
        /// and returns the result of StartParseWithType().
00070   bool StartParse(const StringPiece& url) {
00071     return StartParseWithType(url, kContentTypeHtml);
00072   }
        /// Starts a parse whose id is the URL itself: forwards to
        /// StartParseId(url, url, content_type) and returns its result.
00073   bool StartParseWithType(const StringPiece& url,
00074                           const ContentType& content_type) {
00075     return StartParseId(url, url, content_type);
00076   }
00077 
        /// Returns the url_valid_ flag.  Presumably set by StartParseId();
        /// TODO confirm.
00079   bool is_url_valid() const { return url_valid_; }
00080 
        /// Virtual entry point that both StartParse() and StartParseWithType()
        /// funnel into.  NOTE(review): assumes |id| is stored in id_, which is
        /// what id(), UrlLine() and the *HereV() helpers report -- confirm.
00085   virtual bool StartParseId(const StringPiece& url, const StringPiece& id,
00086                             const ContentType& content_type);
00087 
        /// Both ParseText() overloads forward to the virtual ParseTextInternal(),
        /// so subclasses customize parsing by overriding that hook.
00097   void ParseText(const char* content, int size) {
00098     ParseTextInternal(content, size);
00099   }
        /// StringPiece form.  sp.size() is passed to the int |size| parameter of
        /// ParseTextInternal(), so the length is converted to int at this call.
00100   void ParseText(const StringPiece& sp) {
00101     ParseTextInternal(sp.data(), sp.size());
00102   }
00103 
        /// Virtual; implementation not visible in this header.
00118   virtual void Flush();
00119 
        /// Virtual; implementation not visible in this header.  See also the
        /// protected BeginFinishParse()/EndFinishParse() pair.
00124   virtual void FinishParse();
00125 
00126 
00128 
        /// Factory functions for the non-element node types.  Each takes the
        /// parent element and the node's text.  NOTE(review): the nodes are
        /// presumably allocated from nodes_ (Arena<HtmlNode>) rather than owned
        /// by the caller -- confirm.
00129   HtmlCdataNode* NewCdataNode(HtmlElement* parent,
00130                               const StringPiece& contents);
00131   HtmlCharactersNode* NewCharactersNode(HtmlElement* parent,
00132                                         const StringPiece& literal);
00133   HtmlCommentNode* NewCommentNode(HtmlElement* parent,
00134                                   const StringPiece& contents);
00135   HtmlDirectiveNode* NewDirectiveNode(HtmlElement* parent,
00136                                       const StringPiece& contents);
00137   HtmlIEDirectiveNode* NewIEDirectiveNode(HtmlElement* parent,
00138                                           const StringPiece& contents);
00139 
00143 
        /// Tree-editing operations (all defined out of line).  Despite the
        /// "Element" in several names, the node parameters are HtmlNode*.
        /// NOTE(review): the bool-returning operations presumably report whether
        /// the edit could be performed -- confirm in the implementation.
00150   void InsertElementBeforeElement(const HtmlNode* existing_node,
00151                                   HtmlNode* new_node);
00152   void InsertElementAfterElement(const HtmlNode* existing_node,
00153                                  HtmlNode* new_node);
00154 
00158   void PrependChild(const HtmlElement* existing_parent, HtmlNode* new_child);
00159   void AppendChild(const HtmlElement* existing_parent, HtmlNode* new_child);
00160 
00163   void InsertElementBeforeCurrent(HtmlNode* new_node);
00164 
00169   void InsertElementAfterCurrent(HtmlNode* new_node);
00170 
00176   bool AddParentToSequence(HtmlNode* first, HtmlNode* last,
00177                            HtmlElement* new_parent);
00178 
00187   bool MoveCurrentInto(HtmlElement* new_parent);
00188 
00194   bool MoveCurrentBefore(HtmlNode* existing_node);
00195 
00199   bool DeleteElement(HtmlNode* node);
00200 
00203   bool DeleteSavingChildren(HtmlElement* element);
00204 
00217   bool HasChildrenInFlushWindow(HtmlElement* element);
00218 
00221   bool ReplaceNode(HtmlNode* existing_node, HtmlNode* new_node);
00222 
00225   HtmlElement* CloneElement(HtmlElement* in_element);
00226 
        /// The StringPiece and Keyword overloads convert their argument with
        /// MakeName() and forward to the HtmlName overload declared below.
00227   HtmlElement* NewElement(HtmlElement* parent, const StringPiece& str) {
00228     return NewElement(parent, MakeName(str));
00229   }
00230   HtmlElement* NewElement(HtmlElement* parent, HtmlName::Keyword keyword) {
00231     return NewElement(parent, MakeName(keyword));
00232   }
00233   HtmlElement* NewElement(HtmlElement* parent, const HtmlName& name);
00234 
        /// Adds an attribute named by |keyword| with |value| to |element|,
        /// always passing HtmlElement::DOUBLE_QUOTE as the quote style.
        /// |element| is dereferenced unconditionally, so it must be non-null.
00235   void AddAttribute(HtmlElement* element, HtmlName::Keyword keyword,
00236                     const StringPiece& value) {
00237     return element->AddAttribute(MakeName(keyword), value,
00238                                  HtmlElement::DOUBLE_QUOTE);
00239   }
        /// As AddAttribute(), but forwards to HtmlElement::AddEscapedAttribute()
        /// with |escaped_value|; also uses HtmlElement::DOUBLE_QUOTE.
00240   void AddEscapedAttribute(HtmlElement* element, HtmlName::Keyword keyword,
00241                     const StringPiece& escaped_value) {
00242     return element->AddEscapedAttribute(MakeName(keyword), escaped_value,
00243                                         HtmlElement::DOUBLE_QUOTE);
00244   }
        /// Integer convenience overload: formats |value| with IntegerToString()
        /// and forwards to the StringPiece overload above.
00245   void AddAttribute(HtmlElement* element, HtmlName::Keyword keyword,
00246                     int value) {
00247     return AddAttribute(element, keyword, IntegerToString(value));
00248   }
        /// Sets |attribute|'s name to MakeName(keyword).  |attribute| is
        /// dereferenced unconditionally, so it must be non-null.
00249   void SetAttributeName(HtmlElement::Attribute* attribute,
00250                         HtmlName::Keyword keyword) {
00251     attribute->set_name(MakeName(keyword));
00252   }
00253 
        /// Build an HtmlName from a string or from a keyword.  NOTE(review):
        /// the string form presumably interns |str| in string_table_ -- confirm.
00254   HtmlName MakeName(const StringPiece& str);
00255   HtmlName MakeName(HtmlName::Keyword keyword);
00256 
00257   bool IsRewritable(const HtmlNode* node) const;
00258 
00259   void ClearElements();
00260 
00262   void DebugLogQueue();
00263 
00265   void DebugPrintQueue();
00266 
        /// Grants HtmlLexer access to this class's non-public members.
00268   friend class HtmlLexer;
00269 
00272   bool IsImplicitlyClosedTag(HtmlName::Keyword keyword) const;
00273 
00277   bool IsOptionallyClosedTag(HtmlName::Keyword keyword) const;
00278 
00280   bool TagAllowsBriefTermination(HtmlName::Keyword keyword) const;
00281 
        /// Simple accessors.  url() and id() return c_str() of the url_ and id_
        /// members, i.e. pointers into storage owned by this object.
00282   MessageHandler* message_handler() const { return message_handler_; }
00285   const char* url() const { return url_.c_str(); }
00287   const GoogleUrl& google_url() const { return google_url_; }
00288   const char* id() const { return id_.c_str(); }
00289   int line_number() const { return line_number_; }
        /// Returns "<id>:<line_number>", formatted with StringPrintf("%s:%d").
00291   GoogleString UrlLine() const {
00292     return StringPrintf("%s:%d", id(), line_number());
00293   }
00294 
00297   const DocType& doctype() const;
00298 
        /// printf-style messages with an explicit |filename| and |line|.
        /// NOTE(review): INSTAWEB_PRINTF_FORMAT presumably expands to a printf
        /// format-checking attribute.  If so, the indices (4, 5) count the
        /// implicit |this| as argument 1, making |msg| argument 4 and the
        /// variadic arguments start at 5 -- confirm in printf_format.h.
00300   void Info(const char* filename, int line, const char* msg, ...)
00301       INSTAWEB_PRINTF_FORMAT(4, 5);
00302   void Warning(const char* filename, int line, const char* msg, ...)
00303       INSTAWEB_PRINTF_FORMAT(4, 5);
00304   void Error(const char* filename, int line, const char* msg, ...)
00305       INSTAWEB_PRINTF_FORMAT(4, 5);
00306   void FatalError(const char* filename, int line, const char* msg, ...)
00307       INSTAWEB_PRINTF_FORMAT(4, 5);
00308 
        /// va_list forms of the four message functions above.
00309   void InfoV(const char* file, int line, const char *msg, va_list args);
00310   void WarningV(const char* file, int line, const char *msg, va_list args);
00311   void ErrorV(const char* file, int line, const char *msg, va_list args);
00312   void FatalErrorV(const char* file, int line, const char* msg, va_list args);
00313 
        /// Variadic messages without an explicit location.  NOTE(review):
        /// presumably these use id_ and line_number_ as the location, as the
        /// inline *HereV() helpers below do -- confirm.
00315   void InfoHere(const char* msg, ...) INSTAWEB_PRINTF_FORMAT(2, 3);
00316   void WarningHere(const char* msg, ...) INSTAWEB_PRINTF_FORMAT(2, 3);
00317   void ErrorHere(const char* msg, ...) INSTAWEB_PRINTF_FORMAT(2, 3);
00318   void FatalErrorHere(const char* msg, ...) INSTAWEB_PRINTF_FORMAT(2, 3);
00319 
00322   void ShowProgress(const char* message);
00323 
        /// va_list forms of the *Here() helpers.  Each forwards to the matching
        /// *V() function, passing id_.c_str() as the file and line_number_ as
        /// the line.
00324   void InfoHereV(const char *msg, va_list args) {
00325     InfoV(id_.c_str(), line_number_, msg, args);
00326   }
00327   void WarningHereV(const char *msg, va_list args) {
00328     WarningV(id_.c_str(), line_number_, msg, args);
00329   }
00330   void ErrorHereV(const char *msg, va_list args) {
00331     ErrorV(id_.c_str(), line_number_, msg, args);
00332   }
00333   void FatalErrorHereV(const char* msg, va_list args) {
00334     FatalErrorV(id_.c_str(), line_number_, msg, args);
00335   }
00336 
00337   void AddElement(HtmlElement* element, int line_number);
00338   void CloseElement(HtmlElement* element, HtmlElement::CloseStyle close_style,
00339                     int line_number);
00340 
00342   void ApplyFilter(HtmlFilter* filter);
00343 
        /// Plain setters for timer_ and log_rewrite_timing_.  NOTE(review):
        /// ownership of |timer| is not visible in this header -- confirm.
00346   void set_timer(Timer* timer) { timer_ = timer; }
00347   void set_log_rewrite_timing(bool x) { log_rewrite_timing_ = x; }
00348 
00351   void add_event_listener(HtmlFilter* listener);
00352 
00356   void InsertComment(const StringPiece& sp);
00357 
        /// No size-limit data member is declared in this class, so the limit is
        /// presumably held elsewhere (e.g. by lexer_) -- TODO confirm.
00359   void set_size_limit(int64 x);
00361   bool size_limit_exceeded() const;
00362 
00363  protected:
        /// FilterVector is the type of filters_ and event_listeners_.
        /// FilterList is not used by any declaration visible in this class.
00364   typedef std::vector<HtmlFilter*> FilterVector;
00365   typedef std::list<HtmlFilter*> FilterList;
00366 
        /// NOTE(review): presumably the two halves of FinishParse(), exposed so
        /// subclasses can do work between them -- confirm.
00370   void BeginFinishParse();
00371   void EndFinishParse();
00372 
00374   size_t GetEventQueueSize();
00375 
00377   void AppendEventsToQueue(HtmlEventList* extra_events);
00378 
00381   HtmlEvent* SplitQueueOnFirstEventInSet(const ConstHtmlEventSet& event_set,
00382                                          HtmlEventList* tail);
00383 
00386   HtmlEvent* GetEndElementEvent(const HtmlElement* element);
00387 
        /// Virtual hook that both public ParseText() overloads forward to.
00388   virtual void ParseTextInternal(const char* content, int size);
00389 
00391   void DetermineEnabledFilters(FilterVector* filters) const;
00392 
00393  private:
        /// Private helpers; all are defined out of line.
00394   void ApplyFilterHelper(HtmlFilter* filter);
00395   HtmlEventListIterator Last(); 
00396   bool IsInEventWindow(const HtmlEventListIterator& iter) const;
00397   void InsertElementBeforeEvent(const HtmlEventListIterator& event,
00398                                 HtmlNode* new_node);
00399   void InsertElementAfterEvent(const HtmlEventListIterator& event,
00400                                HtmlNode* new_node);
00401   bool MoveCurrentBeforeEvent(const HtmlEventListIterator& move_to);
00402   bool IsDescendantOf(const HtmlNode* possible_child,
00403                       const HtmlNode* possible_parent);
00404   void SanityCheck();
00405   void CheckEventParent(HtmlEvent* event, HtmlElement* expect,
00406                         HtmlElement* actual);
00407   void CheckParentFromAddEvent(HtmlEvent* event);
00408   void FixParents(const HtmlEventListIterator& begin,
00409                   const HtmlEventListIterator& end_inclusive,
00410                   HtmlElement* new_parent);
00411   void CoalesceAdjacentCharactersNodes();
00412   void ClearEvents();
00413   void EmitQueue(MessageHandler* handler);
00414 
        /// Grants HtmlTestingPeer access to the private members.  NOTE(review):
        /// presumably a test-only accessor class for the members declared just
        /// below -- confirm.
00416   friend class HtmlTestingPeer;
00417   void AddEvent(HtmlEvent* event);
00418   void SetCurrent(HtmlNode* node);
00419   void set_coalesce_characters(bool x) { coalesce_characters_ = x; }
        /// Returns string_table_.string_bytes_allocated().
00420   size_t symbol_table_size() const {
00421     return string_table_.string_bytes_allocated();
00422   }
00423 
        /// Data members.  Declaration order is also construction order, so do
        /// not reorder these without checking the constructor's initializer
        /// list.
00424   FilterVector event_listeners_;
00425   SymbolTableSensitive string_table_;
00426   FilterVector filters_;
00427   HtmlLexer* lexer_;
00428   Arena<HtmlNode> nodes_;
00429   HtmlEventList queue_;
00430   HtmlEventListIterator current_;
00432   MessageHandler* message_handler_;
00433   GoogleString url_;
00434   GoogleUrl google_url_;
        /// Reported by id(), UrlLine() and the *HereV() message helpers.
00435   GoogleString id_; 
        /// Reported by line_number(), UrlLine() and the *HereV() helpers.
00436   int line_number_;
00437   bool deleted_current_;
00438   bool need_sanity_check_;
        /// Written by set_coalesce_characters().
00439   bool coalesce_characters_;
00440   bool need_coalesce_characters_;
        /// Read by is_url_valid().
00441   bool url_valid_;
        /// Written by set_log_rewrite_timing().
00442   bool log_rewrite_timing_; 
00443   bool running_filters_;
00444   int64 parse_start_time_us_;
        /// Written by set_timer().
00445   Timer* timer_;
00446   int first_filter_;
00447 
        /// NOTE(review): presumably makes the class non-copyable; the macro is
        /// defined outside this header -- confirm.
00448   DISALLOW_COPY_AND_ASSIGN(HtmlParse);
00449 };
00450 
00451 }  
00452 
00453 #endif  // NET_INSTAWEB_HTMLPARSE_PUBLIC_HTML_PARSE_H_
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines