Page Speed Optimization Libraries  1.5.27.2
net/instaweb/htmlparse/public/html_parse.h
Go to the documentation of this file.
00001 /*
00002  * Copyright 2010 Google Inc.
00003  *
00004  * Licensed under the Apache License, Version 2.0 (the "License");
00005  * you may not use this file except in compliance with the License.
00006  * You may obtain a copy of the License at
00007  *
00008  *      http://www.apache.org/licenses/LICENSE-2.0
00009  *
00010  * Unless required by applicable law or agreed to in writing, software
00011  * distributed under the License is distributed on an "AS IS" BASIS,
00012  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00013  * See the License for the specific language governing permissions and
00014  * limitations under the License.
00015  */
00016 
00018 
00019 #ifndef NET_INSTAWEB_HTMLPARSE_PUBLIC_HTML_PARSE_H_
00020 #define NET_INSTAWEB_HTMLPARSE_PUBLIC_HTML_PARSE_H_
00021 
00022 #include <cstdarg>
00023 #include <cstddef>
00024 #include <list>
00025 #include <set>
00026 #include <vector>
00027 
00028 #include "net/instaweb/util/public/basictypes.h"
00029 #include "net/instaweb/htmlparse/public/html_element.h"
00030 #include "net/instaweb/htmlparse/public/html_name.h"
00031 #include "net/instaweb/htmlparse/public/html_node.h"
00032 #include "net/instaweb/http/public/content_type.h"
00033 #include "net/instaweb/util/public/arena.h"
00034 #include "net/instaweb/util/public/google_url.h"
00035 #include "net/instaweb/util/public/printf_format.h"
00036 #include "net/instaweb/util/public/string.h"
00037 #include "net/instaweb/util/public/string_util.h"
00038 #include "net/instaweb/util/public/symbol_table.h"
00039 
00040 namespace net_instaweb {
00041 
/// Forward declarations. Within this header each of these types is used only
/// through a pointer or a reference (or, for HtmlLexer, additionally as a
/// friend of HtmlParse), so their full definitions are not needed here.
00042 class DocType;
00043 class HtmlEvent;
00044 class HtmlFilter;
00045 class HtmlLexer;
00046 class MessageHandler;
00047 class Timer;
00048 
/// A std::set of pointers to const HtmlEvent. With the default comparator the
/// elements are ordered by pointer value, not by event contents. It is not
/// referenced anywhere else in the visible part of this header.
00049 typedef std::set <const HtmlEvent*> ConstHtmlEventSet;
00050 
/// HTML parser front end.
///
/// What this declaration shows: a parse is started with one of the
/// StartParse*() calls, text is fed in through ParseText(), HtmlFilter
/// objects are registered with AddFilter(), nodes are created with the
/// New*Node() / NewElement() factories, and diagnostics are reported through
/// the printf-style Info/Warning/Error/FatalError families.
///
/// NOTE(review): only the inline bodies are visible in this header. Comments
/// on out-of-line members restate their signatures; anything marked
/// "presumably" must be confirmed against the implementation file.
00085 class HtmlParse {
00086  public:
        /// NOTE(review): message_handler is presumably what message_handler()
        /// later returns; ownership is not visible here -- confirm.
00087   explicit HtmlParse(MessageHandler* message_handler);
        /// Virtual because the class has virtual members meant to be overridden.
00088   virtual ~HtmlParse();
00089 
00091 
        /// Registers a filter with this parser.
        /// NOTE(review): presumably stored in filters_; whether the parser takes
        /// ownership of the pointer is not visible here -- confirm.
00094   void AddFilter(HtmlFilter* filter);
00095 
        /// Convenience overload: forwards to StartParseWithType() with
        /// kContentTypeHtml and returns its result.
00101   bool StartParse(const StringPiece& url) {
00102     return StartParseWithType(url, kContentTypeHtml);
00103   }
        /// Forwards to StartParseId(), passing url as both the URL and the id, and
        /// returns its result.
00104   bool StartParseWithType(const StringPiece& url,
00105                           const ContentType& content_type) {
00106     return StartParseId(url, url, content_type);
00107   }
00108 
        /// Returns the url_valid_ flag.
00110   bool is_url_valid() const { return url_valid_; }
00111 
        /// The entry point that StartParse() and StartParseWithType() funnel into.
        /// Virtual, so a subclass can intercept the start of every parse.
        /// NOTE(review): the bool result presumably reports whether url was
        /// accepted (compare is_url_valid()) -- confirm.
00116   virtual bool StartParseId(const StringPiece& url, const StringPiece& id,
00117                             const ContentType& content_type);
00118 
        /// Feeds size bytes starting at content to ParseTextInternal().
00128   void ParseText(const char* content, int size) {
00129     ParseTextInternal(content, size);
00130   }
        /// StringPiece overload: forwards sp.data() and sp.size() to
        /// ParseTextInternal(), whose length parameter is an int.
00131   void ParseText(const StringPiece& sp) {
00132     ParseTextInternal(sp.data(), sp.size());
00133   }
00134 
        /// Virtual; overridable by subclasses. Behaviour is defined out of line.
00149   virtual void Flush();
00150 
        /// Virtual; overridable by subclasses. See also the protected
        /// BeginFinishParse() / EndFinishParse() pair.
00155   virtual void FinishParse();
00156 
00157 
00159 
        /// Node factories: each takes the intended parent element and the node's
        /// text and returns a pointer to a new node of the matching type.
        /// NOTE(review): the returned nodes are presumably allocated from nodes_
        /// and owned by the parser -- confirm before deleting one manually.
00160   HtmlCdataNode* NewCdataNode(HtmlElement* parent,
00161                               const StringPiece& contents);
00162   HtmlCharactersNode* NewCharactersNode(HtmlElement* parent,
00163                                         const StringPiece& literal);
00164   HtmlCommentNode* NewCommentNode(HtmlElement* parent,
00165                                   const StringPiece& contents);
00166   HtmlDirectiveNode* NewDirectiveNode(HtmlElement* parent,
00167                                       const StringPiece& contents);
00168   HtmlIEDirectiveNode* NewIEDirectiveNode(HtmlElement* parent,
00169                                           const StringPiece& contents);
00170 
00174 
        /// Tree-editing operations, all defined out of line. Despite the
        /// "Element" in several names, the node parameters are HtmlNode*.
        /// NOTE(review): the bool-returning operations presumably report whether
        /// the edit could be carried out -- confirm the failure conditions.
00181   void InsertElementBeforeElement(const HtmlNode* existing_node,
00182                                   HtmlNode* new_node);
00183   void InsertElementAfterElement(const HtmlNode* existing_node,
00184                                  HtmlNode* new_node);
00185 
00189   void PrependChild(const HtmlElement* existing_parent, HtmlNode* new_child);
00190   void AppendChild(const HtmlElement* existing_parent, HtmlNode* new_child);
00191 
00194   void InsertElementBeforeCurrent(HtmlNode* new_node);
00195 
00200   void InsertElementAfterCurrent(HtmlNode* new_node);
00201 
00207   bool AddParentToSequence(HtmlNode* first, HtmlNode* last,
00208                            HtmlElement* new_parent);
00209 
00218   bool MoveCurrentInto(HtmlElement* new_parent);
00219 
00225   bool MoveCurrentBefore(HtmlNode* existing_node);
00226 
00230   bool DeleteElement(HtmlNode* node);
00231 
00234   bool DeleteSavingChildren(HtmlElement* element);
00235 
00248   bool HasChildrenInFlushWindow(HtmlElement* element);
00249 
00252   bool ReplaceNode(HtmlNode* existing_node, HtmlNode* new_node);
00253 
00256   HtmlElement* CloneElement(HtmlElement* in_element);
00257 
        /// NewElement overloads. The string and keyword forms convert their
        /// argument with MakeName() and forward to the HtmlName overload, which
        /// is defined out of line.
00258   HtmlElement* NewElement(HtmlElement* parent, const StringPiece& str) {
00259     return NewElement(parent, MakeName(str));
00260   }
00261   HtmlElement* NewElement(HtmlElement* parent, HtmlName::Keyword keyword) {
00262     return NewElement(parent, MakeName(keyword));
00263   }
00264   HtmlElement* NewElement(HtmlElement* parent, const HtmlName& name);
00265 
        /// Calls element->AddAttribute() with the name built from keyword, the
        /// given value, and HtmlElement::DOUBLE_QUOTE as the quoting style.
        /// element is dereferenced unconditionally, so it must not be null.
00266   void AddAttribute(HtmlElement* element, HtmlName::Keyword keyword,
00267                     const StringPiece& value) {
00268     return element->AddAttribute(MakeName(keyword), value,
00269                                  HtmlElement::DOUBLE_QUOTE);
00270   }
        /// Same as AddAttribute() above, but calls element->AddEscapedAttribute().
        /// NOTE(review): escaped_value is presumably expected to be already
        /// HTML-escaped -- confirm against HtmlElement.
00271   void AddEscapedAttribute(HtmlElement* element, HtmlName::Keyword keyword,
00272                     const StringPiece& escaped_value) {
00273     return element->AddEscapedAttribute(MakeName(keyword), escaped_value,
00274                                         HtmlElement::DOUBLE_QUOTE);
00275   }
        /// Integer overload: converts value with IntegerToString() and forwards to
        /// the StringPiece overload of AddAttribute().
00276   void AddAttribute(HtmlElement* element, HtmlName::Keyword keyword,
00277                     int value) {
00278     return AddAttribute(element, keyword, IntegerToString(value));
00279   }
        /// Replaces the attribute's name with the one built from keyword.
        /// attribute is dereferenced unconditionally, so it must not be null.
00280   void SetAttributeName(HtmlElement::Attribute* attribute,
00281                         HtmlName::Keyword keyword) {
00282     attribute->set_name(MakeName(keyword));
00283   }
00284 
        /// Build an HtmlName from a string or from a keyword.
        /// NOTE(review): the string form presumably interns str in string_table_
        /// -- confirm.
00285   HtmlName MakeName(const StringPiece& str);
00286   HtmlName MakeName(HtmlName::Keyword keyword);
00287 
00288   bool IsRewritable(const HtmlNode* node) const;
00289 
00290   void ClearElements();
00291 
00293   void DebugLogQueue();
00294 
00296   void DebugPrintQueue();
00297 
        /// Grants HtmlLexer access to the non-public members of this class.
00299   friend class HtmlLexer;
00300 
        /// Tag-classification predicates keyed by HtmlName::Keyword. The two
        /// *LiteralTag() predicates are static; the others are const members.
00303   bool IsImplicitlyClosedTag(HtmlName::Keyword keyword) const;
00304 
00308   static bool IsLiteralTag(HtmlName::Keyword keyword);
00309 
00317   static bool IsSometimesLiteralTag(HtmlName::Keyword keyword);
00318 
00322   bool IsOptionallyClosedTag(HtmlName::Keyword keyword) const;
00323 
00325   bool TagAllowsBriefTermination(HtmlName::Keyword keyword) const;
00326 
        /// Simple accessors. url() and id() return pointers into the url_ and id_
        /// members, so the pointers are only usable while those members are
        /// unchanged and this object is alive.
00327   MessageHandler* message_handler() const { return message_handler_; }
00330   const char* url() const { return url_.c_str(); }
00332   const GoogleUrl& google_url() const { return google_url_; }
00333   const char* id() const { return id_.c_str(); }
00334   int line_number() const { return line_number_; }
        /// Returns "<id>:<line_number>". Note that, despite the name, the first
        /// component is id(), not url().
00336   GoogleString UrlLine() const {
00337     return StringPrintf("%s:%d", id(), line_number());
00338   }
00339 
00342   const DocType& doctype() const;
00343 
        /// printf-style reporting with an explicit file name and line.
        /// NOTE(review): INSTAWEB_PRINTF_FORMAT(4, 5) presumably expands to a
        /// printf format-checking attribute; the indices are consistent with
        /// counting the implicit `this` as argument 1, making msg argument 4.
00345   void Info(const char* filename, int line, const char* msg, ...)
00346       INSTAWEB_PRINTF_FORMAT(4, 5);
00347   void Warning(const char* filename, int line, const char* msg, ...)
00348       INSTAWEB_PRINTF_FORMAT(4, 5);
00349   void Error(const char* filename, int line, const char* msg, ...)
00350       INSTAWEB_PRINTF_FORMAT(4, 5);
00351   void FatalError(const char* filename, int line, const char* msg, ...)
00352       INSTAWEB_PRINTF_FORMAT(4, 5);
00353 
        /// va_list counterparts of the four reporters above.
00354   void InfoV(const char* file, int line, const char *msg, va_list args);
00355   void WarningV(const char* file, int line, const char *msg, va_list args);
00356   void ErrorV(const char* file, int line, const char *msg, va_list args);
00357   void FatalErrorV(const char* file, int line, const char* msg, va_list args);
00358 
        /// printf-style reporting without an explicit location.
        /// NOTE(review): presumably these use the current id and line number, as
        /// the *HereV() variants below visibly do -- confirm.
00360   void InfoHere(const char* msg, ...) INSTAWEB_PRINTF_FORMAT(2, 3);
00361   void WarningHere(const char* msg, ...) INSTAWEB_PRINTF_FORMAT(2, 3);
00362   void ErrorHere(const char* msg, ...) INSTAWEB_PRINTF_FORMAT(2, 3);
00363   void FatalErrorHere(const char* msg, ...) INSTAWEB_PRINTF_FORMAT(2, 3);
00364 
00367   void ShowProgress(const char* message);
00368 
        /// va_list reporters that supply the location themselves: each forwards to
        /// the matching *V() function with id_ as the file name and line_number_
        /// as the line.
00369   void InfoHereV(const char *msg, va_list args) {
00370     InfoV(id_.c_str(), line_number_, msg, args);
00371   }
00372   void WarningHereV(const char *msg, va_list args) {
00373     WarningV(id_.c_str(), line_number_, msg, args);
00374   }
00375   void ErrorHereV(const char *msg, va_list args) {
00376     ErrorV(id_.c_str(), line_number_, msg, args);
00377   }
00378   void FatalErrorHereV(const char* msg, va_list args) {
00379     FatalErrorV(id_.c_str(), line_number_, msg, args);
00380   }
00381 
00382   void AddElement(HtmlElement* element, int line_number);
00383   void CloseElement(HtmlElement* element, HtmlElement::CloseStyle close_style,
00384                     int line_number);
00385 
00387   void ApplyFilter(HtmlFilter* filter);
00388 
        /// Timer plumbing: set_timer() stores the pointer as given (no ownership
        /// transfer is visible here) and timer() returns it.
        /// set_log_rewrite_timing() stores the flag in log_rewrite_timing_.
00391   void set_timer(Timer* timer) { timer_ = timer; }
00392   Timer* timer() const { return timer_; }
00393   void set_log_rewrite_timing(bool x) { log_rewrite_timing_ = x; }
00394 
        /// NOTE(review): presumably appends to event_listeners_ -- confirm.
00397   void add_event_listener(HtmlFilter* listener);
00398 
        /// Takes the comment text by value.
        /// NOTE(review): the bool result presumably reports whether the comment
        /// could be inserted -- confirm.
00408   bool InsertComment(StringPiece sp);
00409 
00411   void set_size_limit(int64 x);
00413   bool size_limit_exceeded() const;
00414 
00415  protected:
        /// Container aliases for filter pointers. Within this header only
        /// FilterVector is used (event_listeners_ and filters_).
00416   typedef std::vector<HtmlFilter*> FilterVector;
00417   typedef std::list<HtmlFilter*> FilterList;
00418 
        /// NOTE(review): presumably the two halves of FinishParse(), exposed so a
        /// subclass can run work between them -- confirm.
00422   void BeginFinishParse();
00423   void EndFinishParse();
00424 
00426   size_t GetEventQueueSize();
00427 
        /// Both public ParseText() overloads forward here. Virtual, so a subclass
        /// can intercept all incoming text.
00428   virtual void ParseTextInternal(const char* content, int size);
00429 
        /// Virtual hook for subclasses.
        /// NOTE(review): determine_enabled_filters_called_ presumably records
        /// that this has run -- confirm.
00436   virtual void DetermineEnabledFilters();
00437 
00438  private:
        /// Internal helpers, all defined out of line.
00439   void ApplyFilterHelper(HtmlFilter* filter);
00440   HtmlEventListIterator Last(); 
00441   bool IsInEventWindow(const HtmlEventListIterator& iter) const;
00442   void InsertElementBeforeEvent(const HtmlEventListIterator& event,
00443                                 HtmlNode* new_node);
00444   void InsertElementAfterEvent(const HtmlEventListIterator& event,
00445                                HtmlNode* new_node);
00446   bool MoveCurrentBeforeEvent(const HtmlEventListIterator& move_to);
00447   bool IsDescendantOf(const HtmlNode* possible_child,
00448                       const HtmlNode* possible_parent);
00449   void SanityCheck();
00450   void CheckEventParent(HtmlEvent* event, HtmlElement* expect,
00451                         HtmlElement* actual);
00452   void CheckParentFromAddEvent(HtmlEvent* event);
00453   void FixParents(const HtmlEventListIterator& begin,
00454                   const HtmlEventListIterator& end_inclusive,
00455                   HtmlElement* new_parent);
00456   void CoalesceAdjacentCharactersNodes();
00457   void ClearEvents();
00458   void EmitQueue(MessageHandler* handler);
00459 
        /// Grants HtmlTestingPeer access to the private members below.
00461   friend class HtmlTestingPeer;
00462   void AddEvent(HtmlEvent* event);
00463   void SetCurrent(HtmlNode* node);
        /// Stores the flag in coalesce_characters_.
00464   void set_coalesce_characters(bool x) { coalesce_characters_ = x; }
        /// Returns string_table_.string_bytes_allocated().
00465   size_t symbol_table_size() const {
00466     return string_table_.string_bytes_allocated();
00467   }
00468 
        /// Data members. Declaration order also fixes construction order, so do
        /// not reorder these without checking the constructor's initializers.
00469   FilterVector event_listeners_;
00470   SymbolTableSensitive string_table_;
00471   FilterVector filters_;
00472   HtmlLexer* lexer_;
00473   Arena<HtmlNode> nodes_;
00474   HtmlEventList queue_;
00475   HtmlEventListIterator current_;
00477   MessageHandler* message_handler_;
00478   GoogleString url_;
00479   GoogleUrl google_url_;
00480   GoogleString id_; ///< Passed as the file name by the *HereV() reporters.
00481   int line_number_;
00482   bool deleted_current_;
00483   bool determine_enabled_filters_called_;
00484   bool need_sanity_check_;
00485   bool coalesce_characters_;
00486   bool need_coalesce_characters_;
00487   bool url_valid_;
00488   bool log_rewrite_timing_; ///< Written by set_log_rewrite_timing().
00489   bool running_filters_;
00490   int64 parse_start_time_us_;
00491   Timer* timer_;
00492 
        /// NOTE(review): presumably a project macro that disables copying and
        /// assignment of HtmlParse -- confirm in basictypes.h.
00493   DISALLOW_COPY_AND_ASSIGN(HtmlParse);
00494 };
00495 
00496 }  
00497 
00498 #endif  ///< NET_INSTAWEB_HTMLPARSE_PUBLIC_HTML_PARSE_H_
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines