Page Speed Optimization Libraries
1.4.26.1
|
00001 /* 00002 * Copyright 2010 Google Inc. 00003 * 00004 * Licensed under the Apache License, Version 2.0 (the "License"); 00005 * you may not use this file except in compliance with the License. 00006 * You may obtain a copy of the License at 00007 * 00008 * http:///www.apache.org/licenses/LICENSE-2.0 00009 * 00010 * Unless required by applicable law or agreed to in writing, software 00011 * distributed under the License is distributed on an "AS IS" BASIS, 00012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00013 * See the License for the specific language governing permissions and 00014 * limitations under the License. 00015 */ 00016 00018 00019 #ifndef NET_INSTAWEB_HTMLPARSE_PUBLIC_HTML_PARSE_H_ 00020 #define NET_INSTAWEB_HTMLPARSE_PUBLIC_HTML_PARSE_H_ 00021 00022 #include <cstdarg> 00023 #include <cstddef> 00024 #include <list> 00025 #include <set> 00026 #include <vector> 00027 00028 #include "net/instaweb/util/public/basictypes.h" 00029 #include "net/instaweb/htmlparse/public/html_element.h" 00030 #include "net/instaweb/htmlparse/public/html_name.h" 00031 #include "net/instaweb/htmlparse/public/html_node.h" 00032 #include "net/instaweb/http/public/content_type.h" 00033 #include "net/instaweb/util/public/arena.h" 00034 #include "net/instaweb/util/public/google_url.h" 00035 #include "net/instaweb/util/public/printf_format.h" 00036 #include "net/instaweb/util/public/string.h" 00037 #include "net/instaweb/util/public/string_util.h" 00038 #include "net/instaweb/util/public/symbol_table.h" 00039 00040 namespace net_instaweb { 00041 00042 class DocType; 00043 class HtmlEvent; 00044 class HtmlFilter; 00045 class HtmlLexer; 00046 class MessageHandler; 00047 class Timer; 00048 00049 typedef std::set <const HtmlEvent*> ConstHtmlEventSet; 00050 00054 class HtmlParse { 00055 public: 00056 explicit HtmlParse(MessageHandler* message_handler); 00057 virtual ~HtmlParse(); 00058 00060 00063 void AddFilter(HtmlFilter* filter); 00064 00070 bool StartParse(const StringPiece& url) { 00071 return StartParseWithType(url, kContentTypeHtml); 00072 } 00073 bool StartParseWithType(const StringPiece& url, 00074 const ContentType& content_type) { 00075 return StartParseId(url, url, content_type); 00076 } 00077 00079 bool is_url_valid() const { return url_valid_; } 00080 00085 virtual bool StartParseId(const StringPiece& url, const StringPiece& id, 00086 const ContentType& content_type); 00087 00097 void ParseText(const char* content, int size) { 00098 ParseTextInternal(content, size); 00099 } 00100 void ParseText(const StringPiece& sp) { 00101 ParseTextInternal(sp.data(), sp.size()); 00102 } 00103 00118 virtual void Flush(); 00119 00124 virtual void FinishParse(); 00125 00126 00128 00129 HtmlCdataNode* NewCdataNode(HtmlElement* parent, 00130 const StringPiece& contents); 00131 HtmlCharactersNode* NewCharactersNode(HtmlElement* parent, 00132 const StringPiece& literal); 00133 HtmlCommentNode* NewCommentNode(HtmlElement* parent, 00134 const StringPiece& contents); 00135 HtmlDirectiveNode* NewDirectiveNode(HtmlElement* parent, 00136 const StringPiece& contents); 00137 HtmlIEDirectiveNode* NewIEDirectiveNode(HtmlElement* parent, 00138 const StringPiece& contents); 00139 00143 00150 void InsertElementBeforeElement(const HtmlNode* existing_node, 00151 HtmlNode* new_node); 00152 void InsertElementAfterElement(const HtmlNode* existing_node, 00153 HtmlNode* new_node); 00154 00158 void PrependChild(const HtmlElement* existing_parent, HtmlNode* new_child); 00159 void AppendChild(const HtmlElement* existing_parent, HtmlNode* new_child); 00160 00163 void InsertElementBeforeCurrent(HtmlNode* new_node); 00164 00169 void InsertElementAfterCurrent(HtmlNode* new_node); 00170 00176 bool AddParentToSequence(HtmlNode* first, HtmlNode* last, 00177 HtmlElement* new_parent); 00178 00187 bool MoveCurrentInto(HtmlElement* new_parent); 00188 00194 bool MoveCurrentBefore(HtmlNode* existing_node); 00195 00199 bool DeleteElement(HtmlNode* node); 00200 00203 bool DeleteSavingChildren(HtmlElement* element); 00204 00217 bool HasChildrenInFlushWindow(HtmlElement* element); 00218 00221 bool ReplaceNode(HtmlNode* existing_node, HtmlNode* new_node); 00222 00225 HtmlElement* CloneElement(HtmlElement* in_element); 00226 00227 HtmlElement* NewElement(HtmlElement* parent, const StringPiece& str) { 00228 return NewElement(parent, MakeName(str)); 00229 } 00230 HtmlElement* NewElement(HtmlElement* parent, HtmlName::Keyword keyword) { 00231 return NewElement(parent, MakeName(keyword)); 00232 } 00233 HtmlElement* NewElement(HtmlElement* parent, const HtmlName& name); 00234 00235 void AddAttribute(HtmlElement* element, HtmlName::Keyword keyword, 00236 const StringPiece& value) { 00237 return element->AddAttribute(MakeName(keyword), value, 00238 HtmlElement::DOUBLE_QUOTE); 00239 } 00240 void AddEscapedAttribute(HtmlElement* element, HtmlName::Keyword keyword, 00241 const StringPiece& escaped_value) { 00242 return element->AddEscapedAttribute(MakeName(keyword), escaped_value, 00243 HtmlElement::DOUBLE_QUOTE); 00244 } 00245 void AddAttribute(HtmlElement* element, HtmlName::Keyword keyword, 00246 int value) { 00247 return AddAttribute(element, keyword, IntegerToString(value)); 00248 } 00249 void SetAttributeName(HtmlElement::Attribute* attribute, 00250 HtmlName::Keyword keyword) { 00251 attribute->set_name(MakeName(keyword)); 00252 } 00253 00254 HtmlName MakeName(const StringPiece& str); 00255 HtmlName MakeName(HtmlName::Keyword keyword); 00256 00257 bool IsRewritable(const HtmlNode* node) const; 00258 00259 void ClearElements(); 00260 00262 void DebugLogQueue(); 00263 00265 void DebugPrintQueue(); 00266 00268 friend class HtmlLexer; 00269 00272 bool IsImplicitlyClosedTag(HtmlName::Keyword keyword) const; 00273 00277 static bool IsLiteralTag(HtmlName::Keyword keyword); 00278 00286 static bool IsSometimesLiteralTag(HtmlName::Keyword keyword); 00287 00291 bool IsOptionallyClosedTag(HtmlName::Keyword keyword) const; 00292 00294 bool TagAllowsBriefTermination(HtmlName::Keyword keyword) const; 00295 00296 MessageHandler* message_handler() const { return message_handler_; } 00299 const char* url() const { return url_.c_str(); } 00301 const GoogleUrl& google_url() const { return google_url_; } 00302 const char* id() const { return id_.c_str(); } 00303 int line_number() const { return line_number_; } 00305 GoogleString UrlLine() const { 00306 return StringPrintf("%s:%d", id(), line_number()); 00307 } 00308 00311 const DocType& doctype() const; 00312 00314 void Info(const char* filename, int line, const char* msg, ...) 00315 INSTAWEB_PRINTF_FORMAT(4, 5); 00316 void Warning(const char* filename, int line, const char* msg, ...) 00317 INSTAWEB_PRINTF_FORMAT(4, 5); 00318 void Error(const char* filename, int line, const char* msg, ...) 00319 INSTAWEB_PRINTF_FORMAT(4, 5); 00320 void FatalError(const char* filename, int line, const char* msg, ...) 00321 INSTAWEB_PRINTF_FORMAT(4, 5); 00322 00323 void InfoV(const char* file, int line, const char *msg, va_list args); 00324 void WarningV(const char* file, int line, const char *msg, va_list args); 00325 void ErrorV(const char* file, int line, const char *msg, va_list args); 00326 void FatalErrorV(const char* file, int line, const char* msg, va_list args); 00327 00329 void InfoHere(const char* msg, ...) INSTAWEB_PRINTF_FORMAT(2, 3); 00330 void WarningHere(const char* msg, ...) INSTAWEB_PRINTF_FORMAT(2, 3); 00331 void ErrorHere(const char* msg, ...) INSTAWEB_PRINTF_FORMAT(2, 3); 00332 void FatalErrorHere(const char* msg, ...) INSTAWEB_PRINTF_FORMAT(2, 3); 00333 00336 void ShowProgress(const char* message); 00337 00338 void InfoHereV(const char *msg, va_list args) { 00339 InfoV(id_.c_str(), line_number_, msg, args); 00340 } 00341 void WarningHereV(const char *msg, va_list args) { 00342 WarningV(id_.c_str(), line_number_, msg, args); 00343 } 00344 void ErrorHereV(const char *msg, va_list args) { 00345 ErrorV(id_.c_str(), line_number_, msg, args); 00346 } 00347 void FatalErrorHereV(const char* msg, va_list args) { 00348 FatalErrorV(id_.c_str(), line_number_, msg, args); 00349 } 00350 00351 void AddElement(HtmlElement* element, int line_number); 00352 void CloseElement(HtmlElement* element, HtmlElement::CloseStyle close_style, 00353 int line_number); 00354 00356 void ApplyFilter(HtmlFilter* filter); 00357 00360 void set_timer(Timer* timer) { timer_ = timer; } 00361 Timer* timer() const { return timer_; } 00362 void set_log_rewrite_timing(bool x) { log_rewrite_timing_ = x; } 00363 00366 void add_event_listener(HtmlFilter* listener); 00367 00371 void InsertComment(const StringPiece& sp); 00372 00374 void set_size_limit(int64 x); 00376 bool size_limit_exceeded() const; 00377 00378 protected: 00379 typedef std::vector<HtmlFilter*> FilterVector; 00380 typedef std::list<HtmlFilter*> FilterList; 00381 00385 void BeginFinishParse(); 00386 void EndFinishParse(); 00387 00389 size_t GetEventQueueSize(); 00390 00391 virtual void ParseTextInternal(const char* content, int size); 00392 00394 void DetermineEnabledFilters(FilterVector* filters) const; 00395 00396 private: 00397 void ApplyFilterHelper(HtmlFilter* filter); 00398 HtmlEventListIterator Last(); 00399 bool IsInEventWindow(const HtmlEventListIterator& iter) const; 00400 void InsertElementBeforeEvent(const HtmlEventListIterator& event, 00401 HtmlNode* new_node); 00402 void InsertElementAfterEvent(const HtmlEventListIterator& event, 00403 HtmlNode* new_node); 00404 bool MoveCurrentBeforeEvent(const HtmlEventListIterator& move_to); 00405 bool IsDescendantOf(const HtmlNode* possible_child, 00406 const HtmlNode* possible_parent); 00407 void SanityCheck(); 00408 void CheckEventParent(HtmlEvent* event, HtmlElement* expect, 00409 HtmlElement* actual); 00410 void CheckParentFromAddEvent(HtmlEvent* event); 00411 void FixParents(const HtmlEventListIterator& begin, 00412 const HtmlEventListIterator& end_inclusive, 00413 HtmlElement* new_parent); 00414 void CoalesceAdjacentCharactersNodes(); 00415 void ClearEvents(); 00416 void EmitQueue(MessageHandler* handler); 00417 00419 friend class HtmlTestingPeer; 00420 void AddEvent(HtmlEvent* event); 00421 void SetCurrent(HtmlNode* node); 00422 void set_coalesce_characters(bool x) { coalesce_characters_ = x; } 00423 size_t symbol_table_size() const { 00424 return string_table_.string_bytes_allocated(); 00425 } 00426 00427 FilterVector event_listeners_; 00428 SymbolTableSensitive string_table_; 00429 FilterVector filters_; 00430 HtmlLexer* lexer_; 00431 Arena<HtmlNode> nodes_; 00432 HtmlEventList queue_; 00433 HtmlEventListIterator current_; 00435 MessageHandler* message_handler_; 00436 GoogleString url_; 00437 GoogleUrl google_url_; 00438 GoogleString id_; 00439 int line_number_; 00440 bool deleted_current_; 00441 bool need_sanity_check_; 00442 bool coalesce_characters_; 00443 bool need_coalesce_characters_; 00444 bool url_valid_; 00445 bool log_rewrite_timing_; 00446 bool running_filters_; 00447 int64 parse_start_time_us_; 00448 Timer* timer_; 00449 int first_filter_; 00450 00451 DISALLOW_COPY_AND_ASSIGN(HtmlParse); 00452 }; 00453 00454 } 00455 00456 #endif ///< NET_INSTAWEB_HTMLPARSE_PUBLIC_HTML_PARSE_H_