Page Speed Optimization Libraries
1.5.27.2
|
00001 /* 00002 * Copyright 2010 Google Inc. 00003 * 00004 * Licensed under the Apache License, Version 2.0 (the "License"); 00005 * you may not use this file except in compliance with the License. 00006 * You may obtain a copy of the License at 00007 * 00008 * http:///www.apache.org/licenses/LICENSE-2.0 00009 * 00010 * Unless required by applicable law or agreed to in writing, software 00011 * distributed under the License is distributed on an "AS IS" BASIS, 00012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00013 * See the License for the specific language governing permissions and 00014 * limitations under the License. 00015 */ 00016 00018 00019 #ifndef NET_INSTAWEB_HTMLPARSE_PUBLIC_HTML_PARSE_H_ 00020 #define NET_INSTAWEB_HTMLPARSE_PUBLIC_HTML_PARSE_H_ 00021 00022 #include <cstdarg> 00023 #include <cstddef> 00024 #include <list> 00025 #include <set> 00026 #include <vector> 00027 00028 #include "net/instaweb/util/public/basictypes.h" 00029 #include "net/instaweb/htmlparse/public/html_element.h" 00030 #include "net/instaweb/htmlparse/public/html_name.h" 00031 #include "net/instaweb/htmlparse/public/html_node.h" 00032 #include "net/instaweb/http/public/content_type.h" 00033 #include "net/instaweb/util/public/arena.h" 00034 #include "net/instaweb/util/public/google_url.h" 00035 #include "net/instaweb/util/public/printf_format.h" 00036 #include "net/instaweb/util/public/string.h" 00037 #include "net/instaweb/util/public/string_util.h" 00038 #include "net/instaweb/util/public/symbol_table.h" 00039 00040 namespace net_instaweb { 00041 00042 class DocType; 00043 class HtmlEvent; 00044 class HtmlFilter; 00045 class HtmlLexer; 00046 class MessageHandler; 00047 class Timer; 00048 00049 typedef std::set <const HtmlEvent*> ConstHtmlEventSet; 00050 00085 class HtmlParse { 00086 public: 00087 explicit HtmlParse(MessageHandler* message_handler); 00088 virtual ~HtmlParse(); 00089 00091 00094 void AddFilter(HtmlFilter* filter); 00095 00101 bool StartParse(const StringPiece& url) { 00102 return StartParseWithType(url, kContentTypeHtml); 00103 } 00104 bool StartParseWithType(const StringPiece& url, 00105 const ContentType& content_type) { 00106 return StartParseId(url, url, content_type); 00107 } 00108 00110 bool is_url_valid() const { return url_valid_; } 00111 00116 virtual bool StartParseId(const StringPiece& url, const StringPiece& id, 00117 const ContentType& content_type); 00118 00128 void ParseText(const char* content, int size) { 00129 ParseTextInternal(content, size); 00130 } 00131 void ParseText(const StringPiece& sp) { 00132 ParseTextInternal(sp.data(), sp.size()); 00133 } 00134 00149 virtual void Flush(); 00150 00155 virtual void FinishParse(); 00156 00157 00159 00160 HtmlCdataNode* NewCdataNode(HtmlElement* parent, 00161 const StringPiece& contents); 00162 HtmlCharactersNode* NewCharactersNode(HtmlElement* parent, 00163 const StringPiece& literal); 00164 HtmlCommentNode* NewCommentNode(HtmlElement* parent, 00165 const StringPiece& contents); 00166 HtmlDirectiveNode* NewDirectiveNode(HtmlElement* parent, 00167 const StringPiece& contents); 00168 HtmlIEDirectiveNode* NewIEDirectiveNode(HtmlElement* parent, 00169 const StringPiece& contents); 00170 00174 00181 void InsertElementBeforeElement(const HtmlNode* existing_node, 00182 HtmlNode* new_node); 00183 void InsertElementAfterElement(const HtmlNode* existing_node, 00184 HtmlNode* new_node); 00185 00189 void PrependChild(const HtmlElement* existing_parent, HtmlNode* new_child); 00190 void AppendChild(const HtmlElement* existing_parent, HtmlNode* new_child); 00191 00194 void InsertElementBeforeCurrent(HtmlNode* new_node); 00195 00200 void InsertElementAfterCurrent(HtmlNode* new_node); 00201 00207 bool AddParentToSequence(HtmlNode* first, HtmlNode* last, 00208 HtmlElement* new_parent); 00209 00218 bool MoveCurrentInto(HtmlElement* new_parent); 00219 00225 bool MoveCurrentBefore(HtmlNode* existing_node); 00226 00230 bool DeleteElement(HtmlNode* node); 00231 00234 bool DeleteSavingChildren(HtmlElement* element); 00235 00248 bool HasChildrenInFlushWindow(HtmlElement* element); 00249 00252 bool ReplaceNode(HtmlNode* existing_node, HtmlNode* new_node); 00253 00256 HtmlElement* CloneElement(HtmlElement* in_element); 00257 00258 HtmlElement* NewElement(HtmlElement* parent, const StringPiece& str) { 00259 return NewElement(parent, MakeName(str)); 00260 } 00261 HtmlElement* NewElement(HtmlElement* parent, HtmlName::Keyword keyword) { 00262 return NewElement(parent, MakeName(keyword)); 00263 } 00264 HtmlElement* NewElement(HtmlElement* parent, const HtmlName& name); 00265 00266 void AddAttribute(HtmlElement* element, HtmlName::Keyword keyword, 00267 const StringPiece& value) { 00268 return element->AddAttribute(MakeName(keyword), value, 00269 HtmlElement::DOUBLE_QUOTE); 00270 } 00271 void AddEscapedAttribute(HtmlElement* element, HtmlName::Keyword keyword, 00272 const StringPiece& escaped_value) { 00273 return element->AddEscapedAttribute(MakeName(keyword), escaped_value, 00274 HtmlElement::DOUBLE_QUOTE); 00275 } 00276 void AddAttribute(HtmlElement* element, HtmlName::Keyword keyword, 00277 int value) { 00278 return AddAttribute(element, keyword, IntegerToString(value)); 00279 } 00280 void SetAttributeName(HtmlElement::Attribute* attribute, 00281 HtmlName::Keyword keyword) { 00282 attribute->set_name(MakeName(keyword)); 00283 } 00284 00285 HtmlName MakeName(const StringPiece& str); 00286 HtmlName MakeName(HtmlName::Keyword keyword); 00287 00288 bool IsRewritable(const HtmlNode* node) const; 00289 00290 void ClearElements(); 00291 00293 void DebugLogQueue(); 00294 00296 void DebugPrintQueue(); 00297 00299 friend class HtmlLexer; 00300 00303 bool IsImplicitlyClosedTag(HtmlName::Keyword keyword) const; 00304 00308 static bool IsLiteralTag(HtmlName::Keyword keyword); 00309 00317 static bool IsSometimesLiteralTag(HtmlName::Keyword keyword); 00318 00322 bool IsOptionallyClosedTag(HtmlName::Keyword keyword) const; 00323 00325 bool TagAllowsBriefTermination(HtmlName::Keyword keyword) const; 00326 00327 MessageHandler* message_handler() const { return message_handler_; } 00330 const char* url() const { return url_.c_str(); } 00332 const GoogleUrl& google_url() const { return google_url_; } 00333 const char* id() const { return id_.c_str(); } 00334 int line_number() const { return line_number_; } 00336 GoogleString UrlLine() const { 00337 return StringPrintf("%s:%d", id(), line_number()); 00338 } 00339 00342 const DocType& doctype() const; 00343 00345 void Info(const char* filename, int line, const char* msg, ...) 00346 INSTAWEB_PRINTF_FORMAT(4, 5); 00347 void Warning(const char* filename, int line, const char* msg, ...) 00348 INSTAWEB_PRINTF_FORMAT(4, 5); 00349 void Error(const char* filename, int line, const char* msg, ...) 00350 INSTAWEB_PRINTF_FORMAT(4, 5); 00351 void FatalError(const char* filename, int line, const char* msg, ...) 00352 INSTAWEB_PRINTF_FORMAT(4, 5); 00353 00354 void InfoV(const char* file, int line, const char *msg, va_list args); 00355 void WarningV(const char* file, int line, const char *msg, va_list args); 00356 void ErrorV(const char* file, int line, const char *msg, va_list args); 00357 void FatalErrorV(const char* file, int line, const char* msg, va_list args); 00358 00360 void InfoHere(const char* msg, ...) INSTAWEB_PRINTF_FORMAT(2, 3); 00361 void WarningHere(const char* msg, ...) INSTAWEB_PRINTF_FORMAT(2, 3); 00362 void ErrorHere(const char* msg, ...) INSTAWEB_PRINTF_FORMAT(2, 3); 00363 void FatalErrorHere(const char* msg, ...) INSTAWEB_PRINTF_FORMAT(2, 3); 00364 00367 void ShowProgress(const char* message); 00368 00369 void InfoHereV(const char *msg, va_list args) { 00370 InfoV(id_.c_str(), line_number_, msg, args); 00371 } 00372 void WarningHereV(const char *msg, va_list args) { 00373 WarningV(id_.c_str(), line_number_, msg, args); 00374 } 00375 void ErrorHereV(const char *msg, va_list args) { 00376 ErrorV(id_.c_str(), line_number_, msg, args); 00377 } 00378 void FatalErrorHereV(const char* msg, va_list args) { 00379 FatalErrorV(id_.c_str(), line_number_, msg, args); 00380 } 00381 00382 void AddElement(HtmlElement* element, int line_number); 00383 void CloseElement(HtmlElement* element, HtmlElement::CloseStyle close_style, 00384 int line_number); 00385 00387 void ApplyFilter(HtmlFilter* filter); 00388 00391 void set_timer(Timer* timer) { timer_ = timer; } 00392 Timer* timer() const { return timer_; } 00393 void set_log_rewrite_timing(bool x) { log_rewrite_timing_ = x; } 00394 00397 void add_event_listener(HtmlFilter* listener); 00398 00408 bool InsertComment(StringPiece sp); 00409 00411 void set_size_limit(int64 x); 00413 bool size_limit_exceeded() const; 00414 00415 protected: 00416 typedef std::vector<HtmlFilter*> FilterVector; 00417 typedef std::list<HtmlFilter*> FilterList; 00418 00422 void BeginFinishParse(); 00423 void EndFinishParse(); 00424 00426 size_t GetEventQueueSize(); 00427 00428 virtual void ParseTextInternal(const char* content, int size); 00429 00436 virtual void DetermineEnabledFilters(); 00437 00438 private: 00439 void ApplyFilterHelper(HtmlFilter* filter); 00440 HtmlEventListIterator Last(); 00441 bool IsInEventWindow(const HtmlEventListIterator& iter) const; 00442 void InsertElementBeforeEvent(const HtmlEventListIterator& event, 00443 HtmlNode* new_node); 00444 void InsertElementAfterEvent(const HtmlEventListIterator& event, 00445 HtmlNode* new_node); 00446 bool MoveCurrentBeforeEvent(const HtmlEventListIterator& move_to); 00447 bool IsDescendantOf(const HtmlNode* possible_child, 00448 const HtmlNode* possible_parent); 00449 void SanityCheck(); 00450 void CheckEventParent(HtmlEvent* event, HtmlElement* expect, 00451 HtmlElement* actual); 00452 void CheckParentFromAddEvent(HtmlEvent* event); 00453 void FixParents(const HtmlEventListIterator& begin, 00454 const HtmlEventListIterator& end_inclusive, 00455 HtmlElement* new_parent); 00456 void CoalesceAdjacentCharactersNodes(); 00457 void ClearEvents(); 00458 void EmitQueue(MessageHandler* handler); 00459 00461 friend class HtmlTestingPeer; 00462 void AddEvent(HtmlEvent* event); 00463 void SetCurrent(HtmlNode* node); 00464 void set_coalesce_characters(bool x) { coalesce_characters_ = x; } 00465 size_t symbol_table_size() const { 00466 return string_table_.string_bytes_allocated(); 00467 } 00468 00469 FilterVector event_listeners_; 00470 SymbolTableSensitive string_table_; 00471 FilterVector filters_; 00472 HtmlLexer* lexer_; 00473 Arena<HtmlNode> nodes_; 00474 HtmlEventList queue_; 00475 HtmlEventListIterator current_; 00477 MessageHandler* message_handler_; 00478 GoogleString url_; 00479 GoogleUrl google_url_; 00480 GoogleString id_; 00481 int line_number_; 00482 bool deleted_current_; 00483 bool determine_enabled_filters_called_; 00484 bool need_sanity_check_; 00485 bool coalesce_characters_; 00486 bool need_coalesce_characters_; 00487 bool url_valid_; 00488 bool log_rewrite_timing_; 00489 bool running_filters_; 00490 int64 parse_start_time_us_; 00491 Timer* timer_; 00492 00493 DISALLOW_COPY_AND_ASSIGN(HtmlParse); 00494 }; 00495 00496 } 00497 00498 #endif ///< NET_INSTAWEB_HTMLPARSE_PUBLIC_HTML_PARSE_H_