// Page Speed Optimization Libraries 1.2.24.1
00001 /* 00002 * Copyright 2010 Google Inc. 00003 * 00004 * Licensed under the Apache License, Version 2.0 (the "License"); 00005 * you may not use this file except in compliance with the License. 00006 * You may obtain a copy of the License at 00007 * 00008 * http:///www.apache.org/licenses/LICENSE-2.0 00009 * 00010 * Unless required by applicable law or agreed to in writing, software 00011 * distributed under the License is distributed on an "AS IS" BASIS, 00012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00013 * See the License for the specific language governing permissions and 00014 * limitations under the License. 00015 */ 00016 00018 00019 #ifndef NET_INSTAWEB_HTMLPARSE_PUBLIC_HTML_PARSE_H_ 00020 #define NET_INSTAWEB_HTMLPARSE_PUBLIC_HTML_PARSE_H_ 00021 00022 #include <cstdarg> 00023 #include <cstddef> 00024 #include <list> 00025 #include <set> 00026 #include <vector> 00027 00028 #include "net/instaweb/util/public/basictypes.h" 00029 #include "net/instaweb/htmlparse/public/html_element.h" 00030 #include "net/instaweb/htmlparse/public/html_name.h" 00031 #include "net/instaweb/htmlparse/public/html_node.h" 00032 #include "net/instaweb/http/public/content_type.h" 00033 #include "net/instaweb/util/public/arena.h" 00034 #include "net/instaweb/util/public/google_url.h" 00035 #include "net/instaweb/util/public/printf_format.h" 00036 #include "net/instaweb/util/public/string.h" 00037 #include "net/instaweb/util/public/string_util.h" 00038 #include "net/instaweb/util/public/symbol_table.h" 00039 00040 namespace net_instaweb { 00041 00042 class DocType; 00043 class HtmlEvent; 00044 class HtmlFilter; 00045 class HtmlLexer; 00046 class MessageHandler; 00047 class Timer; 00048 00049 typedef std::set <const HtmlEvent*> ConstHtmlEventSet; 00050 00054 class HtmlParse { 00055 public: 00056 explicit HtmlParse(MessageHandler* message_handler); 00057 virtual ~HtmlParse(); 00058 00060 00063 void AddFilter(HtmlFilter* filter); 00064 00070 bool 
StartParse(const StringPiece& url) { 00071 return StartParseWithType(url, kContentTypeHtml); 00072 } 00073 bool StartParseWithType(const StringPiece& url, 00074 const ContentType& content_type) { 00075 return StartParseId(url, url, content_type); 00076 } 00077 00079 bool is_url_valid() const { return url_valid_; } 00080 00085 virtual bool StartParseId(const StringPiece& url, const StringPiece& id, 00086 const ContentType& content_type); 00087 00097 void ParseText(const char* content, int size) { 00098 ParseTextInternal(content, size); 00099 } 00100 void ParseText(const StringPiece& sp) { 00101 ParseTextInternal(sp.data(), sp.size()); 00102 } 00103 00118 virtual void Flush(); 00119 00124 virtual void FinishParse(); 00125 00126 00128 00129 HtmlCdataNode* NewCdataNode(HtmlElement* parent, 00130 const StringPiece& contents); 00131 HtmlCharactersNode* NewCharactersNode(HtmlElement* parent, 00132 const StringPiece& literal); 00133 HtmlCommentNode* NewCommentNode(HtmlElement* parent, 00134 const StringPiece& contents); 00135 HtmlDirectiveNode* NewDirectiveNode(HtmlElement* parent, 00136 const StringPiece& contents); 00137 HtmlIEDirectiveNode* NewIEDirectiveNode(HtmlElement* parent, 00138 const StringPiece& contents); 00139 00143 00150 void InsertElementBeforeElement(const HtmlNode* existing_node, 00151 HtmlNode* new_node); 00152 void InsertElementAfterElement(const HtmlNode* existing_node, 00153 HtmlNode* new_node); 00154 00158 void PrependChild(const HtmlElement* existing_parent, HtmlNode* new_child); 00159 void AppendChild(const HtmlElement* existing_parent, HtmlNode* new_child); 00160 00163 void InsertElementBeforeCurrent(HtmlNode* new_node); 00164 00169 void InsertElementAfterCurrent(HtmlNode* new_node); 00170 00176 bool AddParentToSequence(HtmlNode* first, HtmlNode* last, 00177 HtmlElement* new_parent); 00178 00187 bool MoveCurrentInto(HtmlElement* new_parent); 00188 00194 bool MoveCurrentBefore(HtmlNode* existing_node); 00195 00199 bool DeleteElement(HtmlNode* 
node); 00200 00203 bool DeleteSavingChildren(HtmlElement* element); 00204 00217 bool HasChildrenInFlushWindow(HtmlElement* element); 00218 00221 bool ReplaceNode(HtmlNode* existing_node, HtmlNode* new_node); 00222 00225 HtmlElement* CloneElement(HtmlElement* in_element); 00226 00227 HtmlElement* NewElement(HtmlElement* parent, const StringPiece& str) { 00228 return NewElement(parent, MakeName(str)); 00229 } 00230 HtmlElement* NewElement(HtmlElement* parent, HtmlName::Keyword keyword) { 00231 return NewElement(parent, MakeName(keyword)); 00232 } 00233 HtmlElement* NewElement(HtmlElement* parent, const HtmlName& name); 00234 00235 void AddAttribute(HtmlElement* element, HtmlName::Keyword keyword, 00236 const StringPiece& value) { 00237 return element->AddAttribute(MakeName(keyword), value, 00238 HtmlElement::DOUBLE_QUOTE); 00239 } 00240 void AddEscapedAttribute(HtmlElement* element, HtmlName::Keyword keyword, 00241 const StringPiece& escaped_value) { 00242 return element->AddEscapedAttribute(MakeName(keyword), escaped_value, 00243 HtmlElement::DOUBLE_QUOTE); 00244 } 00245 void AddAttribute(HtmlElement* element, HtmlName::Keyword keyword, 00246 int value) { 00247 return AddAttribute(element, keyword, IntegerToString(value)); 00248 } 00249 void SetAttributeName(HtmlElement::Attribute* attribute, 00250 HtmlName::Keyword keyword) { 00251 attribute->set_name(MakeName(keyword)); 00252 } 00253 00254 HtmlName MakeName(const StringPiece& str); 00255 HtmlName MakeName(HtmlName::Keyword keyword); 00256 00257 bool IsRewritable(const HtmlNode* node) const; 00258 00259 void ClearElements(); 00260 00262 void DebugLogQueue(); 00263 00265 void DebugPrintQueue(); 00266 00268 friend class HtmlLexer; 00269 00272 bool IsImplicitlyClosedTag(HtmlName::Keyword keyword) const; 00273 00277 bool IsOptionallyClosedTag(HtmlName::Keyword keyword) const; 00278 00280 bool TagAllowsBriefTermination(HtmlName::Keyword keyword) const; 00281 00282 MessageHandler* message_handler() const { return 
message_handler_; } 00285 const char* url() const { return url_.c_str(); } 00287 const GoogleUrl& google_url() const { return google_url_; } 00288 const char* id() const { return id_.c_str(); } 00289 int line_number() const { return line_number_; } 00291 GoogleString UrlLine() const { 00292 return StringPrintf("%s:%d", id(), line_number()); 00293 } 00294 00297 const DocType& doctype() const; 00298 00300 void Info(const char* filename, int line, const char* msg, ...) 00301 INSTAWEB_PRINTF_FORMAT(4, 5); 00302 void Warning(const char* filename, int line, const char* msg, ...) 00303 INSTAWEB_PRINTF_FORMAT(4, 5); 00304 void Error(const char* filename, int line, const char* msg, ...) 00305 INSTAWEB_PRINTF_FORMAT(4, 5); 00306 void FatalError(const char* filename, int line, const char* msg, ...) 00307 INSTAWEB_PRINTF_FORMAT(4, 5); 00308 00309 void InfoV(const char* file, int line, const char *msg, va_list args); 00310 void WarningV(const char* file, int line, const char *msg, va_list args); 00311 void ErrorV(const char* file, int line, const char *msg, va_list args); 00312 void FatalErrorV(const char* file, int line, const char* msg, va_list args); 00313 00315 void InfoHere(const char* msg, ...) INSTAWEB_PRINTF_FORMAT(2, 3); 00316 void WarningHere(const char* msg, ...) INSTAWEB_PRINTF_FORMAT(2, 3); 00317 void ErrorHere(const char* msg, ...) INSTAWEB_PRINTF_FORMAT(2, 3); 00318 void FatalErrorHere(const char* msg, ...) 
INSTAWEB_PRINTF_FORMAT(2, 3); 00319 00322 void ShowProgress(const char* message); 00323 00324 void InfoHereV(const char *msg, va_list args) { 00325 InfoV(id_.c_str(), line_number_, msg, args); 00326 } 00327 void WarningHereV(const char *msg, va_list args) { 00328 WarningV(id_.c_str(), line_number_, msg, args); 00329 } 00330 void ErrorHereV(const char *msg, va_list args) { 00331 ErrorV(id_.c_str(), line_number_, msg, args); 00332 } 00333 void FatalErrorHereV(const char* msg, va_list args) { 00334 FatalErrorV(id_.c_str(), line_number_, msg, args); 00335 } 00336 00337 void AddElement(HtmlElement* element, int line_number); 00338 void CloseElement(HtmlElement* element, HtmlElement::CloseStyle close_style, 00339 int line_number); 00340 00342 void ApplyFilter(HtmlFilter* filter); 00343 00346 void set_timer(Timer* timer) { timer_ = timer; } 00347 void set_log_rewrite_timing(bool x) { log_rewrite_timing_ = x; } 00348 00351 void add_event_listener(HtmlFilter* listener); 00352 00356 void InsertComment(const StringPiece& sp); 00357 00359 void set_size_limit(int64 x); 00361 bool size_limit_exceeded() const; 00362 00363 protected: 00364 typedef std::vector<HtmlFilter*> FilterVector; 00365 typedef std::list<HtmlFilter*> FilterList; 00366 00370 void BeginFinishParse(); 00371 void EndFinishParse(); 00372 00374 size_t GetEventQueueSize(); 00375 00377 void AppendEventsToQueue(HtmlEventList* extra_events); 00378 00381 HtmlEvent* SplitQueueOnFirstEventInSet(const ConstHtmlEventSet& event_set, 00382 HtmlEventList* tail); 00383 00386 HtmlEvent* GetEndElementEvent(const HtmlElement* element); 00387 00388 virtual void ParseTextInternal(const char* content, int size); 00389 00391 void DetermineEnabledFilters(FilterVector* filters) const; 00392 00393 private: 00394 void ApplyFilterHelper(HtmlFilter* filter); 00395 HtmlEventListIterator Last(); 00396 bool IsInEventWindow(const HtmlEventListIterator& iter) const; 00397 void InsertElementBeforeEvent(const HtmlEventListIterator& event, 00398 
HtmlNode* new_node); 00399 void InsertElementAfterEvent(const HtmlEventListIterator& event, 00400 HtmlNode* new_node); 00401 bool MoveCurrentBeforeEvent(const HtmlEventListIterator& move_to); 00402 bool IsDescendantOf(const HtmlNode* possible_child, 00403 const HtmlNode* possible_parent); 00404 void SanityCheck(); 00405 void CheckEventParent(HtmlEvent* event, HtmlElement* expect, 00406 HtmlElement* actual); 00407 void CheckParentFromAddEvent(HtmlEvent* event); 00408 void FixParents(const HtmlEventListIterator& begin, 00409 const HtmlEventListIterator& end_inclusive, 00410 HtmlElement* new_parent); 00411 void CoalesceAdjacentCharactersNodes(); 00412 void ClearEvents(); 00413 void EmitQueue(MessageHandler* handler); 00414 00416 friend class HtmlTestingPeer; 00417 void AddEvent(HtmlEvent* event); 00418 void SetCurrent(HtmlNode* node); 00419 void set_coalesce_characters(bool x) { coalesce_characters_ = x; } 00420 size_t symbol_table_size() const { 00421 return string_table_.string_bytes_allocated(); 00422 } 00423 00424 FilterVector event_listeners_; 00425 SymbolTableSensitive string_table_; 00426 FilterVector filters_; 00427 HtmlLexer* lexer_; 00428 Arena<HtmlNode> nodes_; 00429 HtmlEventList queue_; 00430 HtmlEventListIterator current_; 00432 MessageHandler* message_handler_; 00433 GoogleString url_; 00434 GoogleUrl google_url_; 00435 GoogleString id_; 00436 int line_number_; 00437 bool deleted_current_; 00438 bool need_sanity_check_; 00439 bool coalesce_characters_; 00440 bool need_coalesce_characters_; 00441 bool url_valid_; 00442 bool log_rewrite_timing_; 00443 bool running_filters_; 00444 int64 parse_start_time_us_; 00445 Timer* timer_; 00446 int first_filter_; 00447 00448 DISALLOW_COPY_AND_ASSIGN(HtmlParse); 00449 }; 00450 00451 } 00452 00453 #endif ///< NET_INSTAWEB_HTMLPARSE_PUBLIC_HTML_PARSE_H_