00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00018
00019 #ifndef NET_INSTAWEB_HTMLPARSE_PUBLIC_HTML_PARSE_H_
00020 #define NET_INSTAWEB_HTMLPARSE_PUBLIC_HTML_PARSE_H_
00021
00022 #include <cstdarg>
00023 #include <cstddef>
00024 #include <list>
00025 #include <set>
00026 #include <vector>
00027
00028 #include "net/instaweb/util/public/basictypes.h"
00029 #include "net/instaweb/htmlparse/public/html_element.h"
00030 #include "net/instaweb/htmlparse/public/html_name.h"
00031 #include "net/instaweb/htmlparse/public/html_parser_types.h"
00032 #include "net/instaweb/http/public/content_type.h"
00033 #include "net/instaweb/util/public/arena.h"
00034 #include "net/instaweb/util/public/google_url.h"
00035 #include "net/instaweb/util/public/printf_format.h"
00036 #include "net/instaweb/util/public/string.h"
00037 #include "net/instaweb/util/public/string_util.h"
00038 #include "net/instaweb/util/public/symbol_table.h"
00039
00040 namespace net_instaweb {
00041
// Forward declarations.  Within this header these types are only used by
// pointer, by reference, in a friend declaration, or as a template argument,
// so they are declared here rather than pulled in through extra #includes.
00042 class DocType;
00043 class HtmlCdataNode;
00044 class HtmlCharactersNode;
00045 class HtmlCommentNode;
00046 class HtmlDirectiveNode;
00047 class HtmlEvent;
00048 class HtmlFilter;
00049 class HtmlIEDirectiveNode;
00050 class HtmlLexer;
00051 class HtmlNode;
00052 class MessageHandler;
00053 class Timer;
00054
// Set of (non-owning) pointers to const events.  It is the parameter type of
// HtmlParse::SplitQueueOnFirstEventInSet.
00055 typedef std::set <const HtmlEvent*> ConstHtmlEventSet;
00056
// HtmlParse is the context object for a streaming HTML parse.  Judging by its
// data members it holds the queue of parse events (queue_), an arena of nodes
// (nodes_), the filter chain (filters_), a pointer to the lexer (lexer_) and
// the current URL / id / line number used in diagnostics.  Its public API has
// three parts: driving a parse (StartParse* / ParseText / Flush /
// FinishParse), editing the node stream from inside filters (New*, Insert*,
// Move*, Delete*, Replace*), and reporting messages.
//
// NOTE(review): only the inline bodies are visible in this header.  Remarks
// about out-of-line methods are inferred from their signatures and are marked
// as assumptions; confirm them against the implementation file.
00060 class HtmlParse {
00061 public:
// |message_handler| is the sink returned by message_handler().
// NOTE(review): presumably not owned by HtmlParse -- confirm.
00062 explicit HtmlParse(MessageHandler* message_handler);
// Virtual because the class declares virtual methods and is meant to be
// subclassed (see the protected section).
00063 virtual ~HtmlParse();
00064
00066
// ---- Driving a parse --------------------------------------------------

// Registers |filter| with this parser.
// NOTE(review): presumably appended to filters_ without taking ownership --
// confirm.
00069 void AddFilter(HtmlFilter* filter);
00070
// Starts a parse of the document identified by |url|, treating it as
// kContentTypeHtml.  Forwards to StartParseWithType and returns its result.
00076 bool StartParse(const StringPiece& url) {
00077 return StartParseWithType(url, kContentTypeHtml);
00078 }
// Starts a parse with an explicit content type.  |url| is passed to
// StartParseId as both the URL and the id.
00079 bool StartParseWithType(const StringPiece& url,
00080 const ContentType& content_type) {
00081 return StartParseId(url, url, content_type);
00082 }
00083
// Returns the url_valid_ flag.
// NOTE(review): presumably set by StartParseId from the URL it was given.
00085 bool is_url_valid() const { return url_valid_; }
00086
// The entry point that StartParse and StartParseWithType funnel into.  |id|
// is a label distinct from |url|; id() is what UrlLine() and the *HereV
// helpers use in messages.  Virtual so subclasses can hook the start of a
// parse.
// NOTE(review): the bool result presumably reports whether |url| is valid --
// confirm.
00091 virtual bool StartParseId(const StringPiece& url, const StringPiece& id,
00092 const ContentType& content_type);
00093
// Feeds |size| bytes starting at |content| to the parser.
// NOTE(review): presumably queues events that are delivered on Flush().
00103 void ParseText(const char* content, int size);
// Convenience overload: forwards sp.data() and sp.size() to the overload
// above.  Note that sp.size() is implicitly converted to int at the call.
00104 void ParseText(const StringPiece& sp) {
00105 ParseText(sp.data(), sp.size());
00106 }
00107
// NOTE(review): presumably sends the currently queued events through the
// filters.  Virtual so subclasses can customize it.
00122 virtual void Flush();
00123
// NOTE(review): presumably ends the parsing session; see BeginFinishParse /
// EndFinishParse in the protected section.  Virtual so subclasses can
// customize it.
00128 virtual void FinishParse();
00129
00130
00132
// ---- Node factories ---------------------------------------------------
// Each creates a non-element node of the named kind for |parent| from the
// given text.
// NOTE(review): presumably allocated from nodes_ and not yet inserted into
// the event queue -- confirm.
00133 HtmlCdataNode* NewCdataNode(HtmlElement* parent,
00134 const StringPiece& contents);
00135 HtmlCharactersNode* NewCharactersNode(HtmlElement* parent,
00136 const StringPiece& literal);
00137 HtmlCommentNode* NewCommentNode(HtmlElement* parent,
00138 const StringPiece& contents);
00139 HtmlDirectiveNode* NewDirectiveNode(HtmlElement* parent,
00140 const StringPiece& contents);
00141 HtmlIEDirectiveNode* NewIEDirectiveNode(HtmlElement* parent,
00142 const StringPiece& contents);
00143
00147
// ---- Node-stream editing ----------------------------------------------
// NOTE(review): for all the bool-returning editors below, the result
// presumably reports whether the edit was carried out -- confirm, along with
// any preconditions on the arguments.

// Insert |new_node| immediately before / after |existing_node|.  Despite the
// "Element" in the names, both parameters are HtmlNode.
00154 void InsertElementBeforeElement(const HtmlNode* existing_node,
00155 HtmlNode* new_node);
00156 void InsertElementAfterElement(const HtmlNode* existing_node,
00157 HtmlNode* new_node);
00158
// Add |new_child| as the first / last child of |existing_parent|.
00162 void PrependChild(const HtmlElement* existing_parent, HtmlNode* new_child);
00163 void AppendChild(const HtmlElement* existing_parent, HtmlNode* new_child);
00164
// Insert |new_node| before the current position.
// NOTE(review): "current" presumably refers to current_.
00167 void InsertElementBeforeCurrent(HtmlNode* new_node);
00168
// Insert |new_node| after the current position.
// NOTE(review): check whether this also advances current_.
00173 void InsertElementAfterCurrent(HtmlNode* new_node);
00174
// NOTE(review): presumably wraps the sibling run |first|..|last| in
// |new_parent|.
00180 bool AddParentToSequence(HtmlNode* first, HtmlNode* last,
00181 HtmlElement* new_parent);
00182
// NOTE(review): presumably moves the current node under |new_parent|.
00191 bool MoveCurrentInto(HtmlElement* new_parent);
00192
// NOTE(review): presumably moves the current node to just before
// |existing_node|.
00198 bool MoveCurrentBefore(HtmlNode* existing_node);
00199
// NOTE(review): presumably deletes |node| (the parameter is an HtmlNode, not
// only an HtmlElement).
00203 bool DeleteElement(HtmlNode* node);
00204
// NOTE(review): presumably deletes |element| but keeps its children.
00207 bool DeleteSavingChildren(HtmlElement* element);
00208
// NOTE(review): presumably reports whether |element| has children among the
// events not yet flushed.
00221 bool HasChildrenInFlushWindow(HtmlElement* element);
00222
// NOTE(review): presumably substitutes |new_node| for |existing_node|.
00225 bool ReplaceNode(HtmlNode* existing_node, HtmlNode* new_node);
00226
// NOTE(review): presumably returns a new element copied from |in_element|;
// confirm whether children are copied and whether it is inserted.
00229 HtmlElement* CloneElement(HtmlElement* in_element);
00230
// Create an element for |parent|.  The first two overloads convert their name
// argument with MakeName and forward to the HtmlName overload.
00231 HtmlElement* NewElement(HtmlElement* parent, const StringPiece& str) {
00232 return NewElement(parent, MakeName(str));
00233 }
00234 HtmlElement* NewElement(HtmlElement* parent, HtmlName::Keyword keyword) {
00235 return NewElement(parent, MakeName(keyword));
00236 }
00237 HtmlElement* NewElement(HtmlElement* parent, const HtmlName& name);
00238
// Adds attribute |keyword| = |value| to |element|.  Forwards to
// element->AddAttribute with HtmlElement::DOUBLE_QUOTE.  The "return" of a
// void expression is legal C++ and has no effect beyond the call.
00239 void AddAttribute(HtmlElement* element, HtmlName::Keyword keyword,
00240 const StringPiece& value) {
00241 return element->AddAttribute(MakeName(keyword), value,
00242 HtmlElement::DOUBLE_QUOTE);
00243 }
// Same as above, but forwards to element->AddEscapedAttribute.
// NOTE(review): |escaped_value| is presumably already HTML-escaped.
00244 void AddEscapedAttribute(HtmlElement* element, HtmlName::Keyword keyword,
00245 const StringPiece& escaped_value) {
00246 return element->AddEscapedAttribute(MakeName(keyword), escaped_value,
00247 HtmlElement::DOUBLE_QUOTE);
00248 }
// Integer convenience overload: converts |value| with IntegerToString and
// forwards to AddAttribute.
00249 void AddAttribute(HtmlElement* element, HtmlName::Keyword keyword,
00250 int value) {
00251 return AddAttribute(element, keyword, IntegerToString(value));
00252 }
// Renames |attribute| by calling set_name with MakeName(keyword).
00253 void SetAttributeName(HtmlElement::Attribute* attribute,
00254 HtmlName::Keyword keyword) {
00255 attribute->set_name(MakeName(keyword));
00256 }
00257
// Build an HtmlName from a string or from a keyword.
// NOTE(review): the string form presumably interns into string_table_.
00258 HtmlName MakeName(const StringPiece& str);
00259 HtmlName MakeName(HtmlName::Keyword keyword);
00260
// NOTE(review): presumably reports whether |node| can still be modified.
00261 bool IsRewritable(const HtmlNode* node) const;
00262
// NOTE(review): presumably releases the nodes held in nodes_.
00263 void ClearElements();
00264
// Debugging aid.
// NOTE(review): presumably logs the queue through message_handler_.
00266 void DebugLogQueue();
00267
// Debugging aid.
// NOTE(review): presumably prints the queue to standard output.
00269 void DebugPrintQueue();
00270
// Grants HtmlLexer access to private members.  A friend declaration is not
// affected by the access section it appears in.
00272 friend class HtmlLexer;
00273
// ---- Tag classification -----------------------------------------------
// NOTE(review): semantics inferred from the names; confirm the exact tag
// lists in the implementation.

// Presumably true for tags that never take a close tag.
00276 bool IsImplicitlyClosedTag(HtmlName::Keyword keyword) const;
00277
// Presumably true for tags whose close tag may be omitted.
00281 bool IsOptionallyClosedTag(HtmlName::Keyword keyword) const;
00282
// Presumably true for tags that may be written in the brief <tag/> form.
00284 bool TagAllowsBriefTermination(HtmlName::Keyword keyword) const;
00285
// ---- Accessors --------------------------------------------------------
00286 MessageHandler* message_handler() const { return message_handler_; }
// Returns url_.c_str(); the pointer is only valid until url_ is next
// modified.
00289 const char* url() const { return url_.c_str(); }
// Returns google_url_.
// NOTE(review): presumably the parsed form of url().
00291 const GoogleUrl& google_url() const { return google_url_; }
// Returns id_.c_str(); the same lifetime caveat as url() applies.
00292 const char* id() const { return id_.c_str(); }
00293 int line_number() const { return line_number_; }
// Formats id() and line_number() as "<id>:<line>" for use in messages.
00295 GoogleString UrlLine() const {
00296 return StringPrintf("%s:%d", id(), line_number());
00297 }
00298
// NOTE(review): presumably the document type assumed for the document being
// parsed.
00301 const DocType& doctype() const;
00302
// ---- Message reporting ------------------------------------------------
// printf-style reporting at four severities with an explicit file and line.
// NOTE(review): the (4, 5) format indices presumably count the implicit
// "this" argument, making |msg| the 4th parameter -- confirm against
// printf_format.h.
00304 void Info(const char* filename, int line, const char* msg, ...)
00305 INSTAWEB_PRINTF_FORMAT(4, 5);
00306 void Warning(const char* filename, int line, const char* msg, ...)
00307 INSTAWEB_PRINTF_FORMAT(4, 5);
00308 void Error(const char* filename, int line, const char* msg, ...)
00309 INSTAWEB_PRINTF_FORMAT(4, 5);
00310 void FatalError(const char* filename, int line, const char* msg, ...)
00311 INSTAWEB_PRINTF_FORMAT(4, 5);
00312
// va_list forms of the four reporters above.
00313 void InfoV(const char* file, int line, const char *msg, va_list args);
00314 void WarningV(const char* file, int line, const char *msg, va_list args);
00315 void ErrorV(const char* file, int line, const char *msg, va_list args);
00316 void FatalErrorV(const char* file, int line, const char* msg, va_list args);
00317
// printf-style reporting with no explicit location.
// NOTE(review): presumably reports at the current id / line number, like the
// *HereV forms below.
00319 void InfoHere(const char* msg, ...) INSTAWEB_PRINTF_FORMAT(2, 3);
00320 void WarningHere(const char* msg, ...) INSTAWEB_PRINTF_FORMAT(2, 3);
00321 void ErrorHere(const char* msg, ...) INSTAWEB_PRINTF_FORMAT(2, 3);
00322 void FatalErrorHere(const char* msg, ...) INSTAWEB_PRINTF_FORMAT(2, 3);
00323
// NOTE(review): presumably logs |message| with timing information when
// log_rewrite_timing_ is enabled -- confirm.
00326 void ShowProgress(const char* message);
00327
// va_list forms of the *Here reporters.  Each passes id_.c_str() as the file
// and line_number_ as the line to the matching *V method.
00328 void InfoHereV(const char *msg, va_list args) {
00329 InfoV(id_.c_str(), line_number_, msg, args);
00330 }
00331 void WarningHereV(const char *msg, va_list args) {
00332 WarningV(id_.c_str(), line_number_, msg, args);
00333 }
00334 void ErrorHereV(const char *msg, va_list args) {
00335 ErrorV(id_.c_str(), line_number_, msg, args);
00336 }
00337 void FatalErrorHereV(const char* msg, va_list args) {
00338 FatalErrorV(id_.c_str(), line_number_, msg, args);
00339 }
00340
// NOTE(review): presumably called while lexing to record that |element| was
// opened / closed at |line_number| -- confirm who calls these.
00341 void AddElement(HtmlElement* element, int line_number);
00342 void CloseElement(HtmlElement* element, HtmlElement::CloseStyle close_style,
00343 int line_number);
00344
// NOTE(review): presumably runs |filter| over the currently queued events.
00346 void ApplyFilter(HtmlFilter* filter);
00347
// Stores |timer| in timer_.
// NOTE(review): presumably not owned, and used together with
// log_rewrite_timing_.
00350 void set_timer(Timer* timer) { timer_ = timer; }
// Stores |x| in log_rewrite_timing_.
00351 void set_log_rewrite_timing(bool x) { log_rewrite_timing_ = x; }
00352
// Registers |listener| as an event listener.
// NOTE(review): presumably stored in event_listeners_; confirm whether
// ownership is transferred.
00355 void add_event_listener(HtmlFilter* listener);
00356
00357 protected:
00358 typedef std::vector<HtmlFilter*> FilterVector;
00359 typedef std::list<HtmlFilter*> FilterList;
00360
// NOTE(review): presumably the two halves of FinishParse(), exposed so that
// subclasses can run their own work between them.
00364 void BeginFinishParse();
00365 void EndFinishParse();
00366
// NOTE(review): presumably the number of events currently in queue_.
00368 size_t GetEventQueueSize();
00369
// NOTE(review): presumably moves or copies |extra_events| onto the end of
// queue_.
00371 void AppendEventsToQueue(HtmlEventList* extra_events);
00372
// NOTE(review): presumably splits queue_ at the first event contained in
// |event_set|, places the split-off part in |tail| and returns that event;
// confirm the result when no event matches.
00375 HtmlEvent* SplitQueueOnFirstEventInSet(const ConstHtmlEventSet& event_set,
00376 HtmlEventList* tail);
00377
// NOTE(review): presumably returns the end-element event for |element|;
// confirm the result when there is none yet.
00380 HtmlEvent* GetEndElementEvent(const HtmlElement* element);
00381
00382 private:
// Internal helpers; their contracts live in the implementation file.
00383 void ApplyFilterHelper(HtmlFilter* filter);
00384 HtmlEventListIterator Last();
00385 bool IsInEventWindow(const HtmlEventListIterator& iter) const;
00386 void InsertElementBeforeEvent(const HtmlEventListIterator& event,
00387 HtmlNode* new_node);
00388 void InsertElementAfterEvent(const HtmlEventListIterator& event,
00389 HtmlNode* new_node);
00390 bool MoveCurrentBeforeEvent(const HtmlEventListIterator& move_to);
00391 bool IsDescendantOf(const HtmlNode* possible_child,
00392 const HtmlNode* possible_parent);
00393 void SanityCheck();
00394 void CheckEventParent(HtmlEvent* event, HtmlElement* expect,
00395 HtmlElement* actual);
00396 void CheckParentFromAddEvent(HtmlEvent* event);
00397 void FixParents(const HtmlEventListIterator& begin,
00398 const HtmlEventListIterator& end_inclusive,
00399 HtmlElement* new_parent);
00400 void CoalesceAdjacentCharactersNodes();
00401 void ClearEvents();
00402 void EmitQueue(MessageHandler* handler);
00403
00404
// Grants HtmlTestingPeer access to the private members.
// NOTE(review): going by its name this is a test-only peer class.
00406 friend class HtmlTestingPeer;
00407 void AddEvent(HtmlEvent* event);
00408 void SetCurrent(HtmlNode* node);
// Stores |x| in coalesce_characters_.
00409 void set_coalesce_characters(bool x) { coalesce_characters_ = x; }
// Returns string_table_.string_bytes_allocated().
00410 size_t symbol_table_size() const {
00411 return string_table_.string_bytes_allocated();
00412 }
00413
// ---- Data members -----------------------------------------------------
// Members are constructed in declaration order, so keep this order in step
// with the constructor's initializer list.
// NOTE(review): the roles of members that no inline method touches
// (lexer_, deleted_current_, need_sanity_check_, need_coalesce_characters_,
// running_filters_, parse_start_time_us_, first_filter_) are not visible
// here; see the implementation file.
00414 FilterVector event_listeners_;
00415 SymbolTableSensitive string_table_;
00416 FilterVector filters_;
00417 HtmlLexer* lexer_;
00418 Arena<HtmlNode> nodes_;
00419 HtmlEventList queue_;
00420 HtmlEventListIterator current_;
00422 MessageHandler* message_handler_;
00423 GoogleString url_;
00424 GoogleUrl google_url_;
00425 GoogleString id_;
00426 int line_number_;
00427 bool deleted_current_;
00428 bool need_sanity_check_;
00429 bool coalesce_characters_;
00430 bool need_coalesce_characters_;
00431 bool url_valid_;
00432 bool log_rewrite_timing_;
00433 bool running_filters_;
00434 int64 parse_start_time_us_;
00435 Timer* timer_;
00436 int first_filter_;
00437
// NOTE(review): this macro (presumably from basictypes.h) is expected to make
// HtmlParse non-copyable and non-assignable.
00438 DISALLOW_COPY_AND_ASSIGN(HtmlParse);
00439 };
00440
00441 }
00442
00443 #endif  // NET_INSTAWEB_HTMLPARSE_PUBLIC_HTML_PARSE_H_