19 #ifndef PAGESPEED_KERNEL_HTML_HTML_PARSE_H_
20 #define PAGESPEED_KERNEL_HTML_HTML_PARSE_H_
43 namespace net_instaweb {
// Set of pointers to const HtmlEvent objects.
52 typedef std::set <const HtmlEvent*> ConstHtmlEventSet;
107 bool StartParseWithType(
const StringPiece&
url,
119 virtual bool StartParseId(
const StringPiece&
url,
const StringPiece&
id,
137 ParseTextInternal(content, size);
140 ParseTextInternal(sp.data(), sp.size());
// Virtual, so subclasses can override it. The implementation is not visible
// in this excerpt.
157 virtual void Flush();
173 const StringPiece& contents);
// Node factories. Each takes the prospective parent element plus the node's
// text as a StringPiece and returns a pointer to a node of the named type.
// NOTE(review): lifetime/ownership of the returned node is not shown here
// (the class does hold an Arena<HtmlNode> nodes_ member) -- confirm before
// relying on it.
174 HtmlCharactersNode* NewCharactersNode(HtmlElement* parent,
175 const StringPiece& literal);
176 HtmlCommentNode* NewCommentNode(HtmlElement* parent,
177 const StringPiece& contents);
178 HtmlDirectiveNode* NewDirectiveNode(HtmlElement* parent,
179 const StringPiece& contents);
180 HtmlIEDirectiveNode* NewIEDirectiveNode(HtmlElement* parent,
181 const StringPiece& contents);
// Insert a script after / before the current position, built from `text`.
// NOTE(review): `external` presumably distinguishes an external script
// reference from inline script text -- confirm against SetupScript(), which
// takes the same (text, external) pair.
182 void InsertScriptAfterCurrent(StringPiece text,
bool external);
183 void InsertScriptBeforeCurrent(StringPiece text,
bool external);
// Takes a link, a text and a parent element, and returns an HtmlElement*.
// NOTE(review): presumably creates an anchor element for `link` with `text`
// and appends it under `parent` -- the implementation is not visible here.
187 HtmlElement*
AppendAnchor(StringPiece link, StringPiece text,
188 HtmlElement* parent);
// Inserts new_node relative to existing_node (after it, per the name).
// InsertElementAfterElement() forwards to this method.
201 void InsertNodeAfterNode(
const HtmlNode* existing_node, HtmlNode* new_node);
210 void InsertElementAfterElement(
const HtmlNode* existing_element,
212 InsertNodeAfterNode(existing_element, new_element);
// Add new_child under existing_parent.
// NOTE(review): by name, PrependChild adds it as the first child and
// AppendChild as the last -- implementation not visible in this excerpt.
218 void PrependChild(
const HtmlElement* existing_parent, HtmlNode* new_child);
219 void AppendChild(
const HtmlElement* existing_parent, HtmlNode* new_child);
237 HtmlElement* new_parent);
// Takes the node to replace and its replacement; returns a bool.
// NOTE(review): the bool presumably reports whether the replacement
// happened -- confirm in the implementation.
294 bool ReplaceNode(HtmlNode* existing_node, HtmlNode* new_node);
300 HtmlElement* NewElement(HtmlElement* parent,
const StringPiece& str) {
301 return NewElement(parent, MakeName(str));
304 return NewElement(parent, MakeName(keyword));
// Overload taking an already-built HtmlName. The StringPiece overload
// forwards here after converting its argument with MakeName().
306 HtmlElement* NewElement(HtmlElement* parent,
const HtmlName& name);
314 const StringPiece& value) {
316 HtmlElement::DOUBLE_QUOTE);
319 const StringPiece& value) {
321 HtmlElement::DOUBLE_QUOTE);
324 const StringPiece& escaped_value) {
325 return element->AddEscapedAttribute(MakeName(keyword), escaped_value,
326 HtmlElement::DOUBLE_QUOTE);
328 void SetAttributeName(HtmlElement::Attribute* attribute,
330 attribute->set_name(MakeName(keyword));
// Builds an HtmlName from a string. Used by the inline NewElement overload
// to convert its StringPiece argument.
333 HtmlName MakeName(
const StringPiece& str);
// Const query on a node; returns a bool.
// NOTE(review): the exact rewritability criteria are not visible here.
336 bool IsRewritable(
const HtmlNode* node)
const;
// Implementation not visible in this excerpt.
343 void ClearElements();
// Returns the MessageHandler pointer held in message_handler_.
380 MessageHandler* message_handler()
const {
return message_handler_; }
// Returns url_ as a C string. The pointer comes from url_.c_str(), so it is
// only valid while url_ is unchanged.
383 const char*
url()
const {
return url_.c_str(); }
// Returns id_ as a C string. The pointer comes from id_.c_str(), so it is
// only valid while id_ is unchanged.
386 const char* id()
const {
return id_.c_str(); }
// Returns the stored line_number_.
387 int line_number()
const {
return line_number_; }
390 return StringPrintf(
"%s:%d",
id(), line_number());
398 void Info(
const char* filename,
int line,
const char* msg, ...)
400 void Warning(const
char* filename,
int line, const
char* msg, ...)
// Printf-style error report for an explicit filename and line.
// NOTE(review): INSTAWEB_PRINTF_FORMAT(4, 5) presumably marks argument 4
// (msg, counting the implicit `this`) as the format string and 5 as the
// first variadic argument -- confirm against the macro definition.
402 void Error(const
char* filename,
int line, const
char* msg, ...)
403 INSTAWEB_PRINTF_FORMAT(4, 5);
// Printf-style fatal-error report for an explicit filename and line.
// NOTE(review): INSTAWEB_PRINTF_FORMAT(4, 5) presumably marks argument 4
// (msg, counting the implicit `this`) as the format string and 5 as the
// first variadic argument -- confirm against the macro definition.
404 void FatalError(const
char* filename,
int line, const
char* msg, ...)
405 INSTAWEB_PRINTF_FORMAT(4, 5);
// va_list counterparts of the reporting methods: each takes a file, a line,
// a format string and an already-started va_list. The *HereV() helpers call
// these with id_.c_str() and line_number_ as the file and line.
407 void InfoV(const
char* file,
int line, const
char *msg, va_list args);
408 void WarningV(const
char* file,
int line, const
char *msg, va_list args);
409 void ErrorV(const
char* file,
int line, const
char *msg, va_list args);
410 void FatalErrorV(const
char* file,
int line, const
char* msg, va_list args);
// Printf-style reports that use the current parsing filename and line number
// rather than an explicit file/line pair; one method per severity.
// NOTE(review): INSTAWEB_PRINTF_FORMAT(2, 3) presumably marks argument 2
// (msg, counting the implicit `this`) as the format string -- confirm.
413 void InfoHere(const
char* msg, ...) INSTAWEB_PRINTF_FORMAT(2, 3);
414 void WarningHere(const
char* msg, ...) INSTAWEB_PRINTF_FORMAT(2, 3);
415 void ErrorHere(const
char* msg, ...) INSTAWEB_PRINTF_FORMAT(2, 3);
416 void FatalErrorHere(const
char* msg, ...) INSTAWEB_PRINTF_FORMAT(2, 3);
422 void InfoHereV(const
char *msg, va_list args) {
423 InfoV(id_.c_str(), line_number_, msg, args);
425 void WarningHereV(
const char *msg, va_list args) {
426 WarningV(id_.c_str(), line_number_, msg, args);
428 void ErrorHereV(
const char *msg, va_list args) {
429 ErrorV(id_.c_str(), line_number_, msg, args);
431 void FatalErrorHereV(
const char* msg, va_list args) {
432 FatalErrorV(id_.c_str(), line_number_, msg, args);
// Takes an element together with a line number.
// NOTE(review): presumably records the element as starting at that source
// line -- implementation not visible in this excerpt.
435 void AddElement(HtmlElement* element,
int line_number);
// Returns the Timer pointer held in timer_.
445 Timer* timer()
const {
return timer_; }
// Stores x in log_rewrite_timing_.
446 void set_log_rewrite_timing(
bool x) { log_rewrite_timing_ = x; }
473 dynamically_disabled_filter_list_ = list;
512 return can_modify_urls_;
// Container aliases used by the data members of this class.
// Vector of filter pointers (type of event_listeners_).
516 typedef std::vector<HtmlFilter*> FilterVector;
// Linked list of filter pointers (parameter type of
// DetermineFilterListBehavior).
517 typedef std::list<HtmlFilter*> FilterList;
// A node paired with an event list.
518 typedef std::pair<HtmlNode*, HtmlEventList*> DeferredNode;
// Maps a node to an event list (type of deferred_nodes_).
519 typedef std::map<const HtmlNode*, HtmlEventList*> NodeToEventListMap;
// Maps a filter to a DeferredNode (type of open_deferred_nodes_).
520 typedef std::map<HtmlFilter*, DeferredNode> FilterElementMap;
// Set of node pointers (type of deferred_deleted_nodes_).
521 typedef std::set<const HtmlNode*> NodeSet;
// NOTE(review): by name, the closing step of FinishParse() -- implementation
// not visible in this excerpt.
527 void EndFinishParse();
// Virtual worker that receives a raw (pointer, size) buffer. The inline
// text-parsing entry points forward to it, passing either (content, size)
// or (sp.data(), sp.size()).
536 virtual void ParseTextInternal(
const char* content,
int size);
540 if (!determine_filter_behavior_called_) {
541 determine_filter_behavior_called_ =
true;
542 can_modify_urls_ =
false;
547 void DetermineFilterListBehavior(
const FilterList& list) {
548 for (FilterList::const_iterator i = list.begin(); i != list.end(); ++i) {
549 CheckFilterBehavior(*i);
// Per-filter check; DetermineFilterListBehavior() calls this once for each
// filter in the list it is given.
553 void CheckFilterBehavior(HtmlFilter* filter);
// Returns an HtmlEventListIterator.
// NOTE(review): presumably positioned at the last event in queue_ -- confirm.
595 HtmlEventListIterator Last();
// Const query on an event iterator; returns a bool.
// NOTE(review): the definition of the "event window" is not visible here.
596 bool IsInEventWindow(
const HtmlEventListIterator& iter)
const;
597 void InsertNodeBeforeEvent(
const HtmlEventListIterator& event,
599 void InsertNodeAfterEvent(
const HtmlEventListIterator& event,
// Takes a target event iterator; returns a bool.
// NOTE(review): presumably moves the current node ahead of `move_to` and
// reports success -- implementation not visible in this excerpt.
601 bool MoveCurrentBeforeEvent(
const HtmlEventListIterator& move_to);
602 bool IsDescendantOf(
const HtmlNode* possible_child,
// Takes a single event.
// NOTE(review): by name, a parent-pointer consistency check performed when
// an event is added -- implementation not visible in this excerpt.
607 void CheckParentFromAddEvent(
HtmlEvent* event);
608 void FixParents(
const HtmlEventListIterator& begin,
609 const HtmlEventListIterator& end_inclusive,
// Internal helpers; none of their implementations are visible in this
// excerpt, so the notes below cover signatures only.
// NOTE(review): presumably related to the coalesce_characters_ /
// need_coalesce_characters_ flags -- confirm.
611 void CoalesceAdjacentCharactersNodes();
614 inline void NextEvent();
615 void ClearDeferredNodes();
// Const bool queries on a node; by name, variants of IsRewritable() that
// each leave out one condition.
616 inline bool IsRewritableIgnoringDeferral(
const HtmlNode* node)
const;
617 inline bool IsRewritableIgnoringEnd(
const HtmlNode* node)
const;
// Takes the same (text, external) pair as InsertScriptAfterCurrent() and
// InsertScriptBeforeCurrent(), plus the script element to operate on.
618 void SetupScript(StringPiece text,
bool external,
HtmlElement* script);
// Stores x in coalesce_characters_.
624 void set_coalesce_characters(
bool x) { coalesce_characters_ = x; }
625 size_t symbol_table_size()
const {
// NOTE(review): presumably related to the delayed_start_literal_ member --
// implementation not visible in this excerpt.
634 void DelayLiteralTag();
// Data members.
// Vector of HtmlFilter pointers.
636 FilterVector event_listeners_;
637 SymbolTableSensitive string_table_;
// Arena of HtmlNode objects.
640 Arena<HtmlNode> nodes_;
// Event list and an iterator into an event list.
641 HtmlEventList queue_;
642 HtmlEventListIterator current_;
// Returned by message_handler().
644 MessageHandler* message_handler_;
646 GoogleUrl google_url_;
// Boolean state flags.
649 bool skip_increment_;
// Tested and set by the idempotence guard around filter-behavior
// determination.
650 bool determine_filter_behavior_called_;
// Reset to false by that same guard and returned by can_modify_urls().
651 bool can_modify_urls_;
652 bool determine_enabled_filters_called_;
653 bool need_sanity_check_;
// Written by set_coalesce_characters().
654 bool coalesce_characters_;
655 bool need_coalesce_characters_;
// Written by set_log_rewrite_timing().
657 bool log_rewrite_timing_;
658 bool running_filters_;
// NOTE(review): the _us suffix suggests microseconds -- confirm.
660 int64 parse_start_time_us_;
661 scoped_ptr<HtmlEvent> delayed_start_literal_;
663 HtmlFilter* current_filter_;
// Deferred-node bookkeeping: filter -> (node, event list), node -> event
// list, and a set of nodes.
670 FilterElementMap open_deferred_nodes_;
673 NodeToEventListMap deferred_nodes_;
679 NodeSet deferred_deleted_nodes_;
// Assigned by SetDynamicallyDisabledFilterList() from its `list` argument.
681 StringVector* dynamically_disabled_filter_list_;
void ApplyFilter(HtmlFilter *filter)
Run a filter on the current queue of parse nodes.
class GoogleUrl
Definition: google_url.h:58
GoogleString UrlLine() const
Returns URL (or id) and line number as a string, to be used in messages.
Definition: html_parse.h:389
Definition: html_filter.h:35
void AddFilter(HtmlFilter *filter)
Application methods for parsing functions and adding filters.
const ContentType & kContentTypeHtml
HTML-like (i.e. rewritable) text:
void set_size_limit(int64 x)
Sets the limit on the maximum number of bytes that should be parsed.
void InsertElementBeforeElement(const HtmlNode *existing_element, HtmlNode *new_element)
Definition: html_parse.h:205
bool DeleteNode(HtmlNode *node)
Definition: html_event.h:31
void add_event_listener(HtmlFilter *listener)
void set_timer(Timer *timer)
Definition: html_parse.h:444
const GoogleUrl & google_url() const
Gets a parsed GoogleUrl& corresponding to url().
Definition: html_parse.h:385
friend class HtmlLexer
Implementation helper with detailed knowledge of html parsing libraries.
Definition: html_parse.h:352
bool StartParse(const StringPiece &url)
Definition: html_parse.h:104
static bool IsLiteralTag(HtmlName::Keyword keyword)
void PrependChild(const HtmlElement *existing_parent, HtmlNode *new_child)
Definition: html_parse.h:88
void InfoHere(const char *msg,...) INSTAWEB_PRINTF_FORMAT(2, 3)
Report error message with current parsing filename and line number.
bool can_modify_urls()
Returns whether the filter pipeline can rewrite urls.
Definition: html_parse.h:511
Definition: html_element.h:42
void ShowProgress(const char *message)
bool AddParentToSequence(HtmlNode *first, HtmlNode *last, HtmlElement *new_parent)
void set_buffer_events(bool x)
Definition: html_parse.h:584
void RestoreDeferredNode(HtmlNode *deferred_node)
bool IsImplicitlyClosedTag(HtmlName::Keyword keyword) const
void DebugPrintQueue()
Print the HtmlEvent queue_ to stdout for debugging.
void InsertNodeAfterCurrent(HtmlNode *new_node)
bool MoveCurrentBefore(HtmlNode *existing_node)
const char * url() const
Definition: html_parse.h:383
bool TagAllowsBriefTermination(HtmlName::Keyword keyword) const
Determines whether a tag allows brief termination in HTML, e.g. <tag/>
void InsertNodeBeforeNode(const HtmlNode *existing_node, HtmlNode *new_node)
void DetermineFiltersBehavior()
Calls DetermineFiltersBehaviorImpl in an idempotent way.
Definition: html_parse.h:539
bool MakeElementInvisible(HtmlElement *element)
const DocType & doctype() const
std::string GoogleString
PAGESPEED_KERNEL_BASE_STRING_H_.
Definition: string.h:24
Style
Definition: html_element.h:50
void Info(const char *filename, int line, const char *msg,...) INSTAWEB_PRINTF_FORMAT(4, 5)
Interface for any caller to report an error message via the message handler.
bool ReplaceNode(HtmlNode *existing_node, HtmlNode *new_node)
size_t string_bytes_allocated() const
Definition: symbol_table.h:67
bool is_url_valid() const
Returns whether the google_url() URL is valid.
Definition: html_parse.h:113
HtmlElement * CloneElement(HtmlElement *in_element)
void SetDynamicallyDisabledFilterList(StringVector *list)
Definition: html_parse.h:472
void ParseText(const char *content, int size)
Definition: html_parse.h:136
Definition: html_node.h:43
virtual bool StartParseId(const StringPiece &url, const StringPiece &id, const ContentType &content_type)
bool IsOptionallyClosedTag(HtmlName::Keyword keyword) const
bool HasChildrenInFlushWindow(HtmlElement *element)
bool MoveCurrentInto(HtmlElement *new_parent)
void DisableFiltersInjectingScripts()
Definition: html_testing_peer.h:33
Definition: content_type.h:31
Keyword
Definition: html_name.h:39
bool DeleteSavingChildren(HtmlElement *element)
bool CanAppendChild(const HtmlNode *node) const
virtual void DetermineFiltersBehaviorImpl()
void AddAttribute(HtmlElement *element, HtmlName::Keyword keyword, const StringPiece &value)
Definition: html_parse.h:313
virtual void FinishParse()
size_t GetEventQueueSize()
Returns the number of events on the event queue.
Definition: message_handler.h:39
void SetUrlForTesting(const StringPiece &url)
void AddAttribute(const Attribute &attr)
bool InsertComment(StringPiece sp)
bool size_limit_exceeded() const
Returns whether we have exceeded the size limit.
Definition: html_lexer.h:45
Timer interface, made virtual so it can be mocked for tests.
Definition: timer.h:27
HtmlElement * AppendAnchor(StringPiece link, StringPiece text, HtmlElement *parent)
void DebugLogQueue()
Log the HtmlEvent queue_ to the message_handler_ for debugging.
HtmlCdataNode * NewCdataNode(HtmlElement *parent, const StringPiece &contents)
Utility methods for implementing filters.
static bool IsSometimesLiteralTag(HtmlName::Keyword keyword)
void InsertNodeBeforeCurrent(HtmlNode *new_node)