Page Speed Optimization Libraries
1.5.27.2
|
#include "html_parse.h"
Public Member Functions | |
HtmlParse (MessageHandler *message_handler) | |
void | AddFilter (HtmlFilter *filter) |
Application methods for parsing functions and adding filters. | |
bool | StartParse (const StringPiece &url) |
bool | StartParseWithType (const StringPiece &url, const ContentType &content_type) |
bool | is_url_valid () const |
Returns whether the google_url() URL is valid. | |
virtual bool | StartParseId (const StringPiece &url, const StringPiece &id, const ContentType &content_type) |
void | ParseText (const char *content, int size) |
void | ParseText (const StringPiece &sp) |
virtual void | Flush () |
virtual void | FinishParse () |
HtmlCdataNode * | NewCdataNode (HtmlElement *parent, const StringPiece &contents) |
Utility methods for implementing filters. | |
HtmlCharactersNode * | NewCharactersNode (HtmlElement *parent, const StringPiece &literal) |
HtmlCommentNode * | NewCommentNode (HtmlElement *parent, const StringPiece &contents) |
HtmlDirectiveNode * | NewDirectiveNode (HtmlElement *parent, const StringPiece &contents) |
HtmlIEDirectiveNode * | NewIEDirectiveNode (HtmlElement *parent, const StringPiece &contents) |
void | InsertElementBeforeElement (const HtmlNode *existing_node, HtmlNode *new_node) |
void | InsertElementAfterElement (const HtmlNode *existing_node, HtmlNode *new_node) |
void | PrependChild (const HtmlElement *existing_parent, HtmlNode *new_child) |
void | AppendChild (const HtmlElement *existing_parent, HtmlNode *new_child) |
void | InsertElementBeforeCurrent (HtmlNode *new_node) |
void | InsertElementAfterCurrent (HtmlNode *new_node) |
bool | AddParentToSequence (HtmlNode *first, HtmlNode *last, HtmlElement *new_parent) |
bool | MoveCurrentInto (HtmlElement *new_parent) |
bool | MoveCurrentBefore (HtmlNode *existing_node) |
bool | DeleteElement (HtmlNode *node) |
bool | DeleteSavingChildren (HtmlElement *element) |
bool | HasChildrenInFlushWindow (HtmlElement *element) |
bool | ReplaceNode (HtmlNode *existing_node, HtmlNode *new_node) |
HtmlElement * | CloneElement (HtmlElement *in_element) |
HtmlElement * | NewElement (HtmlElement *parent, const StringPiece &str) |
HtmlElement * | NewElement (HtmlElement *parent, HtmlName::Keyword keyword) |
HtmlElement * | NewElement (HtmlElement *parent, const HtmlName &name) |
void | AddAttribute (HtmlElement *element, HtmlName::Keyword keyword, const StringPiece &value) |
void | AddEscapedAttribute (HtmlElement *element, HtmlName::Keyword keyword, const StringPiece &escaped_value) |
void | AddAttribute (HtmlElement *element, HtmlName::Keyword keyword, int value) |
void | SetAttributeName (HtmlElement::Attribute *attribute, HtmlName::Keyword keyword) |
HtmlName | MakeName (const StringPiece &str) |
HtmlName | MakeName (HtmlName::Keyword keyword) |
bool | IsRewritable (const HtmlNode *node) const |
void | ClearElements () |
void | DebugLogQueue () |
Log the HtmlEvent queue_ to the message_handler_ for debugging. | |
void | DebugPrintQueue () |
Print the HtmlEvent queue_ to stdout for debugging. | |
bool | IsImplicitlyClosedTag (HtmlName::Keyword keyword) const |
bool | IsOptionallyClosedTag (HtmlName::Keyword keyword) const |
bool | TagAllowsBriefTermination (HtmlName::Keyword keyword) const |
Determines whether a tag allows brief termination in HTML, e.g. <tag/> | |
MessageHandler * | message_handler () const |
const char * | url () const |
const GoogleUrl & | google_url () const |
Gets a parsed GoogleUrl& corresponding to url(). | |
const char * | id () const |
int | line_number () const |
GoogleString | UrlLine () const |
Returns URL (or id) and line number as a string, to be used in messages. | |
const DocType & | doctype () const |
void | Info (const char *filename, int line, const char *msg,...) INSTAWEB_PRINTF_FORMAT(4, 5) |
Interface for any caller to report an error message via the message handler. | |
void | Warning (const char *filename, int line, const char *msg,...) INSTAWEB_PRINTF_FORMAT(4, 5) |
void | Error (const char *filename, int line, const char *msg,...) INSTAWEB_PRINTF_FORMAT(4, 5) |
void | FatalError (const char *filename, int line, const char *msg,...) INSTAWEB_PRINTF_FORMAT(4, 5) |
void | InfoV (const char *file, int line, const char *msg, va_list args) |
void | WarningV (const char *file, int line, const char *msg, va_list args) |
void | ErrorV (const char *file, int line, const char *msg, va_list args) |
void | FatalErrorV (const char *file, int line, const char *msg, va_list args) |
void | InfoHere (const char *msg,...) INSTAWEB_PRINTF_FORMAT(2, 3) |
Report error message with current parsing filename and linenumber. | |
void | WarningHere (const char *msg,...) INSTAWEB_PRINTF_FORMAT(2, 3) |
void | ErrorHere (const char *msg,...) INSTAWEB_PRINTF_FORMAT(2, 3) |
void | FatalErrorHere (const char *msg,...) INSTAWEB_PRINTF_FORMAT(2, 3) |
void | ShowProgress (const char *message) |
void | InfoHereV (const char *msg, va_list args) |
void | WarningHereV (const char *msg, va_list args) |
void | ErrorHereV (const char *msg, va_list args) |
void | FatalErrorHereV (const char *msg, va_list args) |
void | AddElement (HtmlElement *element, int line_number) |
void | CloseElement (HtmlElement *element, HtmlElement::CloseStyle close_style, int line_number) |
void | ApplyFilter (HtmlFilter *filter) |
Run a filter on the current queue of parse nodes. | |
void | set_timer (Timer *timer) |
Timer * | timer () const |
void | set_log_rewrite_timing (bool x) |
void | add_event_listener (HtmlFilter *listener) |
bool | InsertComment (StringPiece sp) |
void | set_size_limit (int64 x) |
Sets the limit on the maximum number of bytes that should be parsed. | |
bool | size_limit_exceeded () const |
Returns whether we have exceeded the size limit. | |
Static Public Member Functions | |
static bool | IsLiteralTag (HtmlName::Keyword keyword) |
static bool | IsSometimesLiteralTag (HtmlName::Keyword keyword) |
Protected Types | |
typedef std::vector< HtmlFilter * > | FilterVector |
typedef std::list< HtmlFilter * > | FilterList |
Protected Member Functions | |
void | BeginFinishParse () |
void | EndFinishParse () |
size_t | GetEventQueueSize () |
Returns the number of events on the event queue. | |
virtual void | ParseTextInternal (const char *content, int size) |
virtual void | DetermineEnabledFilters () |
Friends | |
class | HtmlLexer |
Implementation helper with detailed knowledge of html parsing libraries. | |
class | HtmlTestingPeer |
Visible for testing only, via HtmlTestingPeer. |
Streaming Html Parser API. Callbacks defined in HtmlFilter are called on each parser token.
Any number of filters can be added to the Html Parser; they are organized in a chain. Each filter processes a stream of SAX events (HtmlEvent), interspersed by Flushes. The filter operates on the sequence of events between flushes (a flush-window), and the system passes the (possibly mutated) event-stream to the next filter.
An HTML Event is a lexical token provided by the parser, including: begin document, end document, begin element, end element, whitespace, characters, cdata, and comment.
The parser retains the sequence of events as a data structure: list<HtmlEvent>. HtmlEvents are sent to filters (HtmlFilter) as follows: for each filter in the filter-chain, for each event in the flush-window, apply the filter to the event.
Filters may mutate the event streams as they are being processed, and these mutations will be seen by downstream filters. The filters can mutate any event that has not been flushed. Supported mutations include:
void net_instaweb::HtmlParse::add_event_listener | ( | HtmlFilter * | listener | ) |
Adds a filter to be called during parsing as new events are added. Takes ownership of the HtmlFilter passed in.
void net_instaweb::HtmlParse::AddFilter | ( | HtmlFilter * | filter | ) |
Application methods for parsing functions and adding filters.
Add a new html filter to the filter-chain, without taking ownership of it.
bool net_instaweb::HtmlParse::AddParentToSequence | ( | HtmlNode * | first, |
HtmlNode * | last, | ||
HtmlElement * | new_parent | ||
) |
Encloses new_parent around a sequence of nodes running from first to last. The first node must be the same as, or precede, the last node in the event-stream; this is not checked. The two nodes do not need to be adjacent, but they must have the same parent to start with.
void net_instaweb::HtmlParse::BeginFinishParse | ( | ) | [protected] |
HtmlParse::FinishParse() is equivalent to the sequence of BeginFinishParse(); Flush(); EndFinishParse(). Split up to permit asynchronous versions.
HtmlElement* net_instaweb::HtmlParse::CloneElement | ( | HtmlElement * | in_element | ) |
Creates another element with the same name and attributes as in_element. Does not duplicate the children or insert it anywhere.
bool net_instaweb::HtmlParse::DeleteElement | ( | HtmlNode * | node | ) |
If the given node is rewritable, delete it and all of its children (if any) and return true; otherwise, do nothing and return false. Note: Javascript appears to use removeChild for this.
bool net_instaweb::HtmlParse::DeleteSavingChildren | ( | HtmlElement * | element | ) |
Delete a parent element, retaining any children and moving them to reside under the parent's parent.
virtual void net_instaweb::HtmlParse::DetermineEnabledFilters | ( | ) | [protected, virtual] |
Call DetermineEnabled() on each filter. Should be called after the property cache lookup has finished since some filters depend on pcache results in their DetermineEnabled implementation. If a subclass has filters that the base HtmlParse doesn't know about, it should override this function and call DetermineEnabled on each of its filters, along with calling the base DetermineEnabledFilters.
Reimplemented in net_instaweb::RewriteDriver.
const DocType& net_instaweb::HtmlParse::doctype | ( | ) | const |
Return the current assumed doctype of the document (based on the content type and any HTML directives encountered so far).
virtual void net_instaweb::HtmlParse::FinishParse | ( | ) | [virtual] |
Finish a chunked parsing session. This also induces a Flush.
It is invalid to call FinishParse when the StartParse* routines returned false.
Reimplemented in net_instaweb::RewriteDriver.
virtual void net_instaweb::HtmlParse::Flush | ( | ) | [virtual] |
Flush the currently queued events through the filters. It is desirable for large web pages, particularly dynamically generated ones, to start getting delivered to the browser as soon as they are ready. On the other hand, rewriting is more powerful when more of the content can be considered for image/css/js spriting. This method should be called when the controlling network process wants to induce a new chunk of output. The less you call this function the better the rewriting will be.
It is invalid to call Flush when the StartParse* routines returned false.
If this is called from a Filter, the request will be deferred until after currently active filters are completed.
Reimplemented in net_instaweb::RewriteDriver.
bool net_instaweb::HtmlParse::HasChildrenInFlushWindow | ( | HtmlElement * | element | ) |
Determines whether the element, in the context of its flush window, has children. If the element is not rewritable, has not been closed yet, or has not been inserted into the DOM event stream, then 'false' is returned.
Note that the concept of the Flush Window is important because the knowledge of an element's children is not limited to the current event being presented to a Filter. A Filter can call this method in the StartElement of an event to see if any children are going to be coming. Of course, if the StartElement is at the end of a Flush window, then we won't know about the children, but IsRewritable will also be false.
bool net_instaweb::HtmlParse::InsertComment | ( | StringPiece | sp | ) |
Inserts a comment before or after the current node. The function tries to pick an intelligent place depending on the document structure and whether the current node is a start-element, end-element, or a leaf. Returns true if it successfully added the comment, and false if it was not safe for the comment to be inserted. This can happen when a comment is inserted in a literal element (script or style) after the opening tag has been flushed, but the closing tag has not been seen yet. In this case, the caller can buffer the messages until EndElement is reached and call InsertComment at that point.
void net_instaweb::HtmlParse::InsertElementAfterCurrent | ( | HtmlNode * | new_node | ) |
Insert a new element after the current one, moving current_ to the new element. In a Filter, the flush-loop will advance past this on the next iteration. Note: new_node must not already be in the DOM.
void net_instaweb::HtmlParse::InsertElementBeforeCurrent | ( | HtmlNode * | new_node | ) |
Insert a new element before the current one. current_ remains unchanged. Note: new_node must not already be in the DOM.
void net_instaweb::HtmlParse::InsertElementBeforeElement | ( | const HtmlNode * | existing_node, |
HtmlNode * | new_node | ||
) |
DOM-manipulation methods.
bool net_instaweb::HtmlParse::IsImplicitlyClosedTag | ( | HtmlName::Keyword | keyword | ) | const |
Determines whether a tag should be terminated in HTML, e.g. <meta ..>. We do not expect to see a close-tag for meta and should never insert one.
static bool net_instaweb::HtmlParse::IsLiteralTag | ( | HtmlName::Keyword | keyword | ) | [static] |
Determines whether a tag should be interpreted as a 'literal' tag. That is, a tag whose contents are not parsed until a corresponding matching end tag is encountered.
bool net_instaweb::HtmlParse::IsOptionallyClosedTag | ( | HtmlName::Keyword | keyword | ) | const |
An optionally closed tag ranges from <p>, which is typically not closed, but we infer the closing from context. Also consider <html>, which usually is closed but not always. E.g. www.google.com does not close its html tag.
static bool net_instaweb::HtmlParse::IsSometimesLiteralTag | ( | HtmlName::Keyword | keyword | ) | [static] |
Determines whether a tag is interpreted as a 'literal' tag in some user agents. Since some user agents will interpret the contents of these tags, our parser never treats them as literal tags. However, a filter that wants to insert new tags that should be processed by all user agents should not insert those tags into a tag that is sometimes parsed as a literal tag. Those filters can use this method to determine if they are within such a tag.
bool net_instaweb::HtmlParse::MoveCurrentBefore | ( | HtmlNode * | existing_node | ) |
Moves current node (and all children) directly before existing_node. Note: Will not work if called from StartElement() event.
This differs from InsertElementBeforeElement() because it moves the current node, which is already in the DOM, rather than adding a new node.
bool net_instaweb::HtmlParse::MoveCurrentInto | ( | HtmlElement * | new_parent | ) |
Moves current node (and all children) to an already-existing parent, where they will be placed as the last elements in that parent. Returns false if the operation could not be performed because either the node or its parent was partially or wholly flushed. Note: Will not work if called from StartElement() event.
This differs from AppendChild() because it moves the current node, which is already in the DOM, rather than adding a new node.
void net_instaweb::HtmlParse::ParseText | ( | const char * | content, |
int | size | ||
) | [inline] |
Parses an arbitrary block of an html file, queuing up the events. Call Flush to send the events through the Filter.
To parse an entire file, first call StartParse(), then call ParseText on the file contents (in whatever size chunks are convenient), then call FinishParse().
It is invalid to call ParseText when the StartParse* routines returned false.
void net_instaweb::HtmlParse::PrependChild | ( | const HtmlElement * | existing_parent, |
HtmlNode * | new_child | ||
) |
Add a new child element at the beginning or end of existing_parent's children. Named after Javascript's appendChild method. Note: new_child must not already be in the DOM.
bool net_instaweb::HtmlParse::ReplaceNode | ( | HtmlNode * | existing_node, |
HtmlNode * | new_node | ||
) |
If possible, replace the existing node with the new node and return true; otherwise, do nothing and return false.
void net_instaweb::HtmlParse::set_timer | ( | Timer * | timer | ) | [inline] |
Provide a timer to help report the timing of each filter. You must also call set_log_rewrite_timing(true) to turn on this reporting.
void net_instaweb::HtmlParse::ShowProgress | ( | const char * | message | ) |
If set_log_rewrite_timing(true) has been called, logs the given message at info level with a timestamp offset from the parsing start time.
bool net_instaweb::HtmlParse::StartParse | ( | const StringPiece & | url | ) | [inline] |
Initiate a chunked parsing session. Finish with FinishParse. The url is only used to resolve relative URLs; the contents are not directly fetched. The caller must supply the text and call ParseText.
Returns whether the URL is valid.
virtual bool net_instaweb::HtmlParse::StartParseId | ( | const StringPiece & | url, |
const StringPiece & | id, | ||
const ContentType & | content_type | ||
) | [virtual] |
Mostly useful for file-based rewriters so that messages can reference the HTML file and produce navigable errors.
Returns whether the URL is valid.
Reimplemented in net_instaweb::RewriteDriver.
const char* net_instaweb::HtmlParse::url | ( | ) | const [inline] |
Gets the current location information; typically to help with error messages.