00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00018
00019 #ifndef NET_INSTAWEB_REWRITER_PUBLIC_REWRITE_DRIVER_H_
00020 #define NET_INSTAWEB_REWRITER_PUBLIC_REWRITE_DRIVER_H_
00021
00022 #include <map>
00023 #include <set>
00024 #include <vector>
00025 #include "base/logging.h"
00026 #include "base/scoped_ptr.h"
00027 #include "net/instaweb/htmlparse/public/html_element.h"
00028 #include "net/instaweb/htmlparse/public/html_parse.h"
00029 #include "net/instaweb/htmlparse/public/html_parser_types.h"
00030 #include "net/instaweb/http/public/http_cache.h"
00031 #include "net/instaweb/rewriter/public/output_resource_kind.h"
00032 #include "net/instaweb/rewriter/public/resource.h"
00033 #include "net/instaweb/rewriter/public/resource_manager.h"
00034 #include "net/instaweb/rewriter/public/resource_slot.h"
00035 #include "net/instaweb/rewriter/public/rewrite_options.h"
00036 #include "net/instaweb/rewriter/public/scan_filter.h"
00037 #include "net/instaweb/util/public/abstract_client_state.h"
00038 #include "net/instaweb/util/public/basictypes.h"
00039 #include "net/instaweb/util/public/google_url.h"
00040 #include "net/instaweb/util/public/printf_format.h"
00041 #include "net/instaweb/util/public/queued_worker_pool.h"
00042 #include "net/instaweb/util/public/scheduler.h"
00043 #include "net/instaweb/util/public/string.h"
00044 #include "net/instaweb/util/public/string_util.h"
00045 #include "net/instaweb/util/public/thread_system.h"
00046 #include "net/instaweb/util/public/url_segment_encoder.h"
00047
00048 namespace net_instaweb {
00049
// Forward declarations of types that the declarations below refer to.
// ContentType is declared as a struct; all the others are classes.
00050 struct ContentType;
00051
00052 class AbstractMutex;
00053 class AddInstrumentationFilter;
00054 class AsyncFetch;
00055 class CacheUrlAsyncFetcher;
00056 class CommonFilter;
00057 class DomainRewriteFilter;
00058 class FileSystem;
00059 class Function;
00060 class HtmlEvent;
00061 class HtmlFilter;
00062 class HtmlWriterFilter;
00063 class MessageHandler;
00064 class PropertyPage;
00065 class RequestHeaders;
00066 class ResourceContext;
00067 class ResourceNamer;
00068 class ResponseHeaders;
00069 class RewriteContext;
00070 class RewriteFilter;
00071 class ScopedMutex;
00072 class Statistics;
00073 class UrlAsyncFetcher;
00074 class UrlLeftTrimFilter;
00075 class UserAgentMatcher;
00076 class Writer;
00077
// RewriteDriver derives from HtmlParse and declares the entry points for
// filter registration, resource fetching/decoding, output-resource creation,
// flush/wait control and per-request state (user agent, headers, options).
// Only the inline members have bodies in this header; every other member is
// defined out of line, so comments on those describe the signature only and
// mark anything inferred from names as a review note.
00084 class RewriteDriver : public HtmlParse {
00085 public:
// Return type of ResolveCssUrls().
00087 enum CssResolutionStatus {
00088 kWriteFailed,
00089 kNoResolutionNeeded,
00090 kSuccess
00091 };
00092
// Wait modes accepted by BoundedWaitFor(), CheckForCompletionAsync(),
// TryCheckForCompletion() and IsDone(); also the type of waiting_.
00094 enum WaitMode {
00095 kNoWait,
00096 kWaitForCompletion,
00097 kWaitForCachedRender,
00098
00099 kWaitForShutDown
00100
00101 };
00102
// Tri-state flag with an explicit "not set" value. It is the type of the
// mutable user-agent members, which set_user_agent() resets to kNotSet.
00104 enum LazyBool {
00105 kNotSet = -1,
00106 kFalse = 0,
00107 kTrue = 1
00108 };
00109
// Array of three C strings; the contents are defined out of line.
00116 static const char* kPassThroughRequestAttributes[3];
00117
// Character-array constant; the value is defined out of line.
00128 static const char kDomCohort[];
00129
// Constructs a driver from a message handler, a file system and an async
// URL fetcher.
// NOTE(review): ownership of these pointers is not visible in this header --
// confirm against the definition.
00130 RewriteDriver(MessageHandler* message_handler,
00131 FileSystem* file_system,
00132 UrlAsyncFetcher* url_async_fetcher);
00133
00136 virtual ~RewriteDriver();
00137
// Returns a RewriteDriver*.
// NOTE(review): who owns the returned driver is not visible here -- confirm.
00139 RewriteDriver* Clone();
00140
00146 void Clear();
00147
// Static setup/teardown hooks; Initialize() receives the Statistics object.
00149 static void Initialize(Statistics* statistics);
00150
00152 static void Terminate();
00153
00156 void SetResourceManager(ResourceManager* resource_manager);
00157
// Const predicates, defined out of line.
00163 bool ShouldNotRewriteImages() const;
00164
00166 bool MayCacheExtendCss() const;
00167 bool MayCacheExtendImages() const;
00168 bool MayCacheExtendScripts() const;
00169
// NOTE(review): presumably associates `resource` with `url` in resource_map_
// -- confirm against the definition.
00170 void RememberResource(const StringPiece& url, const ResourcePtr& resource);
// Returns a reference to the stored user-agent string.
00171 const GoogleString& user_agent() const {
00172 return user_agent_;
00173 }
// Copies the given string into user_agent_ and resets all five user-agent
// LazyBool members to kNotSet (presumably so they are recomputed for the new
// user agent -- confirm in the accessors' definitions).
00174 void set_user_agent(const StringPiece& user_agent_string) {
00175 user_agent_string.CopyToString(&user_agent_);
00176 user_agent_is_bot_ = kNotSet;
00177 user_agent_supports_image_inlining_ = kNotSet;
00178 user_agent_supports_js_defer_ = kNotSet;
00179 user_agent_supports_webp_ = kNotSet;
00180 is_mobile_user_agent_ = kNotSet;
00181 }
00182
// Plain getter/setter for using_spdy_.
00184 bool using_spdy() const { return using_spdy_; }
00185 void set_using_spdy(bool x) { using_spdy_ = x; }
00186
// Returns response_headers_, or NULL once flush_occurred_ is true.
00189 ResponseHeaders* mutable_response_headers() {
00190 return flush_occurred_ ? NULL : response_headers_;
00191 }
00192
// Returns response_headers_ as a const pointer regardless of flush_occurred_.
// NOTE(review): read-only accessor that is not declared const.
00199 const ResponseHeaders* response_headers() {
00200 return response_headers_;
00201 }
00202
// Stores the pointer only; nothing is copied.
00206 void set_response_headers_ptr(ResponseHeaders* headers) {
00207 response_headers_ = headers;
00208 }
00209
// Stores the pointer only; nothing is copied.
00210 void set_request_headers(const RequestHeaders* headers) {
00211 request_headers_ = headers;
00212 }
00213
// DCHECKs that request headers have been set before returning them.
00214 const RequestHeaders* request_headers() const {
00215 DCHECK(request_headers_ != NULL);
00216 return request_headers_;
00217 }
00218
// Forwards to the resource manager's matcher; DCHECKs that a resource
// manager has been set.
00219 const UserAgentMatcher& user_agent_matcher() const {
00220 DCHECK(resource_manager() != NULL);
00221 return resource_manager()->user_agent_matcher();
00222 }
// Const user-agent predicates, defined out of line.
00223 bool UserAgentSupportsImageInlining() const;
00224 bool UserAgentSupportsJsDefer() const;
00225 bool UserAgentSupportsWebp() const;
00226 bool IsMobileUserAgent() const;
00227
00232 void AddFilters();
00233
// Filter registration entry points taking HtmlFilter*.
// NOTE(review): the Owned/Unowned naming suggests whether the driver takes
// ownership of `filter` -- confirm against the definitions.
00237 void AddOwnedEarlyPreRenderFilter(HtmlFilter* filter);
00238
00240 void PrependOwnedPreRenderFilter(HtmlFilter* filter);
00242 void AppendOwnedPreRenderFilter(HtmlFilter* filter);
00243
00245 void AddOwnedPostRenderFilter(HtmlFilter* filter);
00247 void AddUnownedPostRenderFilter(HtmlFilter* filter);
00248
// Registration entry points taking RewriteFilter*.
00260 void AppendRewriteFilter(RewriteFilter* filter);
00261
00264 void PrependRewriteFilter(RewriteFilter* filter);
00265
00271 void SetWriter(Writer* writer);
00272
// Returns the stored writer_ pointer.
00273 Writer* writer() const { return writer_; }
00274
// Fetch entry points taking an AsyncFetch*; the meaning of the bool result
// is not visible in this header.
00298 bool FetchResource(const StringPiece& url, AsyncFetch* fetch);
00299
00306 bool FetchOutputResource(const OutputResourcePtr& output_resource,
00307 RewriteFilter* filter,
00308 AsyncFetch* async_fetch);
00309
// Const decoding helpers; results are delivered through the return value
// and the pointer out-parameters.
00316 OutputResourcePtr DecodeOutputResource(const GoogleUrl& url,
00317 RewriteFilter** filter) const;
00318
00322 bool DecodeOutputResourceName(const GoogleUrl& url,
00323 ResourceNamer* name_out,
00324 OutputResourceKind* kind_out,
00325 RewriteFilter** filter_out) const;
00326
00328 bool DecodeUrl(const GoogleUrl& url,
00329 StringVector* decoded_urls) const;
00330
// async_fetcher() returns url_async_fetcher_, not default_url_async_fetcher_.
00331 FileSystem* file_system() { return file_system_; }
00332 UrlAsyncFetcher* async_fetcher() { return url_async_fetcher_; }
00333
00338 void SetSessionFetcher(UrlAsyncFetcher* f);
00339
// NOTE(review): ownership of the returned fetcher is not visible here --
// confirm.
00342 CacheUrlAsyncFetcher* CreateCacheFetcher();
00343
00344 ResourceManager* resource_manager() const { return resource_manager_; }
00345 Statistics* statistics() const;
00346
00347 AddInstrumentationFilter* add_instrumentation_filter() {
00348 return add_instrumentation_filter_;
00349 }
00350
// Shorthand for set_options(true, options).
00352 void set_custom_options(RewriteOptions* options) {
00353 set_options(true, options);
00354 }
00355
// Records is_custom and hands `options` to the options_ scoped_ptr through
// reset().
00357 void set_options(bool is_custom, RewriteOptions* options) {
00358 has_custom_options_ = is_custom;
00359 options_.reset(options);
00360 }
00361
00363 bool has_custom_options() const { return has_custom_options_; }
00364
// Returns the pointer held by options_.
00366 const RewriteOptions* options() const { return options_.get(); }
00367
// Virtual parse entry points.
// NOTE(review): presumably these override HtmlParse virtuals -- confirm
// against html_parse.h.
00369 virtual bool StartParseId(const StringPiece& url, const StringPiece& id,
00370 const ContentType& content_type);
00371
00378 virtual void FinishParse();
00379
00382 void FinishParseAsync(Function* callback);
00383
// End-element inhibit controls, keyed by element pointer.
00388 void InhibitEndElement(const HtmlElement* element);
00389
00396 void UninhibitEndElement(const HtmlElement* element);
00397
00399 bool EndElementIsInhibited(const HtmlElement* element);
00400
00404 bool EndElementIsStoppingFlush(const HtmlElement* element);
00405
// Variadic logging-style entry point taking a `msg` string.
// NOTE(review): INSTAWEB_PRINTF_FORMAT(3, 4) presumably marks `msg` as a
// printf format string (implicit `this` counted as argument 1) -- confirm
// in printf_format.h.
00409 void InfoAt(const RewriteContext* context,
00410 const char* msg, ...) INSTAWEB_PRINTF_FORMAT(3, 4);
00411
00416
00423 OutputResourcePtr CreateOutputResourceFromResource(
00424 const StringPiece& filter_prefix,
00425 const UrlSegmentEncoder* encoder,
00426 const ResourceContext* data,
00427 const ResourcePtr& input_resource,
00428 OutputResourceKind kind);
00429
// Six-argument form that the three inline wrappers below forward to.
00443 OutputResourcePtr CreateOutputResourceWithPath(
00444 const StringPiece& mapped_path, const StringPiece& unmapped_path,
00445 const StringPiece& base_url, const StringPiece& filter_id,
00446 const StringPiece& name, OutputResourceKind kind);
00447
// Passes `path` as both the mapped and the unmapped path, and
// base_url_.AllExceptLeaf() as the base URL.
00450 OutputResourcePtr CreateOutputResourceWithUnmappedPath(
00451 const StringPiece& path, const StringPiece& filter_id,
00452 const StringPiece& name, OutputResourceKind kind) {
00453 return CreateOutputResourceWithPath(path, path, base_url_.AllExceptLeaf(),
00454 filter_id, name, kind);
00455 }
00456
// Passes both paths through unchanged and uses
// decoded_base_url_.AllExceptLeaf() as the base URL.
00459 OutputResourcePtr CreateOutputResourceWithMappedPath(
00460 const StringPiece& mapped_path, const StringPiece& unmapped_path,
00461 const StringPiece& filter_id, const StringPiece& name,
00462 OutputResourceKind kind) {
00463 return CreateOutputResourceWithPath(mapped_path, unmapped_path,
00464 decoded_base_url_.AllExceptLeaf(),
00465 filter_id, name, kind);
00466 }
00467
// Four-argument overload: `path` is used as the mapped path, the unmapped
// path and the base URL.
00470 OutputResourcePtr CreateOutputResourceWithPath(
00471 const StringPiece& path, const StringPiece& filter_id,
00472 const StringPiece& name, OutputResourceKind kind) {
00473 return CreateOutputResourceWithPath(path, path, path, filter_id, name,
00474 kind);
00475 }
00476
00480 ResourcePtr CreateInputResource(const GoogleUrl& input_url);
00481
00485 ResourcePtr CreateInputResourceAbsoluteUnchecked(
00486 const StringPiece& absolute_url);
00487
00491 bool MayRewriteUrl(const GoogleUrl& domain_url,
00492 const GoogleUrl& input_url) const;
00493
00498 void ReadAsync(Resource::AsyncCallback* callback,
00499 MessageHandler* message_handler);
00500
// Accessors for the two base URLs; decoded_base() returns
// decoded_base_url_.Spec().
00504 const GoogleUrl& base_url() const { return base_url_; }
00505
00509 const GoogleUrl& decoded_base_url() const { return decoded_base_url_; }
00510 StringPiece decoded_base() const { return decoded_base_url_.Spec(); }
00511
// True when google_url() has the "https" scheme. google_url() is not
// declared in this class (presumably inherited from HtmlParse -- confirm).
00513 bool IsHttps() const { return google_url().SchemeIs("https"); }
00514
// Returns the address of the default_encoder_ member.
00515 const UrlSegmentEncoder* default_encoder() const { return &default_encoder_; }
00516
00518 RewriteFilter* FindFilter(const StringPiece& id) const;
00519
// NOTE(review): read-only accessor that is not declared const.
00521 bool refs_before_base() { return refs_before_base_; }
00522
// Sets refs_before_base_ to true; there is no inline way to clear it.
00527 void set_refs_before_base() { refs_before_base_ = true; }
00528
// Returns a StringPiece built from the containing_charset_ member; the
// setter copies its argument into that member.
// NOTE(review): the setter takes StringPiece by const value, whereas other
// setters in this class take const StringPiece&.
00533 StringPiece containing_charset() { return containing_charset_; }
00534 void set_containing_charset(const StringPiece charset) {
00535 charset.CopyToString(&containing_charset_);
00536 }
00537
00539 HtmlResourceSlotPtr GetSlot(const ResourcePtr& resource,
00540 HtmlElement* elt,
00541 HtmlElement::Attribute* attr);
00542
// RewriteContext lifecycle entry points, defined out of line.
00546 void InitiateRewrite(RewriteContext* rewrite_context);
00547 void InitiateFetch(RewriteContext* rewrite_context);
00548
00552 void RewriteComplete(RewriteContext* rewrite_context);
00553
00557 void ReportSlowRewrites(int num);
00558
00563 void Cleanup();
00564
00567 void WaitForCompletion();
00568
00575 void WaitForShutDown();
00576
// Takes a WaitMode and a timeout named in milliseconds.
00580 void BoundedWaitFor(WaitMode mode, int64 timeout_ms);
00581
00589 void set_fully_rewrite_on_flush(bool x) {
00590 fully_rewrite_on_flush_ = x;
00591 }
00592
00599 void set_externally_managed(bool x) { externally_managed_ = x; }
00600
00604 void DetachFetch();
00605
00608 void DetachedFetchComplete();
00609
00613 void FetchComplete();
00614
00620 void DeleteRewriteContext(RewriteContext* rewrite_context);
00621
// Getter/setter for rewrite_deadline_ms_.
// NOTE(review): the getter is not declared const.
00627 void set_rewrite_deadline_ms(int x) { rewrite_deadline_ms_ = x; }
00628 int rewrite_deadline_ms() { return rewrite_deadline_ms_; }
00629
// NOTE(review): presumably these maintain primary_rewrite_context_map_,
// which is keyed by GoogleString -- confirm against the definitions.
00635 RewriteContext* RegisterForPartitionKey(const GoogleString& partition_key,
00636 RewriteContext* candidate);
00637
00642 void DeregisterForPartitionKey(
00643 const GoogleString& partition_key, RewriteContext* candidate);
00644
// RequestFlush() only sets flush_requested_; the flush itself is performed
// by other members.
00647 void RequestFlush() { flush_requested_ = true; }
00648 bool flush_requested() const { return flush_requested_; }
00649
00661 void ExecuteFlushIfRequested();
00662
00666 void ExecuteFlushIfRequestedAsync(Function* callback);
00667
00676 virtual void Flush();
00677
00683 void FlushAsync(Function* done);
00684
00686 void AddRewriteTask(Function* task);
00687
00690 void AddLowPriorityRewriteTask(Function* task);
00691
// Accessors for the three QueuedWorkerPool::Sequence pointers.
00692 QueuedWorkerPool::Sequence* html_worker() { return html_worker_; }
00693 QueuedWorkerPool::Sequence* rewrite_worker() { return rewrite_worker_; }
00694 QueuedWorkerPool::Sequence* low_priority_rewrite_worker() {
00695 return low_priority_rewrite_worker_;
00696 }
00697
00698 Scheduler* scheduler() { return scheduler_; }
00699
// Return the raw pointers held by the corresponding scoped_ptr members.
00702 DomainRewriteFilter* domain_rewriter() { return domain_rewriter_.get(); }
00703 UrlLeftTrimFilter* url_trim_filter() { return url_trim_filter_.get(); }
00704
// Returns a CssResolutionStatus; takes the input and output CSS bases, the
// contents, a Writer and a MessageHandler.
00712 CssResolutionStatus ResolveCssUrls(const GoogleUrl& input_css_base,
00713 const StringPiece& output_css_base,
00714 const StringPiece& contents,
00715 Writer* writer,
00716 MessageHandler* handler);
00717
// Const predicate with an additional bool out-parameter, proxy_mode.
00725 bool ShouldAbsolutifyUrl(const GoogleUrl& input_base,
00726 const GoogleUrl& output_base,
00727 bool* proxy_mode) const;
00728
// Hands client_state to the client_state_ scoped_ptr through reset().
00731 void set_client_state(AbstractClientState* client_state) {
00732 client_state_.reset(client_state);
00733 }
00734
00738 AbstractClientState* client_state() const { return client_state_.get(); }
00739
// Stores a copy of `id` (via as_string()).
00740 void set_client_id(const StringPiece& id) { client_id_ = id.as_string(); }
00741 const GoogleString& client_id() const { return client_id_; }
00742
00743 PropertyPage* property_page() const { return property_page_.get(); }
00744 void set_property_page(PropertyPage* page);
00745
// Returns the pointer held by critical_images_.
00747 const StringSet* critical_images() const {
00748 return critical_images_.get();
00749 }
00750
// Hands critical_images to the critical_images_ scoped_ptr through reset().
00753 void set_critical_images(StringSet* critical_images) {
00754 critical_images_.reset(critical_images);
00755 }
00756
00760 int num_inline_preview_images() const { return num_inline_preview_images_; }
00761
00763 void increment_num_inline_preview_images();
00764
00768 void increment_async_events_count();
00769
00771 void decrement_async_events_count();
00772
00773 bool need_furious_cookie() const { return need_furious_cookie_; }
00774 void set_need_furious_cookie(bool x) { need_furious_cookie_ = x; }
00775
00776 private:
// These two classes are granted access to private members.
00777 friend class ResourceManagerTestBase;
00778 friend class ResourceManagerTest;
00779
// Map from string id to RewriteFilter*, plus pointer-to-member-function
// types for setters taking a StringPiece or an int64.
00780 typedef std::map<GoogleString, RewriteFilter*> StringFilterMap;
00781 typedef void (RewriteDriver::*SetStringMethod)(const StringPiece& value);
00782 typedef void (RewriteDriver::*SetInt64Method)(int64 value);
00783
00788 void FetchCompleteImpl(bool signal, ScopedMutex* lock);
00789
// Completion-check helpers. Note that one takes timeout_ms and the other
// end_time_ms.
00794 void CheckForCompletionAsync(WaitMode wait_mode, int64 timeout_ms,
00795 Function* done);
00796
00800 void TryCheckForCompletion(WaitMode wait_mode, int64 end_time_ms,
00801 Function* done);
00802
00804 bool IsDone(WaitMode wait_mode, bool deadline_reached);
00805
00809 void FlushAsyncDone(int num_rewrites, Function* callback);
00810
00812 void QueueFlushAsyncDone(int num_rewrites, Function* callback);
00813
00816 void QueueFinishParseAfterFlush(Function* user_callback);
00817 void FinishParseAfterFlush(Function* user_callback);
00818
00820 bool RewritesComplete() const;
00821
00826 bool HaveBackgroundFetchRewrite() const;
00827
00830 void SetBaseUrlIfUnset(const StringPiece& new_base);
00831
00834 void SetBaseUrlForFetch(const StringPiece& url);
00835
00838 void SetDecodedUrlFromBase();
00839
// Returns the scheduler's mutex; dereferences scheduler_ without a null
// check.
00841 AbstractMutex* rewrite_mutex() { return scheduler_->mutex(); }
00842
00843 friend class ScanFilter;
00844
00848 void AddCommonFilter(CommonFilter* filter);
00849
00853 void RegisterRewriteFilter(RewriteFilter* filter);
00854
00859 void EnableRewriteFilter(const char* id);
00860
00864 ResourcePtr CreateInputResourceUnchecked(const GoogleUrl& gurl);
00865
00866 void AddPreRenderFilters();
00867 void AddPostRenderFilters();
00868
00870 void UninhibitFlushDone(Function* user_callback);
00871
00873 void SplitQueueIfNecessary();
00874
// Private helper with the same leading parameters as
// DecodeOutputResourceName() plus url_base and urls out-parameters.
00876 bool DecodeOutputResourceNameHelper(const GoogleUrl& url,
00877 ResourceNamer* name_out,
00878 OutputResourceKind* kind_out,
00879 RewriteFilter** filter_out,
00880 GoogleString* url_base,
00881 StringVector* urls) const;
00882
00886 void WriteDomCohortIntoPropertyCache();
00887
00890 void WriteClientStateIntoPropertyCache();
00891
// ---- Data members ----
00903 bool base_was_set_;
00904
// Read by refs_before_base(); set to true by set_refs_before_base().
00909 bool refs_before_base_;
00910
// Backing store for containing_charset() / set_containing_charset().
00912 GoogleString containing_charset_;
00913
00914 bool filters_added_;
// Written by set_externally_managed().
00915 bool externally_managed_;
00916
00920 bool fetch_queued_;
00921
00926 bool fetch_detached_;
00927
00934 bool detached_fetch_main_path_complete_;
00935 bool detached_fetch_detached_path_complete_;
00936
00939 bool parsing_;
00940
00944 WaitMode waiting_;
00945
// Written by set_fully_rewrite_on_flush().
00950 bool fully_rewrite_on_flush_;
00951
00954 bool cleanup_on_fetch_complete_;
00955
// flush_requested_ is set by RequestFlush(); flush_occurred_ makes
// mutable_response_headers() return NULL.
00956 bool flush_requested_;
00957 bool flush_occurred_;
00958
00960 bool release_driver_;
00961
// End-element inhibit state.
// NOTE(review): inhibits_mutex_ presumably guards the members that follow
// it -- confirm against the definitions.
00962 scoped_ptr<AbstractMutex> inhibits_mutex_;
00963 typedef std::set <const HtmlElement*> ConstHtmlElementSet;
00964 ConstHtmlElementSet end_elements_inhibited_;
00965 HtmlEventList deferred_queue_;
00966 Function* finish_parse_on_hold_;
00967 HtmlEvent* inhibiting_event_;
00968 bool flush_in_progress_;
00969 bool uninhibit_reflush_requested_;
00970
00978 int rewrites_to_delete_;
00979
// Returned by base_url() and decoded_base_url() respectively.
00982 GoogleUrl base_url_;
00983
00987 GoogleUrl decoded_base_url_;
00988
// User-agent string plus LazyBool flags. The flags are mutable, so const
// member functions can modify them; set_user_agent() resets all of them to
// kNotSet.
00989 GoogleString user_agent_;
00991 mutable LazyBool user_agent_is_bot_;
00992 mutable LazyBool user_agent_supports_image_inlining_;
00993 mutable LazyBool user_agent_supports_js_defer_;
00994 mutable LazyBool user_agent_supports_webp_;
00995 mutable LazyBool is_mobile_user_agent_;
00996
00998 bool using_spdy_;
00999
01000 StringFilterMap resource_filter_map_;
01001
// Raw pointers; the inline setters store them without copying.
01002 ResponseHeaders* response_headers_;
01003 const RequestHeaders* request_headers_;
01004
01007 typedef std::vector<RewriteContext*> RewriteContextVector;
01008 RewriteContextVector rewrites_;
01009 int rewrite_deadline_ms_;
01010
01011 typedef std::set<RewriteContext*> RewriteContextSet;
01012
01017 RewriteContextSet initiated_rewrites_;
01018
01026 RewriteContextSet detached_rewrites_;
01027
01032 int pending_rewrites_;
01033
01035 int possibly_quick_rewrites_;
01036
01041 int pending_async_events_;
01042
// Collaborator pointers, held as raw pointers.
01045 FileSystem* file_system_;
01046 ResourceManager* resource_manager_;
01047 Scheduler* scheduler_;
01048 UrlAsyncFetcher* default_url_async_fetcher_;
01049
// The fetcher that async_fetcher() returns.
01053 UrlAsyncFetcher* url_async_fetcher_;
01054
01057 std::vector<UrlAsyncFetcher*> owned_url_async_fetchers_;
01058
01059 AddInstrumentationFilter* add_instrumentation_filter_;
01060 scoped_ptr<HtmlWriterFilter> html_writer_filter_;
01061
// scan_filter_ is held by value; the other two filters are held through
// scoped_ptr.
01062 ScanFilter scan_filter_;
01063 scoped_ptr<DomainRewriteFilter> domain_rewriter_;
01064 scoped_ptr<UrlLeftTrimFilter> url_trim_filter_;
01065
01067 typedef std::map<GoogleString, ResourcePtr> ResourceMap;
01068 ResourceMap resource_map_;
01069
01072 typedef std::map<GoogleString, RewriteContext*> PrimaryRewriteContextMap;
01073 PrimaryRewriteContextMap primary_rewrite_context_map_;
01074
01075 HtmlResourceSlotSet slots_;
01076
// Written together by set_options().
01077 scoped_ptr<RewriteOptions> options_;
01078 bool has_custom_options_;
01079
// default_encoder() returns the address of this member.
01081 UrlSegmentEncoder default_encoder_;
01082
01084 FilterList early_pre_render_filters_;
01086 FilterList pre_render_filters_;
01087
01091 FilterVector filters_to_delete_;
01092
01093 QueuedWorkerPool::Sequence* html_worker_;
01094 QueuedWorkerPool::Sequence* rewrite_worker_;
01095 QueuedWorkerPool::Sequence* low_priority_rewrite_worker_;
01096
01097 Writer* writer_;
01098
01100 GoogleString client_id_;
01101
// Held through scoped_ptr; the inline setters for client_state_ and
// critical_images_ use reset().
01103 scoped_ptr<AbstractClientState> client_state_;
01104
01106 scoped_ptr<PropertyPage> property_page_;
01107
01109 scoped_ptr<StringSet> critical_images_;
01110
01112 bool need_furious_cookie_;
01113
01116 int num_inline_preview_images_;
01117
01118 DISALLOW_COPY_AND_ASSIGN(RewriteDriver);
01119 };
01120
// HTTPCache::Callback subclass that declares a virtual IsCacheValid() and
// stores an int64 cache-invalidation timestamp. Its only constructor is
// protected, so instances can only be created through a derived class.
// NOTE(review): presumably the timestamp is taken from the RewriteOptions
// passed to the constructor and consulted by IsCacheValid() -- confirm
// against the definition.
01123 class OptionsAwareHTTPCacheCallback : public HTTPCache::Callback {
01124 public:
01125 virtual ~OptionsAwareHTTPCacheCallback();
// Takes the response headers of a cache entry and returns a bool.
01126 virtual bool IsCacheValid(const ResponseHeaders& headers);
01127
01128 protected:
// explicit: prevents implicit conversion from const RewriteOptions*.
01129 explicit OptionsAwareHTTPCacheCallback(const RewriteOptions* rewrite_options);
01130
01131 private:
01132 int64 cache_invalidation_timestamp_ms_;
01133 DISALLOW_COPY_AND_ASSIGN(OptionsAwareHTTPCacheCallback);
01134 };
01135
01136 }
01137
01138 #endif  // NET_INSTAWEB_REWRITER_PUBLIC_REWRITE_DRIVER_H_