2012-04-22 14:27:16 +00:00
|
|
|
<?xml version="1.0" encoding="UTF-8"?>
|
|
|
|
<!--
|
|
|
|
HERITRIX 3 CRAWL JOB CONFIGURATION FILE
|
|
|
|
|
|
|
|
This is a relatively minimal configuration suitable for many crawls.
|
|
|
|
|
|
|
|
Commented-out beans and properties are provided as an example; values
|
|
|
|
shown in comments reflect the actual defaults which are in effect
|
|
|
|
if not otherwise specified. (To change from the default
|
|
|
|
behavior, uncomment AND alter the shown values.)
|
|
|
|
-->
|
|
|
|
<beans xmlns="http://www.springframework.org/schema/beans"
|
|
|
|
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
|
|
|
xmlns:context="http://www.springframework.org/schema/context"
|
|
|
|
xmlns:aop="http://www.springframework.org/schema/aop"
|
|
|
|
xmlns:tx="http://www.springframework.org/schema/tx"
|
|
|
|
xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans-3.0.xsd
|
|
|
|
http://www.springframework.org/schema/aop http://www.springframework.org/schema/aop/spring-aop-3.0.xsd
|
|
|
|
http://www.springframework.org/schema/tx http://www.springframework.org/schema/tx/spring-tx-3.0.xsd
|
|
|
|
http://www.springframework.org/schema/context http://www.springframework.org/schema/context/spring-context-3.0.xsd">
|
|
|
|
|
|
|
|
<context:annotation-config/>
|
|
|
|
|
|
|
|
<!--
|
|
|
|
OVERRIDES
|
|
|
|
Values elsewhere in the configuration may be replaced ('overridden')
|
|
|
|
by a Properties map declared in a PropertyOverrideConfigurer,
|
|
|
|
using a dotted-bean-path to address individual bean properties.
|
|
|
|
This allows us to collect a few of the most-often changed values
|
|
|
|
in an easy-to-edit format here at the beginning of the model
|
|
|
|
configuration.
|
|
|
|
-->
|
|
|
|
<!-- overrides from a text property list -->
|
|
|
|
<bean id="simpleOverrides" class="org.springframework.beans.factory.config.PropertyOverrideConfigurer">
|
|
|
|
<property name="properties">
|
|
|
|
<value>
|
|
|
|
# This Properties map is specified in the Java 'property list' text format
|
|
|
|
# http://java.sun.com/javase/6/docs/api/java/util/Properties.html#load%28java.io.Reader%29
|
|
|
|
|
|
|
|
metadata.operatorContactUrl=http://owark.org
|
|
|
|
metadata.jobName=basic
|
|
|
|
metadata.description=Basic crawl starting with useful defaults
|
|
|
|
|
|
|
|
##..more?..##
|
|
|
|
</value>
|
|
|
|
</property>
|
|
|
|
</bean>
|
|
|
|
|
|
|
|
<!-- overrides from declared <prop> elements, more easily allowing
|
|
|
|
multiline values or even declared beans -->
|
|
|
|
<bean id="longerOverrides" class="org.springframework.beans.factory.config.PropertyOverrideConfigurer">
|
|
|
|
<property name="properties">
|
|
|
|
<props>
|
|
|
|
<prop key="seeds.textSource.value">
|
|
|
|
|
|
|
|
# URLS HERE
|
|
|
|
<url xmlns=""/>
|
|
|
|
|
|
|
|
</prop>
|
|
|
|
</props>
|
|
|
|
</property>
|
|
|
|
</bean>
|
|
|
|
|
|
|
|
<!-- CRAWL METADATA: including identification of crawler/operator -->
|
|
|
|
<bean id="metadata" class="org.archive.modules.CrawlMetadata" autowire="byName">
|
|
|
|
<property name="operatorContactUrl" value="[see override above]"/>
|
|
|
|
<property name="jobName" value="[see override above]"/>
|
|
|
|
<property name="description" value="[see override above]"/>
|
|
|
|
<!-- <property name="robotsPolicyName" value="obey"/> -->
|
|
|
|
<!-- <property name="operator" value=""/> -->
|
|
|
|
<!-- <property name="operatorFrom" value=""/> -->
|
|
|
|
<!-- <property name="organization" value=""/> -->
|
|
|
|
<!-- <property name="audience" value=""/> -->
|
|
|
|
<!-- <property name="userAgentTemplate"
|
|
|
|
value="Mozilla/5.0 (compatible; heritrix/@VERSION@ +@OPERATOR_CONTACT_URL@)"/> -->
|
|
|
|
|
|
|
|
</bean>
|
|
|
|
|
|
|
|
<!-- SEEDS: crawl starting points
|
|
|
|
ConfigString allows simple, inline specification of a moderate
|
|
|
|
number of seeds; see below comment for example of using an
|
|
|
|
arbitrarily-large external file. -->
|
|
|
|
<bean id="seeds" class="org.archive.modules.seeds.TextSeedModule">
|
|
|
|
<property name="textSource">
|
|
|
|
<bean class="org.archive.spring.ConfigString">
|
|
|
|
<property name="value">
|
|
|
|
<value>
|
|
|
|
# [see override above]
|
|
|
|
</value>
|
|
|
|
</property>
|
|
|
|
</bean>
|
|
|
|
</property>
|
|
|
|
<!-- <property name='sourceTagSeeds' value='false'/> -->
|
|
|
|
<!-- <property name='blockAwaitingSeedLines' value='-1'/> -->
|
|
|
|
</bean>
|
|
|
|
|
|
|
|
<!-- SEEDS ALTERNATE APPROACH: specifying external seeds.txt file in
|
|
|
|
the job directory, similar to the H1 approach.
|
|
|
|
Use either the above, or this, but not both. -->
|
|
|
|
<!--
|
|
|
|
<bean id="seeds" class="org.archive.modules.seeds.TextSeedModule">
|
|
|
|
<property name="textSource">
|
|
|
|
<bean class="org.archive.spring.ConfigFile">
|
|
|
|
<property name="path" value="seeds.txt" />
|
|
|
|
</bean>
|
|
|
|
</property>
|
|
|
|
<property name='sourceTagSeeds' value='false'/>
|
|
|
|
<property name='blockAwaitingSeedLines' value='-1'/>
|
|
|
|
</bean>
|
|
|
|
-->
|
|
|
|
|
|
|
|
<!-- SCOPE: rules for which discovered URIs to crawl; order is very
|
|
|
|
important because last decision returned other than 'NONE' wins. -->
|
|
|
|
<bean id="scope" class="org.archive.modules.deciderules.DecideRuleSequence">
|
|
|
|
<!-- <property name="logToFile" value="false" /> -->
|
|
|
|
<property name="rules">
|
|
|
|
<list>
|
|
|
|
<!-- Begin by REJECTing all... -->
|
|
|
|
<bean class="org.archive.modules.deciderules.RejectDecideRule">
|
|
|
|
</bean>
|
|
|
|
<!-- ...then ACCEPT those within configured/seed-implied SURT prefixes... -->
|
|
|
|
<bean class="org.archive.modules.deciderules.surt.SurtPrefixedDecideRule">
|
|
|
|
<!-- <property name="seedsAsSurtPrefixes" value="true" /> -->
|
|
|
|
<!-- <property name="alsoCheckVia" value="false" /> -->
|
|
|
|
<!-- <property name="surtsSourceFile" value="" /> -->
|
|
|
|
<!-- <property name="surtsDumpFile" value="${launchId}/surts.dump" /> -->
|
|
|
|
<!-- <property name="surtsSource">
|
|
|
|
<bean class="org.archive.spring.ConfigString">
|
|
|
|
<property name="value">
|
|
|
|
<value>
|
|
|
|
# example.com
|
|
|
|
# http://www.example.edu/path1/
|
|
|
|
# +http://(org,example,
|
|
|
|
</value>
|
|
|
|
</property>
|
|
|
|
</bean>
|
|
|
|
</property> -->
|
|
|
|
</bean>
|
|
|
|
<!-- ...but REJECT those more than a configured link-hop-count from start... -->
|
|
|
|
<bean class="org.archive.modules.deciderules.TooManyHopsDecideRule">
<!-- maxHops is explicitly set to 0 here: this rule REJECTs URIs more than the configured link-hop-count from the start, so any URI one or more link hops away is rejected. Raise the value to follow links. -->
|
|
|
|
<property name="maxHops" value="0" />
|
|
|
|
</bean>
|
|
|
|
<!-- ...but ACCEPT those reached through no more than a configured number of transclusion (embed) hops... -->
|
|
|
|
<bean class="org.archive.modules.deciderules.TransclusionDecideRule">
|
|
|
|
<!-- <property name="maxTransHops" value="2" /> -->
|
|
|
|
<!-- <property name="maxSpeculativeHops" value="1" /> -->
|
|
|
|
</bean>
|
|
|
|
<!-- ...but REJECT those from a configurable (initially empty) set of REJECT SURTs... -->
|
|
|
|
<bean class="org.archive.modules.deciderules.surt.SurtPrefixedDecideRule">
|
|
|
|
<property name="decision" value="REJECT"/>
|
|
|
|
<property name="seedsAsSurtPrefixes" value="false"/>
|
|
|
|
<property name="surtsDumpFile" value="${launchId}/negative-surts.dump" />
|
|
|
|
<!-- <property name="surtsSource">
|
|
|
|
<bean class="org.archive.spring.ConfigFile">
|
|
|
|
<property name="path" value="negative-surts.txt" />
|
|
|
|
</bean>
|
|
|
|
</property> -->
|
|
|
|
</bean>
|
|
|
|
<!-- ...and REJECT those from a configurable (initially empty) set of URI regexes... -->
|
|
|
|
<bean class="org.archive.modules.deciderules.MatchesListRegexDecideRule">
|
|
|
|
<property name="decision" value="REJECT"/>
|
|
|
|
<!-- <property name="listLogicalOr" value="true" /> -->
|
|
|
|
<!-- <property name="regexList">
|
|
|
|
<list>
|
|
|
|
</list>
|
|
|
|
</property> -->
|
|
|
|
</bean>
|
|
|
|
<!-- ...and REJECT those with suspicious repeating path-segments... -->
|
|
|
|
<bean class="org.archive.modules.deciderules.PathologicalPathDecideRule">
|
|
|
|
<!-- <property name="maxRepetitions" value="2" /> -->
|
|
|
|
</bean>
|
|
|
|
<!-- ...and REJECT those with more than threshold number of path-segments... -->
|
|
|
|
<bean class="org.archive.modules.deciderules.TooManyPathSegmentsDecideRule">
|
|
|
|
<!-- <property name="maxPathDepth" value="20" /> -->
|
|
|
|
</bean>
|
|
|
|
<!-- ...but always ACCEPT those marked as prerequisite for another URI... -->
|
|
|
|
<bean class="org.archive.modules.deciderules.PrerequisiteAcceptDecideRule">
|
|
|
|
</bean>
|
|
|
|
<!-- ...but always REJECT those with unsupported URI schemes -->
|
|
|
|
<bean class="org.archive.modules.deciderules.SchemeNotInSetDecideRule">
|
|
|
|
</bean>
|
|
|
|
</list>
|
|
|
|
</property>
|
|
|
|
</bean>
|
|
|
|
|
|
|
|
<!--
|
|
|
|
PROCESSING CHAINS
|
|
|
|
Much of the crawler's work is specified by the sequential
|
|
|
|
application of swappable Processor modules. These Processors
|
|
|
|
are collected into three 'chains'. The CandidateChain is applied
|
|
|
|
to URIs being considered for inclusion, before a URI is enqueued
|
|
|
|
for collection. The FetchChain is applied to URIs when their
|
|
|
|
turn for collection comes up. The DispositionChain is applied
|
|
|
|
after a URI is fetched and analyzed/link-extracted.
|
|
|
|
-->
|
|
|
|
|
|
|
|
<!-- CANDIDATE CHAIN -->
|
|
|
|
<!-- first, processors are declared as top-level named beans -->
|
|
|
|
<bean id="candidateScoper" class="org.archive.crawler.prefetch.CandidateScoper">
|
|
|
|
</bean>
|
|
|
|
<bean id="preparer" class="org.archive.crawler.prefetch.FrontierPreparer">
|
|
|
|
<!-- <property name="preferenceDepthHops" value="-1" /> -->
|
|
|
|
<!-- <property name="preferenceEmbedHops" value="1" /> -->
|
|
|
|
<!-- <property name="canonicalizationPolicy">
|
|
|
|
<ref bean="canonicalizationPolicy" />
|
|
|
|
</property> -->
|
|
|
|
<!-- <property name="queueAssignmentPolicy">
|
|
|
|
<ref bean="queueAssignmentPolicy" />
|
|
|
|
</property> -->
|
|
|
|
<!-- <property name="uriPrecedencePolicy">
|
|
|
|
<ref bean="uriPrecedencePolicy" />
|
|
|
|
</property> -->
|
|
|
|
<!-- <property name="costAssignmentPolicy">
|
|
|
|
<ref bean="costAssignmentPolicy" />
|
|
|
|
</property> -->
|
|
|
|
</bean>
|
|
|
|
<!-- now, processors are assembled into ordered CandidateChain bean -->
|
|
|
|
<bean id="candidateProcessors" class="org.archive.modules.CandidateChain">
|
|
|
|
<property name="processors">
|
|
|
|
<list>
|
|
|
|
<!-- apply scoping rules to each individual candidate URI... -->
|
|
|
|
<ref bean="candidateScoper"/>
|
|
|
|
<!-- ...then prepare those ACCEPTed to be enqueued to frontier. -->
|
|
|
|
<ref bean="preparer"/>
|
|
|
|
</list>
|
|
|
|
</property>
|
|
|
|
</bean>
|
|
|
|
|
|
|
|
<!-- FETCH CHAIN -->
|
|
|
|
<!-- first, processors are declared as top-level named beans -->
|
|
|
|
<bean id="preselector" class="org.archive.crawler.prefetch.Preselector">
|
|
|
|
<!-- <property name="recheckScope" value="false" /> -->
|
|
|
|
<!-- <property name="blockAll" value="false" /> -->
|
|
|
|
<!-- <property name="blockByRegex" value="" /> -->
|
|
|
|
<!-- <property name="allowByRegex" value="" /> -->
|
|
|
|
</bean>
|
|
|
|
<bean id="preconditions" class="org.archive.crawler.prefetch.PreconditionEnforcer">
|
|
|
|
<!-- <property name="ipValidityDurationSeconds" value="21600" /> -->
|
|
|
|
<!-- <property name="robotsValidityDurationSeconds" value="86400" /> -->
|
|
|
|
<!-- <property name="calculateRobotsOnly" value="false" /> -->
|
|
|
|
</bean>
|
|
|
|
<bean id="fetchDns" class="org.archive.modules.fetcher.FetchDNS">
|
|
|
|
<!-- <property name="acceptNonDnsResolves" value="false" /> -->
|
|
|
|
<!-- <property name="digestContent" value="true" /> -->
|
|
|
|
<!-- <property name="digestAlgorithm" value="sha1" /> -->
|
|
|
|
</bean>
|
|
|
|
<!-- <bean id="fetchWhois" class="org.archive.modules.fetcher.FetchWhois">
|
|
|
|
<property name="specialQueryTemplates">
|
|
|
|
<map>
|
|
|
|
<entry key="whois.verisign-grs.com" value="domain %s" />
|
|
|
|
<entry key="whois.arin.net" value="z + %s" />
|
|
|
|
<entry key="whois.denic.de" value="-T dn %s" />
|
|
|
|
</map>
|
|
|
|
</property>
|
|
|
|
</bean> -->
|
|
|
|
<bean id="fetchHttp" class="org.archive.modules.fetcher.FetchHTTP">
|
|
|
|
<!-- <property name="useHTTP11" value="false" /> -->
|
|
|
|
<!-- <property name="maxLengthBytes" value="0" /> -->
|
|
|
|
<!-- <property name="timeoutSeconds" value="1200" /> -->
|
|
|
|
<!-- <property name="maxFetchKBSec" value="0" /> -->
|
|
|
|
<!-- <property name="defaultEncoding" value="ISO-8859-1" /> -->
|
|
|
|
<!-- <property name="shouldFetchBodyRule">
|
|
|
|
<bean class="org.archive.modules.deciderules.AcceptDecideRule"/>
|
|
|
|
</property> -->
|
|
|
|
<!-- <property name="soTimeoutMs" value="20000" /> -->
|
|
|
|
<!-- <property name="sendIfModifiedSince" value="true" /> -->
|
|
|
|
<!-- <property name="sendIfNoneMatch" value="true" /> -->
|
|
|
|
<!-- <property name="sendConnectionClose" value="true" /> -->
|
|
|
|
<!-- <property name="sendReferer" value="true" /> -->
|
|
|
|
<!-- <property name="sendRange" value="false" /> -->
|
|
|
|
<!-- <property name="ignoreCookies" value="false" /> -->
|
|
|
|
<!-- <property name="sslTrustLevel" value="OPEN" /> -->
|
|
|
|
<!-- <property name="acceptHeaders">
|
|
|
|
<list>
|
|
|
|
<value>Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8</value>
|
|
|
|
</list>
|
|
|
|
</property>
|
|
|
|
-->
|
|
|
|
<!-- <property name="httpBindAddress" value="" /> -->
|
|
|
|
<!-- <property name="httpProxyHost" value="" /> -->
|
|
|
|
<!-- <property name="httpProxyPort" value="0" /> -->
|
|
|
|
<!-- <property name="httpProxyUser" value="" /> -->
|
|
|
|
<!-- <property name="httpProxyPassword" value="" /> -->
|
|
|
|
<!-- <property name="digestContent" value="true" /> -->
|
|
|
|
<!-- <property name="digestAlgorithm" value="sha1" /> -->
|
|
|
|
</bean>
|
|
|
|
<bean id="extractorHttp" class="org.archive.modules.extractor.ExtractorHTTP">
|
|
|
|
</bean>
|
|
|
|
<bean id="extractorHtml" class="org.archive.modules.extractor.ExtractorHTML">
|
|
|
|
<!-- <property name="extractJavascript" value="true" /> -->
|
|
|
|
<!-- <property name="extractValueAttributes" value="true" /> -->
|
|
|
|
<!-- <property name="ignoreFormActionUrls" value="false" /> -->
|
|
|
|
<!-- <property name="extractOnlyFormGets" value="true" /> -->
|
|
|
|
<!-- <property name="treatFramesAsEmbedLinks" value="true" /> -->
|
|
|
|
<!-- <property name="ignoreUnexpectedHtml" value="true" /> -->
|
|
|
|
<!-- <property name="maxElementLength" value="1024" /> -->
|
|
|
|
<!-- <property name="maxAttributeNameLength" value="1024" /> -->
|
|
|
|
<!-- <property name="maxAttributeValueLength" value="16384" /> -->
|
|
|
|
</bean>
|
|
|
|
<bean id="extractorCss" class="org.archive.modules.extractor.ExtractorCSS">
|
|
|
|
</bean>
|
|
|
|
<bean id="extractorJs" class="org.archive.modules.extractor.ExtractorJS">
|
|
|
|
</bean>
|
|
|
|
<bean id="extractorSwf" class="org.archive.modules.extractor.ExtractorSWF">
|
|
|
|
</bean>
|
|
|
|
<!-- now, processors are assembled into ordered FetchChain bean -->
|
|
|
|
<bean id="fetchProcessors" class="org.archive.modules.FetchChain">
|
|
|
|
<property name="processors">
|
|
|
|
<list>
|
|
|
|
<!-- re-check scope, if so enabled... -->
|
|
|
|
<ref bean="preselector"/>
|
|
|
|
<!-- ...then verify prerequisite URIs have been fetched (or trigger their fetch) before allowing crawling... -->
|
|
|
|
<ref bean="preconditions"/>
|
|
|
|
<!-- ...fetch if DNS URI... -->
|
|
|
|
<ref bean="fetchDns"/>
|
|
|
|
<!-- <ref bean="fetchWhois"/> -->
|
|
|
|
<!-- ...fetch if HTTP URI... -->
|
|
|
|
<ref bean="fetchHttp"/>
|
|
|
|
<!-- ...extract outlinks from HTTP headers... -->
|
|
|
|
<ref bean="extractorHttp"/>
|
|
|
|
<!-- ...extract outlinks from HTML content... -->
|
|
|
|
<ref bean="extractorHtml"/>
|
|
|
|
<!-- ...extract outlinks from CSS content... -->
|
|
|
|
<ref bean="extractorCss"/>
|
|
|
|
<!-- ...extract outlinks from Javascript content... -->
|
|
|
|
<ref bean="extractorJs"/>
|
|
|
|
<!-- ...extract outlinks from Flash content... -->
|
|
|
|
<ref bean="extractorSwf"/>
|
|
|
|
</list>
|
|
|
|
</property>
|
|
|
|
</bean>
|
|
|
|
|
|
|
|
<!-- DISPOSITION CHAIN -->
|
|
|
|
<!-- first, processors are declared as top-level named beans -->
|
|
|
|
<bean id="warcWriter" class="org.archive.modules.writer.WARCWriterProcessor">
|
2012-04-22 15:59:39 +00:00
|
|
|
<property name="compress" value="false" />
|
2012-04-22 14:27:16 +00:00
|
|
|
<!-- <property name="prefix" value="IAH" /> -->
|
|
|
|
<!-- <property name="suffix" value="${HOSTNAME}" /> -->
|
|
|
|
<!-- <property name="maxFileSizeBytes" value="1000000000" /> -->
|
|
|
|
<!-- <property name="poolMaxActive" value="1" /> -->
|
|
|
|
<!-- <property name="MaxWaitForIdleMs" value="500" /> -->
|
|
|
|
<!-- <property name="skipIdenticalDigests" value="false" /> -->
|
|
|
|
<!-- <property name="maxTotalBytesToWrite" value="0" /> -->
|
|
|
|
<!-- <property name="directory" value="${launchId}" /> -->
|
|
|
|
<!-- <property name="storePaths">
|
|
|
|
<list>
|
|
|
|
<value>warcs</value>
|
|
|
|
</list>
|
|
|
|
</property> -->
|
|
|
|
<!-- <property name="writeRequests" value="true" /> -->
|
|
|
|
<!-- <property name="writeMetadata" value="true" /> -->
|
|
|
|
<!-- <property name="writeRevisitForIdenticalDigests" value="true" /> -->
|
|
|
|
<!-- <property name="writeRevisitForNotModified" value="true" /> -->
|
|
|
|
</bean>
|
|
|
|
<bean id="candidates" class="org.archive.crawler.postprocessor.CandidatesProcessor">
|
|
|
|
<!-- <property name="seedsRedirectNewSeeds" value="true" /> -->
|
|
|
|
</bean>
|
|
|
|
<bean id="disposition" class="org.archive.crawler.postprocessor.DispositionProcessor">
|
|
|
|
<!-- <property name="delayFactor" value="5.0" /> -->
|
|
|
|
<!-- <property name="minDelayMs" value="3000" /> -->
|
|
|
|
<!-- <property name="respectCrawlDelayUpToSeconds" value="300" /> -->
|
|
|
|
<!-- <property name="maxDelayMs" value="30000" /> -->
|
|
|
|
<!-- <property name="maxPerHostBandwidthUsageKbSec" value="0" /> -->
|
|
|
|
</bean>
|
|
|
|
<!-- <bean id="rescheduler" class="org.archive.crawler.postprocessor.ReschedulingProcessor">
|
|
|
|
<property name="rescheduleDelaySeconds" value="-1" />
|
|
|
|
</bean> -->
|
|
|
|
<!-- now, processors are assembled into ordered DispositionChain bean -->
|
|
|
|
<bean id="dispositionProcessors" class="org.archive.modules.DispositionChain">
|
|
|
|
<property name="processors">
|
|
|
|
<list>
|
|
|
|
<!-- write to aggregate archival files... -->
|
|
|
|
<ref bean="warcWriter"/>
|
|
|
|
<!-- ...send each outlink candidate URI to CandidateChain,
|
|
|
|
and enqueue those ACCEPTed to the frontier... -->
|
|
|
|
<ref bean="candidates"/>
|
|
|
|
<!-- ...then update stats, shared-structures, frontier decisions -->
|
|
|
|
<ref bean="disposition"/>
|
|
|
|
<!-- <ref bean="rescheduler" /> -->
|
|
|
|
</list>
|
|
|
|
</property>
|
|
|
|
</bean>
|
|
|
|
|
|
|
|
<!-- CRAWLCONTROLLER: Control interface, unifying context -->
|
|
|
|
<bean id="crawlController"
|
|
|
|
class="org.archive.crawler.framework.CrawlController">
|
|
|
|
<!-- <property name="maxToeThreads" value="25" /> -->
|
|
|
|
<!-- <property name="pauseAtStart" value="true" /> -->
|
|
|
|
<!-- <property name="runWhileEmpty" value="false" /> -->
|
|
|
|
<!-- <property name="recorderInBufferBytes" value="524288" /> -->
|
|
|
|
<!-- <property name="recorderOutBufferBytes" value="16384" /> -->
|
|
|
|
<!-- <property name="scratchDir" value="scratch" /> -->
|
|
|
|
</bean>
|
|
|
|
|
|
|
|
<!-- FRONTIER: Record of all URIs discovered and queued-for-collection -->
|
|
|
|
<bean id="frontier"
|
|
|
|
class="org.archive.crawler.frontier.BdbFrontier">
|
|
|
|
<!-- <property name="queueTotalBudget" value="-1" /> -->
|
|
|
|
<!-- <property name="balanceReplenishAmount" value="3000" /> -->
|
|
|
|
<!-- <property name="errorPenaltyAmount" value="100" /> -->
|
|
|
|
<!-- <property name="precedenceFloor" value="255" /> -->
|
|
|
|
<!-- <property name="queuePrecedencePolicy">
|
|
|
|
<bean class="org.archive.crawler.frontier.precedence.BaseQueuePrecedencePolicy" />
|
|
|
|
</property> -->
|
|
|
|
<!-- <property name="snoozeLongMs" value="300000" /> -->
|
|
|
|
<!-- <property name="retryDelaySeconds" value="900" /> -->
|
|
|
|
<!-- <property name="maxRetries" value="30" /> -->
|
|
|
|
<!-- <property name="recoveryLogEnabled" value="true" /> -->
|
|
|
|
<!-- <property name="maxOutlinks" value="6000" /> -->
|
|
|
|
<!-- <property name="extractIndependently" value="false" /> -->
|
|
|
|
<!-- <property name="outbound">
|
|
|
|
<bean class="java.util.concurrent.ArrayBlockingQueue">
|
|
|
|
<constructor-arg value="200"/>
|
|
|
|
<constructor-arg value="true"/>
|
|
|
|
</bean>
|
|
|
|
</property> -->
|
|
|
|
<!-- <property name="inbound">
|
|
|
|
<bean class="java.util.concurrent.ArrayBlockingQueue">
|
|
|
|
<constructor-arg value="40000"/>
|
|
|
|
<constructor-arg value="true"/>
|
|
|
|
</bean>
|
|
|
|
</property> -->
|
|
|
|
<!-- <property name="dumpPendingAtClose" value="false" /> -->
|
|
|
|
</bean>
|
|
|
|
|
|
|
|
<!-- URI UNIQ FILTER: Used by frontier to remember already-included URIs -->
|
|
|
|
<bean id="uriUniqFilter"
|
|
|
|
class="org.archive.crawler.util.BdbUriUniqFilter">
|
|
|
|
</bean>
|
|
|
|
|
|
|
|
<!--
|
|
|
|
EXAMPLE SETTINGS OVERLAY SHEETS
|
|
|
|
Sheets allow some settings to vary by context - usually by URI context,
|
|
|
|
so that different sites or sections of sites can be treated differently.
|
|
|
|
Here are some example Sheets for common purposes. The SheetOverlaysManager
|
|
|
|
(below) automatically collects all Sheet instances declared among the
|
|
|
|
original beans, but others can be added during the crawl via the scripting
|
|
|
|
interface.
|
|
|
|
-->
|
|
|
|
|
|
|
|
<!-- forceRetire: any URI to which this sheet's settings are applied
|
|
|
|
will force its containing queue to 'retired' status. -->
|
|
|
|
<bean id='forceRetire' class='org.archive.spring.Sheet'>
|
|
|
|
<property name='map'>
|
|
|
|
<map>
|
|
|
|
<entry key='disposition.forceRetire' value='true'/>
|
|
|
|
</map>
|
|
|
|
</property>
|
|
|
|
</bean>
|
|
|
|
|
|
|
|
<!-- smallBudget: any URI to which this sheet's settings are applied
|
|
|
|
will give its containing queue small values for balanceReplenishAmount
|
|
|
|
(causing it to have shorter 'active' periods while other queues are
|
|
|
|
waiting) and queueTotalBudget (causing the queue to enter 'retired'
|
|
|
|
status once that expenditure is reached by URI attempts and errors) -->
|
|
|
|
<bean id='smallBudget' class='org.archive.spring.Sheet'>
|
|
|
|
<property name='map'>
|
|
|
|
<map>
|
|
|
|
<entry key='frontier.balanceReplenishAmount' value='20'/>
|
|
|
|
<entry key='frontier.queueTotalBudget' value='100'/>
|
|
|
|
</map>
|
|
|
|
</property>
|
|
|
|
</bean>
|
|
|
|
|
|
|
|
<!-- veryPolite: any URI to which this sheet's settings are applied
|
|
|
|
will cause its queue to take extra-long politeness snoozes -->
|
|
|
|
<bean id='veryPolite' class='org.archive.spring.Sheet'>
|
|
|
|
<property name='map'>
|
|
|
|
<map>
|
|
|
|
<entry key='disposition.delayFactor' value='10'/>
|
|
|
|
<entry key='disposition.minDelayMs' value='10000'/>
|
|
|
|
<entry key='disposition.maxDelayMs' value='1000000'/>
|
|
|
|
<entry key='disposition.respectCrawlDelayUpToSeconds' value='3600'/>
|
|
|
|
</map>
|
|
|
|
</property>
|
|
|
|
</bean>
|
|
|
|
|
|
|
|
<!-- highPrecedence: any URI to which this sheet's settings are applied
|
|
|
|
will give its containing queue a slightly-higher than default
|
|
|
|
queue precedence value. That queue will then be preferred over
|
|
|
|
other queues for active crawling, never waiting behind lower-
|
|
|
|
precedence queues. -->
|
|
|
|
<bean id='highPrecedence' class='org.archive.spring.Sheet'>
<!-- NOTE(review): the two entries below are identical to those of the smallBudget sheet; they set queue budget values and no precedence value, so this sheet does not appear to implement the higher-precedence behaviour described for it. TODO confirm the intended override key before associating this sheet with any URIs. -->
|
|
|
|
<property name='map'>
|
|
|
|
<map>
|
|
|
|
<entry key='frontier.balanceReplenishAmount' value='20'/>
|
|
|
|
<entry key='frontier.queueTotalBudget' value='100'/>
|
|
|
|
</map>
|
|
|
|
</property>
|
|
|
|
</bean>
|
|
|
|
|
|
|
|
<!--
|
|
|
|
EXAMPLE SETTINGS OVERLAY SHEET-ASSOCIATION
|
|
|
|
A SheetAssociation says certain URIs should have certain overlay Sheets
|
|
|
|
applied. This example applies two sheets to URIs matching two SURT-prefixes.
|
|
|
|
New associations may also be added mid-crawl using the scripting facility.
|
|
|
|
-->
|
|
|
|
|
|
|
|
<!--
|
|
|
|
<bean class='org.archive.crawler.spring.SurtPrefixesSheetAssociation'>
|
|
|
|
<property name='surtPrefixes'>
|
|
|
|
<list>
|
|
|
|
<value>http://(org,example,</value>
|
|
|
|
<value>http://(com,example,www,)/</value>
|
|
|
|
</list>
|
|
|
|
</property>
|
|
|
|
<property name='targetSheetNames'>
|
|
|
|
<list>
|
|
|
|
<value>veryPolite</value>
|
|
|
|
<value>smallBudget</value>
|
|
|
|
</list>
|
|
|
|
</property>
|
|
|
|
</bean>
|
|
|
|
-->
|
|
|
|
|
|
|
|
<!--
|
|
|
|
OPTIONAL BUT RECOMMENDED BEANS
|
|
|
|
-->
|
|
|
|
|
|
|
|
<!-- ACTIONDIRECTORY: disk directory for mid-crawl operations
|
|
|
|
Running job will watch directory for new files with URIs,
|
|
|
|
scripts, and other data to be processed during a crawl. -->
|
|
|
|
<bean id="actionDirectory" class="org.archive.crawler.framework.ActionDirectory">
|
|
|
|
<!-- <property name="actionDir" value="action" /> -->
|
|
|
|
<!-- <property name="doneDir" value="${launchId}/actions-done" /> -->
|
|
|
|
<!-- <property name="initialDelaySeconds" value="10" /> -->
|
|
|
|
<!-- <property name="delaySeconds" value="30" /> -->
|
|
|
|
</bean>
|
|
|
|
|
|
|
|
<!-- CRAWLLIMITENFORCER: stops crawl when it reaches configured limits -->
|
|
|
|
<bean id="crawlLimiter" class="org.archive.crawler.framework.CrawlLimitEnforcer">
|
|
|
|
<!-- <property name="maxBytesDownload" value="0" /> -->
|
|
|
|
<!-- <property name="maxDocumentsDownload" value="0" /> -->
|
|
|
|
<!-- <property name="maxTimeSeconds" value="0" /> -->
|
|
|
|
</bean>
|
|
|
|
|
|
|
|
<!-- CHECKPOINTSERVICE: checkpointing assistance -->
|
|
|
|
<bean id="checkpointService"
|
|
|
|
class="org.archive.crawler.framework.CheckpointService">
|
|
|
|
<!-- <property name="checkpointIntervalMinutes" value="-1"/> -->
|
|
|
|
<!-- <property name="checkpointsDir" value="checkpoints"/> -->
|
|
|
|
</bean>
|
|
|
|
|
|
|
|
<!--
|
|
|
|
OPTIONAL BEANS
|
|
|
|
Uncomment and expand as needed, or if non-default alternate
|
|
|
|
implementations are preferred.
|
|
|
|
-->
|
|
|
|
|
|
|
|
<!-- CANONICALIZATION POLICY -->
|
|
|
|
<!--
|
|
|
|
<bean id="canonicalizationPolicy"
|
|
|
|
class="org.archive.modules.canonicalize.RulesCanonicalizationPolicy">
|
|
|
|
<property name="rules">
|
|
|
|
<list>
|
|
|
|
<bean class="org.archive.modules.canonicalize.LowercaseRule" />
|
|
|
|
<bean class="org.archive.modules.canonicalize.StripUserinfoRule" />
|
|
|
|
<bean class="org.archive.modules.canonicalize.StripWWWNRule" />
|
|
|
|
<bean class="org.archive.modules.canonicalize.StripSessionIDs" />
|
|
|
|
<bean class="org.archive.modules.canonicalize.StripSessionCFIDs" />
|
|
|
|
<bean class="org.archive.modules.canonicalize.FixupQueryString" />
|
|
|
|
</list>
|
|
|
|
</property>
|
|
|
|
</bean>
|
|
|
|
-->
|
|
|
|
|
|
|
|
|
|
|
|
<!-- QUEUE ASSIGNMENT POLICY -->
|
|
|
|
<!--
|
|
|
|
<bean id="queueAssignmentPolicy"
|
|
|
|
class="org.archive.crawler.frontier.SurtAuthorityQueueAssignmentPolicy">
|
|
|
|
<property name="forceQueueAssignment" value="" />
|
|
|
|
<property name="deferToPrevious" value="true" />
|
|
|
|
<property name="parallelQueues" value="1" />
|
|
|
|
</bean>
|
|
|
|
-->
|
|
|
|
|
|
|
|
<!-- URI PRECEDENCE POLICY -->
|
|
|
|
<!--
|
|
|
|
<bean id="uriPrecedencePolicy"
|
|
|
|
class="org.archive.crawler.frontier.precedence.CostUriPrecedencePolicy">
|
|
|
|
</bean>
|
|
|
|
-->
|
|
|
|
|
|
|
|
<!-- COST ASSIGNMENT POLICY -->
|
|
|
|
<!--
|
|
|
|
<bean id="costAssignmentPolicy"
|
|
|
|
class="org.archive.crawler.frontier.UnitCostAssignmentPolicy">
|
|
|
|
</bean>
|
|
|
|
-->
|
|
|
|
|
|
|
|
<!-- CREDENTIAL STORE: HTTP authentication or FORM POST credentials -->
|
|
|
|
<!--
|
|
|
|
<bean id="credentialStore"
|
|
|
|
class="org.archive.modules.credential.CredentialStore">
|
|
|
|
</bean>
|
|
|
|
-->
|
|
|
|
|
|
|
|
<!-- DISK SPACE MONITOR:
|
|
|
|
Pauses the crawl if disk space at monitored paths falls below minimum threshold -->
|
|
|
|
<!--
|
|
|
|
<bean id="diskSpaceMonitor" class="org.archive.crawler.monitor.DiskSpaceMonitor">
|
|
|
|
<property name="pauseThresholdMiB" value="500" />
|
|
|
|
<property name="monitorConfigPaths" value="true" />
|
|
|
|
<property name="monitorPaths">
|
|
|
|
<list>
|
|
|
|
<value>PATH</value>
|
|
|
|
</list>
|
|
|
|
</property>
|
|
|
|
</bean>
|
|
|
|
-->
|
|
|
|
|
|
|
|
<!--
|
|
|
|
REQUIRED STANDARD BEANS
|
|
|
|
It will be very rare to replace or reconfigure the following beans.
|
|
|
|
-->
|
|
|
|
|
|
|
|
<!-- STATISTICSTRACKER: standard stats/reporting collector -->
|
|
|
|
<bean id="statisticsTracker"
|
|
|
|
class="org.archive.crawler.reporting.StatisticsTracker" autowire="byName">
|
|
|
|
<!-- <property name="reports">
|
|
|
|
<list>
|
|
|
|
<bean id="crawlSummaryReport" class="org.archive.crawler.reporting.CrawlSummaryReport" />
|
|
|
|
<bean id="seedsReport" class="org.archive.crawler.reporting.SeedsReport" />
|
|
|
|
<bean id="hostsReport" class="org.archive.crawler.reporting.HostsReport" />
|
|
|
|
<bean id="sourceTagsReport" class="org.archive.crawler.reporting.SourceTagsReport" />
|
|
|
|
<bean id="mimetypesReport" class="org.archive.crawler.reporting.MimetypesReport" />
|
|
|
|
<bean id="responseCodeReport" class="org.archive.crawler.reporting.ResponseCodeReport" />
|
|
|
|
<bean id="processorsReport" class="org.archive.crawler.reporting.ProcessorsReport" />
|
|
|
|
<bean id="frontierSummaryReport" class="org.archive.crawler.reporting.FrontierSummaryReport" />
|
|
|
|
<bean id="frontierNonemptyReport" class="org.archive.crawler.reporting.FrontierNonemptyReport" />
|
|
|
|
<bean id="toeThreadsReport" class="org.archive.crawler.reporting.ToeThreadsReport" />
|
|
|
|
</list>
|
|
|
|
</property> -->
|
|
|
|
<!-- <property name="reportsDir" value="${launchId}/reports" /> -->
|
|
|
|
<!-- <property name="liveHostReportSize" value="20" /> -->
|
|
|
|
<!-- <property name="intervalSeconds" value="20" /> -->
|
|
|
|
<!-- <property name="keepSnapshotsCount" value="5" /> -->
|
|
|
|
<!-- <property name="liveHostReportSize" value="20" /> -->
|
|
|
|
</bean>
|
|
|
|
|
|
|
|
<!-- CRAWLERLOGGERMODULE: shared logging facility -->
|
|
|
|
<bean id="loggerModule"
|
|
|
|
class="org.archive.crawler.reporting.CrawlerLoggerModule">
|
|
|
|
<!-- <property name="path" value="${launchId}/logs" /> -->
|
|
|
|
<!-- <property name="crawlLogPath" value="crawl.log" /> -->
|
|
|
|
<!-- <property name="alertsLogPath" value="alerts.log" /> -->
|
|
|
|
<!-- <property name="progressLogPath" value="progress-statistics.log" /> -->
|
|
|
|
<!-- <property name="uriErrorsLogPath" value="uri-errors.log" /> -->
|
|
|
|
<!-- <property name="runtimeErrorsLogPath" value="runtime-errors.log" /> -->
|
|
|
|
<!-- <property name="nonfatalErrorsLogPath" value="nonfatal-errors.log" /> -->
|
|
|
|
<!-- <property name="logExtraInfo" value="false" /> -->
|
|
|
|
</bean>
|
|
|
|
|
|
|
|
<!-- SHEETOVERLAYSMANAGER: manager of sheets of contextual overlays
|
|
|
|
Autowired to include any SheetAssociation beans
|
|
|
|
(for example SurtPrefixesSheetAssociation) -->
|
|
|
|
<bean id="sheetOverlaysManager" autowire="byType"
|
|
|
|
class="org.archive.crawler.spring.SheetOverlaysManager">
|
|
|
|
</bean>
|
|
|
|
|
|
|
|
<!-- BDBMODULE: shared BDB-JE disk persistence manager -->
|
|
|
|
<bean id="bdb"
|
|
|
|
class="org.archive.bdb.BdbModule">
|
|
|
|
<!-- <property name="dir" value="state" /> -->
|
|
|
|
<!-- <property name="cachePercent" value="60" /> -->
|
|
|
|
<!-- <property name="useSharedCache" value="true" /> -->
|
|
|
|
<!-- <property name="expectedConcurrency" value="25" /> -->
|
|
|
|
</bean>
|
|
|
|
|
|
|
|
<!-- BDBCOOKIESTORAGE: disk-based cookie storage for FetchHTTP -->
|
|
|
|
<bean id="cookieStorage"
|
|
|
|
class="org.archive.modules.fetcher.BdbCookieStorage">
|
|
|
|
<!-- <property name="cookiesLoadFile"><null/></property> -->
|
|
|
|
<!-- <property name="cookiesSaveFile"><null/></property> -->
|
|
|
|
<!-- <property name="bdb">
|
|
|
|
<ref bean="bdb"/>
|
|
|
|
</property> -->
|
|
|
|
</bean>
|
|
|
|
|
|
|
|
<!-- SERVERCACHE: shared cache of server/host info -->
|
|
|
|
<bean id="serverCache"
|
|
|
|
class="org.archive.modules.net.BdbServerCache">
|
|
|
|
<!-- <property name="bdb">
|
|
|
|
<ref bean="bdb"/>
|
|
|
|
</property> -->
|
|
|
|
</bean>
|
|
|
|
|
|
|
|
<!-- CONFIG PATH CONFIGURER: required helper making crawl paths relative
|
|
|
|
to crawler-beans.cxml file, and tracking crawl files for web UI -->
|
|
|
|
<bean id="configPathConfigurer"
|
|
|
|
class="org.archive.spring.ConfigPathConfigurer">
|
|
|
|
</bean>
|
|
|
|
|
|
|
|
</beans>
|