Now building and launching Heritrix jobs...

2012-04-21 23:42:16 +02:00 · 2012-04-21 23:42:16 +02:00 · 57daa703da
parent be2f974a4c
commit 57daa703da
2 changed files with 755 additions and 2 deletions
--- a/archiver/pipelines/actions/cxml.xslt
+++ b/archiver/pipelines/actions/cxml.xslt
@ -0,0 +1,695 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- 
+  HERITRIX 3 CRAWL JOB CONFIGURATION FILE
+  
+   This is a relatively minimal configuration suitable for many crawls.
+   
+   Commented-out beans and properties are provided as an example; values
+   shown in comments reflect the actual defaults which are in effect
+   if not otherwise specified. (To change from the default 
+   behavior, uncomment AND alter the shown values.)   
+ -->
+<beans xsl:version="2.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+ xmlns="http://www.springframework.org/schema/beans"
+	     xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xmlns:context="http://www.springframework.org/schema/context"
+	     xmlns:aop="http://www.springframework.org/schema/aop"
+	     xmlns:tx="http://www.springframework.org/schema/tx"
+	     xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans-3.0.xsd
+           http://www.springframework.org/schema/aop http://www.springframework.org/schema/aop/spring-aop-3.0.xsd
+           http://www.springframework.org/schema/tx http://www.springframework.org/schema/tx/spring-tx-3.0.xsd
+           http://www.springframework.org/schema/context http://www.springframework.org/schema/context/spring-context-3.0.xsd">
+ 
+ <context:annotation-config/>
+
+<!-- 
+  OVERRIDES
+   Values elsewhere in the configuration may be replaced ('overridden') 
+   by a Properties map declared in a PropertiesOverrideConfigurer, 
+   using a dotted-bean-path to address individual bean properties. 
+   This allows us to collect a few of the most-often changed values
+   in an easy-to-edit format here at the beginning of the model
+   configuration.    
+ -->
+ <!-- overrides from a text property list -->
+ <bean id="simpleOverrides" class="org.springframework.beans.factory.config.PropertyOverrideConfigurer">
+  <property name="properties">
+   <value>
+# This Properties map is specified in the Java 'property list' text format
+# http://java.sun.com/javase/6/docs/api/java/util/Properties.html#load%28java.io.Reader%29
+
+metadata.operatorContactUrl=http://owark.org
+metadata.jobName=basic
+metadata.description=Basic crawl starting with useful defaults
+
+##..more?..##
+   </value>
+  </property>
+ </bean>
+
+ <!-- overrides from declared <prop> elements, more easily allowing
+      multiline values or even declared beans -->
+ <bean id="longerOverrides" class="org.springframework.beans.factory.config.PropertyOverrideConfigurer">
+  <property name="properties">
+   <props>
+    <prop key="seeds.textSource.value">
+
+    <xsl:value-of select="/action/@url"/>
+
+    </prop>
+   </props>
+  </property>
+ </bean>
+
+ <!-- CRAWL METADATA: including identification of crawler/operator -->
+ <bean id="metadata" class="org.archive.modules.CrawlMetadata" autowire="byName">
+       <property name="operatorContactUrl" value="[see override above]"/>
+       <property name="jobName" value="[see override above]"/>
+       <property name="description" value="[see override above]"/>
+  <!-- <property name="robotsPolicyName" value="obey"/> -->
+  <!-- <property name="operator" value=""/> -->
+  <!-- <property name="operatorFrom" value=""/> -->
+  <!-- <property name="organization" value=""/> -->
+  <!-- <property name="audience" value=""/> -->
+  <!-- <property name="userAgentTemplate" 
+         value="Mozilla/5.0 (compatible; heritrix/@VERSION@ +@OPERATOR_CONTACT_URL@)"/> -->
+       
+ </bean>
+ 
+ <!-- SEEDS: crawl starting points 
+      ConfigString allows simple, inline specification of a moderate
+      number of seeds; see below comment for example of using an
+      arbitrarily-large external file. -->
+ <bean id="seeds" class="org.archive.modules.seeds.TextSeedModule">
+     <property name="textSource">
+      <bean class="org.archive.spring.ConfigString">
+       <property name="value">
+        <value>
+# [see override above]
+        </value>
+       </property>
+      </bean>
+     </property>
+<!-- <property name='sourceTagSeeds' value='false'/> -->
+<!-- <property name='blockAwaitingSeedLines' value='-1'/> -->
+ </bean>
+ 
+ <!-- SEEDS ALTERNATE APPROACH: specifying external seeds.txt file in
+      the job directory, similar to the H1 approach. 
+      Use either the above, or this, but not both. -->
+ <!-- 
+ <bean id="seeds" class="org.archive.modules.seeds.TextSeedModule">
+  <property name="textSource">
+   <bean class="org.archive.spring.ConfigFile">
+    <property name="path" value="seeds.txt" />
+   </bean>
+  </property>
+  <property name='sourceTagSeeds' value='false'/>
+  <property name='blockAwaitingSeedLines' value='-1'/>
+ </bean>
+  -->
+ 
+ <!-- SCOPE: rules for which discovered URIs to crawl; order is very 
+      important because last decision returned other than 'NONE' wins. -->
+ <bean id="scope" class="org.archive.modules.deciderules.DecideRuleSequence">
+  <!-- <property name="logToFile" value="false" /> -->
+  <property name="rules">
+   <list>
+    <!-- Begin by REJECTing all... -->
+    <bean class="org.archive.modules.deciderules.RejectDecideRule">
+    </bean>
+    <!-- ...then ACCEPT those within configured/seed-implied SURT prefixes... -->
+    <bean class="org.archive.modules.deciderules.surt.SurtPrefixedDecideRule">
+     <!-- <property name="seedsAsSurtPrefixes" value="true" /> -->
+     <!-- <property name="alsoCheckVia" value="false" /> -->
+     <!-- <property name="surtsSourceFile" value="" /> -->
+     <!-- <property name="surtsDumpFile" value="${launchId}/surts.dump" /> -->
+     <!-- <property name="surtsSource">
+           <bean class="org.archive.spring.ConfigString">
+            <property name="value">
+             <value>
+              # example.com
+              # http://www.example.edu/path1/
+              # +http://(org,example,
+             </value>
+            </property> 
+           </bean>
+          </property> -->
+    </bean>
+    <!-- ...but REJECT those more than a configured link-hop-count from start... -->
+    <bean class="org.archive.modules.deciderules.TooManyHopsDecideRule">
+      <property name="maxHops" value="0" /> 
+    </bean>
+    <!-- ...but ACCEPT those within a configured number of transclusion/speculative hops (see maxTransHops, maxSpeculativeHops)... -->
+    <bean class="org.archive.modules.deciderules.TransclusionDecideRule">
+     <!-- <property name="maxTransHops" value="2" /> -->
+     <!-- <property name="maxSpeculativeHops" value="1" /> -->
+    </bean>
+    <!-- ...but REJECT those from a configurable (initially empty) set of REJECT SURTs... -->
+    <bean class="org.archive.modules.deciderules.surt.SurtPrefixedDecideRule">
+          <property name="decision" value="REJECT"/>
+          <property name="seedsAsSurtPrefixes" value="false"/>
+          <property name="surtsDumpFile" value="${{launchId}}/negative-surts.dump" /> <!-- braces doubled: this file is an XSLT stylesheet, so single braces would be evaluated as an attribute value template and the placeholder would be lost -->
+     <!-- <property name="surtsSource">
+           <bean class="org.archive.spring.ConfigFile">
+            <property name="path" value="negative-surts.txt" />
+           </bean>
+          </property> -->
+    </bean>
+    <!-- ...and REJECT those from a configurable (initially empty) set of URI regexes... -->
+    <bean class="org.archive.modules.deciderules.MatchesListRegexDecideRule">
+          <property name="decision" value="REJECT"/>
+     <!-- <property name="listLogicalOr" value="true" /> -->
+     <!-- <property name="regexList">
+           <list>
+           </list>
+          </property> -->
+    </bean>
+    <!-- ...and REJECT those with suspicious repeating path-segments... -->
+    <bean class="org.archive.modules.deciderules.PathologicalPathDecideRule">
+     <!-- <property name="maxRepetitions" value="2" /> -->
+    </bean>
+    <!-- ...and REJECT those with more than threshold number of path-segments... -->
+    <bean class="org.archive.modules.deciderules.TooManyPathSegmentsDecideRule">
+     <!-- <property name="maxPathDepth" value="20" /> -->
+    </bean>
+    <!-- ...but always ACCEPT those marked as prerequisite for another URI... -->
+    <bean class="org.archive.modules.deciderules.PrerequisiteAcceptDecideRule">
+    </bean>
+    <!-- ...but always REJECT those with unsupported URI schemes -->
+    <bean class="org.archive.modules.deciderules.SchemeNotInSetDecideRule">
+    </bean>
+   </list>
+  </property>
+ </bean>
+ 
+ <!-- 
+   PROCESSING CHAINS
+    Much of the crawler's work is specified by the sequential 
+    application of swappable Processor modules. These Processors
+    are collected into three 'chains'. The CandidateChain is applied 
+    to URIs being considered for inclusion, before a URI is enqueued
+    for collection. The FetchChain is applied to URIs when their 
+    turn for collection comes up. The DispositionChain is applied 
+    after a URI is fetched and analyzed/link-extracted.
+  -->
+  
+ <!-- CANDIDATE CHAIN --> 
+ <!-- first, processors are declared as top-level named beans -->
+ <bean id="candidateScoper" class="org.archive.crawler.prefetch.CandidateScoper">
+ </bean>
+ <bean id="preparer" class="org.archive.crawler.prefetch.FrontierPreparer">
+  <!-- <property name="preferenceDepthHops" value="-1" /> -->
+  <!-- <property name="preferenceEmbedHops" value="1" /> -->
+  <!-- <property name="canonicalizationPolicy"> 
+        <ref bean="canonicalizationPolicy" />
+       </property> -->
+  <!-- <property name="queueAssignmentPolicy"> 
+        <ref bean="queueAssignmentPolicy" />
+       </property> -->
+  <!-- <property name="uriPrecedencePolicy"> 
+        <ref bean="uriPrecedencePolicy" />
+       </property> -->
+  <!-- <property name="costAssignmentPolicy"> 
+        <ref bean="costAssignmentPolicy" />
+       </property> -->
+ </bean>
+ <!-- now, processors are assembled into ordered CandidateChain bean -->
+ <bean id="candidateProcessors" class="org.archive.modules.CandidateChain">
+  <property name="processors">
+   <list>
+    <!-- apply scoping rules to each individual candidate URI... -->
+    <ref bean="candidateScoper"/>
+    <!-- ...then prepare those ACCEPTed to be enqueued to frontier. -->
+    <ref bean="preparer"/>
+   </list>
+  </property>
+ </bean>
+  
+ <!-- FETCH CHAIN --> 
+ <!-- first, processors are declared as top-level named beans -->
+ <bean id="preselector" class="org.archive.crawler.prefetch.Preselector">
+  <!-- <property name="recheckScope" value="false" /> -->
+  <!-- <property name="blockAll" value="false" /> -->
+  <!-- <property name="blockByRegex" value="" /> -->
+  <!-- <property name="allowByRegex" value="" /> -->
+ </bean>
+ <bean id="preconditions" class="org.archive.crawler.prefetch.PreconditionEnforcer">
+  <!-- <property name="ipValidityDurationSeconds" value="21600" /> -->
+  <!-- <property name="robotsValidityDurationSeconds" value="86400" /> -->
+  <!-- <property name="calculateRobotsOnly" value="false" /> -->
+ </bean>
+ <bean id="fetchDns" class="org.archive.modules.fetcher.FetchDNS">
+  <!-- <property name="acceptNonDnsResolves" value="false" /> -->
+  <!-- <property name="digestContent" value="true" /> -->
+  <!-- <property name="digestAlgorithm" value="sha1" /> -->
+ </bean>
+ <!-- <bean id="fetchWhois" class="org.archive.modules.fetcher.FetchWhois">
+       <property name="specialQueryTemplates">
+        <map>
+         <entry key="whois.verisign-grs.com" value="domain %s" />
+         <entry key="whois.arin.net" value="z + %s" />
+         <entry key="whois.denic.de" value="-T dn %s" />
+        </map>
+       </property> 
+      </bean> -->
+ <bean id="fetchHttp" class="org.archive.modules.fetcher.FetchHTTP">
+  <!-- <property name="useHTTP11" value="false" /> -->
+  <!-- <property name="maxLengthBytes" value="0" /> -->
+  <!-- <property name="timeoutSeconds" value="1200" /> -->
+  <!-- <property name="maxFetchKBSec" value="0" /> -->
+  <!-- <property name="defaultEncoding" value="ISO-8859-1" /> -->
+  <!-- <property name="shouldFetchBodyRule"> 
+        <bean class="org.archive.modules.deciderules.AcceptDecideRule"/>
+       </property> -->
+  <!-- <property name="soTimeoutMs" value="20000" /> -->
+  <!-- <property name="sendIfModifiedSince" value="true" /> -->
+  <!-- <property name="sendIfNoneMatch" value="true" /> -->
+  <!-- <property name="sendConnectionClose" value="true" /> -->
+  <!-- <property name="sendReferer" value="true" /> -->
+  <!-- <property name="sendRange" value="false" /> -->
+  <!-- <property name="ignoreCookies" value="false" /> -->
+  <!-- <property name="sslTrustLevel" value="OPEN" /> -->
+  <!-- <property name="acceptHeaders"> 
+        <list>
+         <value>Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8</value>
+        </list>
+       </property>
+  -->
+  <!-- <property name="httpBindAddress" value="" /> -->
+  <!-- <property name="httpProxyHost" value="" /> -->
+  <!-- <property name="httpProxyPort" value="0" /> -->
+  <!-- <property name="httpProxyUser" value="" /> -->
+  <!-- <property name="httpProxyPassword" value="" /> -->
+  <!-- <property name="digestContent" value="true" /> -->
+  <!-- <property name="digestAlgorithm" value="sha1" /> -->
+ </bean>
+ <bean id="extractorHttp" class="org.archive.modules.extractor.ExtractorHTTP">
+ </bean>
+ <bean id="extractorHtml" class="org.archive.modules.extractor.ExtractorHTML">
+  <!-- <property name="extractJavascript" value="true" /> -->
+  <!-- <property name="extractValueAttributes" value="true" /> -->
+  <!-- <property name="ignoreFormActionUrls" value="false" /> -->
+  <!-- <property name="extractOnlyFormGets" value="true" /> -->
+  <!-- <property name="treatFramesAsEmbedLinks" value="true" /> -->
+  <!-- <property name="ignoreUnexpectedHtml" value="true" /> -->
+  <!-- <property name="maxElementLength" value="1024" /> -->
+  <!-- <property name="maxAttributeNameLength" value="1024" /> -->
+  <!-- <property name="maxAttributeValueLength" value="16384" /> -->
+ </bean>
+ <bean id="extractorCss" class="org.archive.modules.extractor.ExtractorCSS">
+ </bean> 
+ <bean id="extractorJs" class="org.archive.modules.extractor.ExtractorJS">
+ </bean>
+ <bean id="extractorSwf" class="org.archive.modules.extractor.ExtractorSWF">
+ </bean>    
+ <!-- now, processors are assembled into ordered FetchChain bean -->
+ <bean id="fetchProcessors" class="org.archive.modules.FetchChain">
+  <property name="processors">
+   <list>
+    <!-- re-check scope, if so enabled... -->
+    <ref bean="preselector"/>
+    <!-- ...then verify or trigger prerequisite URIs fetched, allow crawling... -->
+    <ref bean="preconditions"/>
+    <!-- ...fetch if DNS URI... -->
+    <ref bean="fetchDns"/>
+    <!-- <ref bean="fetchWhois"/> -->
+    <!-- ...fetch if HTTP URI... -->
+    <ref bean="fetchHttp"/>
+    <!-- ...extract outlinks from HTTP headers... -->
+    <ref bean="extractorHttp"/>
+    <!-- ...extract outlinks from HTML content... -->
+    <ref bean="extractorHtml"/>
+    <!-- ...extract outlinks from CSS content... -->
+    <ref bean="extractorCss"/>
+    <!-- ...extract outlinks from Javascript content... -->
+    <ref bean="extractorJs"/>
+    <!-- ...extract outlinks from Flash content... -->
+    <ref bean="extractorSwf"/>
+   </list>
+  </property>
+ </bean>
+  
+ <!-- DISPOSITION CHAIN -->
+ <!-- first, processors are declared as top-level named beans  -->
+ <bean id="warcWriter" class="org.archive.modules.writer.WARCWriterProcessor">
+  <!-- <property name="compress" value="true" /> -->
+  <!-- <property name="prefix" value="IAH" /> -->
+  <!-- <property name="suffix" value="${HOSTNAME}" /> -->
+  <!-- <property name="maxFileSizeBytes" value="1000000000" /> -->
+  <!-- <property name="poolMaxActive" value="1" /> -->
+  <!-- <property name="MaxWaitForIdleMs" value="500" /> -->
+  <!-- <property name="skipIdenticalDigests" value="false" /> -->
+  <!-- <property name="maxTotalBytesToWrite" value="0" /> -->
+  <!-- <property name="directory" value="${launchId}" /> -->
+  <!-- <property name="storePaths">
+        <list>
+         <value>warcs</value>
+        </list>
+       </property> -->
+  <!-- <property name="writeRequests" value="true" /> -->
+  <!-- <property name="writeMetadata" value="true" /> -->
+  <!-- <property name="writeRevisitForIdenticalDigests" value="true" /> -->
+  <!-- <property name="writeRevisitForNotModified" value="true" /> -->
+ </bean>
+ <bean id="candidates" class="org.archive.crawler.postprocessor.CandidatesProcessor">
+  <!-- <property name="seedsRedirectNewSeeds" value="true" /> -->
+ </bean>
+ <bean id="disposition" class="org.archive.crawler.postprocessor.DispositionProcessor">
+  <!-- <property name="delayFactor" value="5.0" /> -->
+  <!-- <property name="minDelayMs" value="3000" /> -->
+  <!-- <property name="respectCrawlDelayUpToSeconds" value="300" /> -->
+  <!-- <property name="maxDelayMs" value="30000" /> -->
+  <!-- <property name="maxPerHostBandwidthUsageKbSec" value="0" /> -->
+ </bean>
+ <!-- <bean id="rescheduler" class="org.archive.crawler.postprocessor.ReschedulingProcessor">
+       <property name="rescheduleDelaySeconds" value="-1" />
+      </bean> -->
+ <!-- now, processors are assembled into ordered DispositionChain bean -->
+ <bean id="dispositionProcessors" class="org.archive.modules.DispositionChain">
+  <property name="processors">
+   <list>
+    <!-- write to aggregate archival files... -->
+    <ref bean="warcWriter"/>
+    <!-- ...send each outlink candidate URI to CandidateChain, 
+         and enqueue those ACCEPTed to the frontier... -->
+    <ref bean="candidates"/>
+    <!-- ...then update stats, shared-structures, frontier decisions -->
+    <ref bean="disposition"/>
+    <!-- <ref bean="rescheduler" /> -->
+   </list>
+  </property>
+ </bean>
+ 
+ <!-- CRAWLCONTROLLER: Control interface, unifying context -->
+ <bean id="crawlController" 
+   class="org.archive.crawler.framework.CrawlController">
+  <!-- <property name="maxToeThreads" value="25" /> -->
+  <!-- <property name="pauseAtStart" value="true" /> -->
+  <!-- <property name="runWhileEmpty" value="false" /> -->
+  <!-- <property name="recorderInBufferBytes" value="524288" /> -->
+  <!-- <property name="recorderOutBufferBytes" value="16384" /> -->
+  <!-- <property name="scratchDir" value="scratch" /> -->
+ </bean>
+ 
+ <!-- FRONTIER: Record of all URIs discovered and queued-for-collection -->
+ <bean id="frontier" 
+   class="org.archive.crawler.frontier.BdbFrontier">
+  <!-- <property name="queueTotalBudget" value="-1" /> -->
+  <!-- <property name="balanceReplenishAmount" value="3000" /> -->
+  <!-- <property name="errorPenaltyAmount" value="100" /> -->
+  <!-- <property name="precedenceFloor" value="255" /> -->
+  <!-- <property name="queuePrecedencePolicy">
+        <bean class="org.archive.crawler.frontier.precedence.BaseQueuePrecedencePolicy" />
+       </property> -->
+  <!-- <property name="snoozeLongMs" value="300000" /> -->
+  <!-- <property name="retryDelaySeconds" value="900" /> -->
+  <!-- <property name="maxRetries" value="30" /> -->
+  <!-- <property name="recoveryLogEnabled" value="true" /> -->
+  <!-- <property name="maxOutlinks" value="6000" /> -->
+  <!-- <property name="extractIndependently" value="false" /> -->
+  <!-- <property name="outbound">
+        <bean class="java.util.concurrent.ArrayBlockingQueue">
+         <constructor-arg value="200"/>
+         <constructor-arg value="true"/>
+        </bean>
+       </property> -->
+  <!-- <property name="inbound">
+        <bean class="java.util.concurrent.ArrayBlockingQueue">
+         <constructor-arg value="40000"/>
+         <constructor-arg value="true"/>
+        </bean>
+       </property> -->
+  <!-- <property name="dumpPendingAtClose" value="false" /> -->
+ </bean>
+ 
+ <!-- URI UNIQ FILTER: Used by frontier to remember already-included URIs --> 
+ <bean id="uriUniqFilter" 
+   class="org.archive.crawler.util.BdbUriUniqFilter">
+ </bean>
+ 
+ <!--
+   EXAMPLE SETTINGS OVERLAY SHEETS
+   Sheets allow some settings to vary by context - usually by URI context,
+   so that different sites or sections of sites can be treated differently. 
+   Here are some example Sheets for common purposes. The SheetOverlaysManager
+   (below) automatically collects all Sheet instances declared among the 
+   original beans, but others can be added during the crawl via the scripting 
+   interface.
+  -->
+
+<!-- forceRetire: any URI to which this sheet's settings are applied 
+     will force its containing queue to 'retired' status. -->
+<bean id='forceRetire' class='org.archive.spring.Sheet'>
+ <property name='map'>
+  <map>
+   <entry key='disposition.forceRetire' value='true'/>
+  </map>
+ </property>
+</bean>
+
+<!-- smallBudget: any URI to which this sheet's settings are applied 
+     will give its containing queue small values for balanceReplenishAmount 
+     (causing it to have shorter 'active' periods while other queues are 
+     waiting) and queueTotalBudget (causing the queue to enter 'retired' 
+     status once that expenditure is reached by URI attempts and errors) -->
+<bean id='smallBudget' class='org.archive.spring.Sheet'>
+ <property name='map'>
+  <map>
+   <entry key='frontier.balanceReplenishAmount' value='20'/>
+   <entry key='frontier.queueTotalBudget' value='100'/>
+  </map>
+ </property>
+</bean>
+
+<!-- veryPolite: any URI to which this sheet's settings are applied 
+     will cause its queue to take extra-long politeness snoozes -->
+<bean id='veryPolite' class='org.archive.spring.Sheet'>
+ <property name='map'>
+  <map>
+   <entry key='disposition.delayFactor' value='10'/>
+   <entry key='disposition.minDelayMs' value='10000'/>
+   <entry key='disposition.maxDelayMs' value='1000000'/>
+   <entry key='disposition.respectCrawlDelayUpToSeconds' value='3600'/>
+  </map>
+ </property>
+</bean>
+
+<!-- highPrecedence: any URI to which this sheet's settings are applied 
+     will give its containing queue a slightly-higher than default 
+     queue precedence value. That queue will then be preferred over 
+     other queues for active crawling, never waiting behind lower-
+     precedence queues. -->
+<bean id='highPrecedence' class='org.archive.spring.Sheet'>
+ <property name='map'>
+  <map>
+   <!-- Previously a copy of the smallBudget entries. NOTE(review): assumes the default BaseQueuePrecedencePolicy (basePrecedence 3, lower number = preferred); confirm against the Heritrix version in use. -->
+   <entry key='frontier.queuePrecedencePolicy.basePrecedence' value='2'/>
+  </map>
+ </property>
+</bean>
+
+<!--
+   EXAMPLE SETTINGS OVERLAY SHEET-ASSOCIATION
+   A SheetAssociation says certain URIs should have certain overlay Sheets
+   applied. This example applies two sheets to URIs matching two SURT-prefixes.
+   New associations may also be added mid-crawl using the scripting facility.
+  -->
+
+<!--
+<bean class='org.archive.crawler.spring.SurtPrefixesSheetAssociation'>
+ <property name='surtPrefixes'>
+  <list>
+   <value>http://(org,example,</value>
+   <value>http://(com,example,www,)/</value>
+  </list>
+ </property>
+ <property name='targetSheetNames'>
+  <list>
+   <value>veryPolite</value>
+   <value>smallBudget</value>
+  </list>
+ </property>
+</bean>
+-->
+
+ <!-- 
+   OPTIONAL BUT RECOMMENDED BEANS
+  -->
+  
+ <!-- ACTIONDIRECTORY: disk directory for mid-crawl operations
+      Running job will watch directory for new files with URIs, 
+      scripts, and other data to be processed during a crawl. -->
+ <bean id="actionDirectory" class="org.archive.crawler.framework.ActionDirectory">
+  <!-- <property name="actionDir" value="action" /> -->
+  <!-- <property name="doneDir" value="${launchId}/actions-done" /> -->
+  <!-- <property name="initialDelaySeconds" value="10" /> -->
+  <!-- <property name="delaySeconds" value="30" /> -->
+ </bean> 
+ 
+ <!--  CRAWLLIMITENFORCER: stops crawl when it reaches configured limits -->
+ <bean id="crawlLimiter" class="org.archive.crawler.framework.CrawlLimitEnforcer">
+  <!-- <property name="maxBytesDownload" value="0" /> -->
+  <!-- <property name="maxDocumentsDownload" value="0" /> -->
+  <!-- <property name="maxTimeSeconds" value="0" /> -->
+ </bean>
+ 
+ <!-- CHECKPOINTSERVICE: checkpointing assistance -->
+ <bean id="checkpointService" 
+   class="org.archive.crawler.framework.CheckpointService">
+  <!-- <property name="checkpointIntervalMinutes" value="-1"/> -->
+  <!-- <property name="checkpointsDir" value="checkpoints"/> -->
+ </bean>
+ 
+ <!-- 
+   OPTIONAL BEANS
+    Uncomment and expand as needed, or if non-default alternate 
+    implementations are preferred.
+  -->
+  
+ <!-- CANONICALIZATION POLICY -->
+ <!--
+ <bean id="canonicalizationPolicy" 
+   class="org.archive.modules.canonicalize.RulesCanonicalizationPolicy">
+   <property name="rules">
+    <list>
+     <bean class="org.archive.modules.canonicalize.LowercaseRule" />
+     <bean class="org.archive.modules.canonicalize.StripUserinfoRule" />
+     <bean class="org.archive.modules.canonicalize.StripWWWNRule" />
+     <bean class="org.archive.modules.canonicalize.StripSessionIDs" />
+     <bean class="org.archive.modules.canonicalize.StripSessionCFIDs" />
+     <bean class="org.archive.modules.canonicalize.FixupQueryString" />
+    </list>
+  </property>
+ </bean>
+ -->
+ 
+
+ <!-- QUEUE ASSIGNMENT POLICY -->
+ <!--
+ <bean id="queueAssignmentPolicy" 
+   class="org.archive.crawler.frontier.SurtAuthorityQueueAssignmentPolicy">
+  <property name="forceQueueAssignment" value="" />
+  <property name="deferToPrevious" value="true" />
+  <property name="parallelQueues" value="1" />
+ </bean>
+ -->
+ 
+ <!-- URI PRECEDENCE POLICY -->
+ <!--
+ <bean id="uriPrecedencePolicy" 
+   class="org.archive.crawler.frontier.precedence.CostUriPrecedencePolicy">
+ </bean>
+ -->
+ 
+ <!-- COST ASSIGNMENT POLICY -->
+ <!--
+ <bean id="costAssignmentPolicy" 
+   class="org.archive.crawler.frontier.UnitCostAssignmentPolicy">
+ </bean>
+ -->
+ 
+ <!-- CREDENTIAL STORE: HTTP authentication or FORM POST credentials -->
+ <!-- 
+ <bean id="credentialStore" 
+   class="org.archive.modules.credential.CredentialStore">
+ </bean>
+ -->
+ 
+ <!-- DISK SPACE MONITOR: 
+      Pauses the crawl if disk space at monitored paths falls below minimum threshold -->
+ <!-- 
+ <bean id="diskSpaceMonitor" class="org.archive.crawler.monitor.DiskSpaceMonitor">
+   <property name="pauseThresholdMiB" value="500" />
+   <property name="monitorConfigPaths" value="true" />
+   <property name="monitorPaths">
+     <list>
+       <value>PATH</value>
+     </list>
+   </property>
+ </bean>
+ -->
+ 
+ <!-- 
+   REQUIRED STANDARD BEANS
+    It will be very rare to replace or reconfigure the following beans.
+  -->
+
+ <!-- STATISTICSTRACKER: standard stats/reporting collector -->
+ <bean id="statisticsTracker" 
+   class="org.archive.crawler.reporting.StatisticsTracker" autowire="byName">
+  <!-- <property name="reports">
+        <list>
+         <bean id="crawlSummaryReport" class="org.archive.crawler.reporting.CrawlSummaryReport" />
+         <bean id="seedsReport" class="org.archive.crawler.reporting.SeedsReport" />
+         <bean id="hostsReport" class="org.archive.crawler.reporting.HostsReport" />
+         <bean id="sourceTagsReport" class="org.archive.crawler.reporting.SourceTagsReport" />
+         <bean id="mimetypesReport" class="org.archive.crawler.reporting.MimetypesReport" />
+         <bean id="responseCodeReport" class="org.archive.crawler.reporting.ResponseCodeReport" />
+         <bean id="processorsReport" class="org.archive.crawler.reporting.ProcessorsReport" />
+         <bean id="frontierSummaryReport" class="org.archive.crawler.reporting.FrontierSummaryReport" />
+         <bean id="frontierNonemptyReport" class="org.archive.crawler.reporting.FrontierNonemptyReport" />
+         <bean id="toeThreadsReport" class="org.archive.crawler.reporting.ToeThreadsReport" />
+        </list>
+       </property> -->
+  <!-- <property name="reportsDir" value="${launchId}/reports" /> -->
+  <!-- <property name="liveHostReportSize" value="20" /> -->
+  <!-- <property name="intervalSeconds" value="20" /> -->
+  <!-- <property name="keepSnapshotsCount" value="5" /> -->
+  <!-- <property name="liveHostReportSize" value="20" /> -->
+ </bean>
+ 
+ <!-- CRAWLERLOGGERMODULE: shared logging facility -->
+ <bean id="loggerModule" 
+   class="org.archive.crawler.reporting.CrawlerLoggerModule">
+  <!-- <property name="path" value="${launchId}/logs" /> -->
+  <!-- <property name="crawlLogPath" value="crawl.log" /> -->
+  <!-- <property name="alertsLogPath" value="alerts.log" /> -->
+  <!-- <property name="progressLogPath" value="progress-statistics.log" /> -->
+  <!-- <property name="uriErrorsLogPath" value="uri-errors.log" /> -->
+  <!-- <property name="runtimeErrorsLogPath" value="runtime-errors.log" /> -->
+  <!-- <property name="nonfatalErrorsLogPath" value="nonfatal-errors.log" /> -->
+  <!-- <property name="logExtraInfo" value="false" /> -->
+ </bean>
+ 
+ <!-- SHEETOVERLAYMANAGER: manager of sheets of contextual overlays
+      Autowired to include any SheetForSurtPrefix or 
+      SheetForDecideRuled beans -->
+ <bean id="sheetOverlaysManager" autowire="byType"
+   class="org.archive.crawler.spring.SheetOverlaysManager">
+ </bean>
+
+ <!-- BDBMODULE: shared BDB-JE disk persistence manager -->
+ <bean id="bdb" 
+  class="org.archive.bdb.BdbModule">
+  <!-- <property name="dir" value="state" /> -->
+  <!-- <property name="cachePercent" value="60" /> -->
+  <!-- <property name="useSharedCache" value="true" /> -->
+  <!-- <property name="expectedConcurrency" value="25" /> -->
+ </bean>
+ 
+ <!-- BDBCOOKIESTORAGE: disk-based cookie storage for FetchHTTP -->
+ <bean id="cookieStorage" 
+   class="org.archive.modules.fetcher.BdbCookieStorage">
+  <!-- <property name="cookiesLoadFile"><null/></property> -->
+  <!-- <property name="cookiesSaveFile"><null/></property> -->
+  <!-- <property name="bdb">
+        <ref bean="bdb"/>
+       </property> -->
+ </bean>
+ 
+ <!-- SERVERCACHE: shared cache of server/host info -->
+ <bean id="serverCache" 
+   class="org.archive.modules.net.BdbServerCache">
+  <!-- <property name="bdb">
+        <ref bean="bdb"/>
+       </property> -->
+ </bean>
+
+ <!-- CONFIG PATH CONFIGURER: required helper making crawl paths relative
+      to crawler-beans.cxml file, and tracking crawl files for web UI -->
+ <bean id="configPathConfigurer" 
+   class="org.archive.spring.ConfigPathConfigurer">
+ </bean>
+ 
+</beans>
--- a/archiver/pipelines/actions/heritrix-archive-set.xpl
+++ b/archiver/pipelines/actions/heritrix-archive-set.xpl
@ -112,10 +112,68 @@ for $a in /queue/action where $a/@uuid = $(uuid) return
                </createpath>
            </instance>
        </p:input>
-        <p:output name="response" id="heritrix1" debug="heritrix1"/>
+        <p:output name="response" id="heritrix-engine" debug="heritrix-engine"/>
    </p:processor>
+
+    <!-- Create a job configuration -->
+    <p:processor name="oxf:xslt">
+        <p:input name="data" href="#data"/>
+        <p:input name="config" href="cxml.xslt"/>
+        <p:output name="data" id="cxml"/>
+    </p:processor>
+
+    <!-- Upload the job configuration: PUT the generated cxml document to the job's primaryConfigUrl -->
+    <p:processor name="oxf:xforms-submission">
+        <p:input name="submission" transform="oxf:xslt" href="aggregate('root', #data, #heritrix-engine)">
+            <xforms:submission xsl:version="2.0" method="put" action="{/root/engine/jobs/value[shortName=/root/action/@uuid]/primaryConfigUrl}"
+                xxforms:username="{doc('oxf:/config.xml')/config/heritrix/username}" xxforms:password="{doc('oxf:/config.xml')/config/heritrix/password}" xxforms:preemptive_authentication="no"/>
+        </p:input>
+        <p:input name="request" href="#cxml"/>
+        <p:output name="response" id="cxml-response" debug="cxml-response"/>
+    </p:processor>
+
+
+    <!-- Build the job: POST action=build to the job URL. #cxml-response is aggregated but never referenced; presumably only to order this step after the upload (confirm). -->
+    <p:processor name="oxf:xforms-submission">
+        <p:input name="submission" transform="oxf:xslt" href="aggregate('root', #data, #heritrix-engine, #cxml-response)">
+            <xforms:submission xsl:version="2.0" method="urlencoded-post" action="{/root/engine/jobs/value[shortName=/root/action/@uuid]/url}"
+                xxforms:username="{doc('oxf:/config.xml')/config/heritrix/username}" xxforms:password="{doc('oxf:/config.xml')/config/heritrix/password}" xxforms:preemptive_authentication="no">
+                <xforms:header combine="replace">
+                    <xforms:name>Accept</xforms:name>
+                    <xforms:value>application/xml</xforms:value>
+                </xforms:header>
+            </xforms:submission>
+        </p:input>
+        <p:input name="request" transform="oxf:xslt" href="#data">
+            <instance xsl:version="2.0">
+                <action>build</action>
+            </instance>
+        </p:input>
+        <p:output name="response" id="heritrix-built" debug="heritrix-built"/>
+    </p:processor>
+    
+    <!-- Launch the job: POST action=launch to the job URL. #heritrix-built is aggregated but never referenced; presumably only to order this step after the build (confirm). -->
+    <p:processor name="oxf:xforms-submission">
+        <p:input name="submission" transform="oxf:xslt" href="aggregate('root', #data, #heritrix-engine, #heritrix-built)">
+            <xforms:submission xsl:version="2.0" method="urlencoded-post" action="{/root/engine/jobs/value[shortName=/root/action/@uuid]/url}"
+                xxforms:username="{doc('oxf:/config.xml')/config/heritrix/username}" xxforms:password="{doc('oxf:/config.xml')/config/heritrix/password}" xxforms:preemptive_authentication="no">
+                <xforms:header combine="replace">
+                    <xforms:name>Accept</xforms:name>
+                    <xforms:value>application/xml</xforms:value>
+                </xforms:header>
+            </xforms:submission>
+        </p:input>
+        <p:input name="request" transform="oxf:xslt" href="#data">
+            <instance xsl:version="2.0">
+                <action>launch</action>
+            </instance>
+        </p:input>
+        <p:output name="response" id="heritrix-launched" debug="heritrix-launched"/>
+    </p:processor>
+    
+    
    <p:processor name="oxf:null-serializer">
-        <p:input name="data" href="#heritrix1"/>
+        <p:input name="data" href="#heritrix-launched"/>
    </p:processor>

 </p:config>