Download and convert the crawl log

This commit is contained in:
Eric van der Vlist 2012-04-26 17:08:28 +02:00
parent 6f64c7f8a9
commit 675ed04aba
3 changed files with 99 additions and 12 deletions

View File

@ -55,7 +55,7 @@
<!-- Next action: package -->
<p:processor name="oxf:pipeline">
<p:input name="config" href="/data-access.xpl"/>
<p:input name="data" transform="oxf:xslt" href="aggregate('root', #data, #warc-dir-list)">
<p:input name="data" transform="oxf:xslt" href="aggregate('root', #data, #warc-dir-list, #heritrix-job)">
<config xsl:version="2.0">
<relpath>queue.xml</relpath>
<operation>write</operation>
@ -78,6 +78,9 @@
<parameter name="warc-url" type="string">
<xsl:value-of select="/root/html/body/a[ends-with(., '.warc')][1]/@href"/>
</parameter>
<parameter name="log-url" type="string">
<xsl:value-of select="/root/job/configFiles/value[key='loggerModule.crawlLogPath'][1]/url"/>
</parameter>
</config>
</p:input>
<p:input name="param">
@ -86,7 +89,7 @@ declare namespace util = "http://exist-db.org/xquery/util";
for $q in /queue return
update
insert <action priority=$(priority) uuid="{util:uuid()}" type="package-heritrix-warc" url=$(url) directory=$(directory) heritrix-job-url=$(heritrix-job-url) warc-url=$(warc-url)/>
insert <action priority=$(priority) uuid="{util:uuid()}" type="package-heritrix-warc" url=$(url) directory=$(directory) heritrix-job-url=$(heritrix-job-url) warc-url=$(warc-url) log-url=$(log-url)/>
into $q,
for $a in /queue/action where $a/@uuid = $(uuid) return

View File

@ -34,16 +34,49 @@
<p:output name="data" id="warc"/>
</p:processor>
<p:processor name="owk:from-warc-converter">
<p:input name="data" href="#warc"/>
<p:output name="data" id="warc-xml" debug="warc-xml"/>
</p:processor>
<p:processor name="owk:from-warc-converter">
<p:input name="data" href="#warc"/>
<p:output name="data" id="warc-xml" debug="warc-xml"/>
</p:processor>
<p:processor name="oxf:null-serializer">
<p:input name="data" href="#warc-xml"/>
</p:processor>
<!-- Store it in a temp file -->
<!-- Download the crawl log: fetch, as text, the URL carried by the log-url attribute of the action element supplied on #data -->
<p:processor name="oxf:url-generator">
<!-- The generator configuration is built by an inline XSLT 2.0 transformation applied to #data -->
<p:input name="config" transform="oxf:xslt" href="#data">
<config xsl:version="2.0">
<!-- Address of the log, copied from /action/@log-url -->
<url>
<xsl:value-of select="/action/@log-url"/>
</url>
<!-- The log is read in text mode rather than parsed as XML -->
<mode>text</mode>
<!-- Credentials are read from the heritrix section of oxf:/config.xml -->
<authentication>
<username>
<xsl:value-of select="doc('oxf:/config.xml')/config/heritrix/username"/>
</username>
<password>
<xsl:value-of select="doc('oxf:/config.xml')/config/heritrix/password"/>
</password>
<!-- NOTE(review): preemptive=false presumably means the credentials are only sent after the server challenges; confirm against the url-generator documentation -->
<preemptive>false</preemptive>
</authentication>
</config>
</p:input>
<!-- The downloaded text is exposed as #log; the debug attribute labels its trace output "log" -->
<p:output name="data" id="log" debug="log"/>
</p:processor>
<!-- Convert the downloaded text log (#log) to XML with the parse-log.xslt stylesheet; the result is exposed as #log-xml -->
<p:processor name="oxf:xslt">
<p:input name="config" href="parse-log.xslt"/>
<p:input name="data" href="#log"/>
<p:output name="data" id="log-xml" debug="log-xml"/>
</p:processor>
<!-- Send #log-xml to a null serializer. NOTE(review): nothing else visible here reads #log-xml, so this is presumably what makes the download and conversion actually run; confirm -->
<p:processor name="oxf:null-serializer">
<p:input name="data" href="#log-xml"/>
</p:processor>
<!-- Store the WARC in a temp file -->
<p:processor name="oxf:file-serializer">
<p:input name="config">
<config>
@ -83,7 +116,7 @@
</p:input>
<p:output name="data" id="zip"/>
</p:processor>
<p:processor name="oxf:file-serializer">
<p:input name="config">
<config>
@ -91,10 +124,10 @@
</config>
</p:input>
<p:input name="data" href="#zip"/>
</p:processor>
<!-- <p:choose href="#heritrix-job">
<!-- <p:choose href="#heritrix-job">
<p:when test="/job/crawlControllerState='FINISHED'">
<!-\- The job is finished, we can get its archive... -\->
<!-\- Scan the directory to find the name of the WARC file -\->

View File

@ -0,0 +1,51 @@
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:xd="http://www.oxygenxml.com/ns/doc/xsl" exclude-result-prefixes="xs xd"
    version="2.0">
    <xd:doc scope="stylesheet">
        <xd:desc>
            <xd:p><xd:b>Created on:</xd:b> Apr 26, 2012</xd:p>
            <xd:p><xd:b>Author:</xd:b> vdv</xd:p>
            <xd:p>Converts a text crawl log, received as the string value of a <xd:b>document</xd:b> root element, into a <xd:b>log</xd:b> element holding one <xd:b>entry</xd:b> per non-blank line.</xd:p>
            <xd:p>The first three fields are read by column position (date-time: 1-24, code: 26-30, size: 32-41); the remaining fields start at column 43 and are separated by single spaces.</xd:p>
            <xd:p>See https://webarchive.jira.com/wiki/display/Heritrix/Logs</xd:p>
        </xd:desc>
    </xd:doc>
    <xsl:template match="/document">
        <log>
            <!-- Split on LF or CRLF and ignore blank or whitespace-only lines so they do not yield empty entries -->
            <xsl:for-each select="tokenize(., '\r?\n')[normalize-space(.) != '']">
                <entry>
                    <!-- Columns 1-24: timestamp of the log line -->
                    <date-time>
                        <xsl:value-of select="substring(., 1, 24)"/>
                    </date-time>
                    <!-- Columns 26-30: right-aligned status code, padding removed -->
                    <code>
                        <xsl:value-of select="normalize-space(substring(., 26, 5))"/>
                    </code>
                    <!-- Columns 32-41: right-aligned size. The column starts at 32 (31 is the separator after the code);
                         starting at 33 would drop the leading digit of a 10-digit size -->
                    <size>
                        <xsl:value-of select="normalize-space(substring(., 32, 10))"/>
                    </size>
                    <!-- Everything from column 43 onwards is a list of space-separated fields -->
                    <xsl:variable name="tail" select="substring(., 43)"/>
                    <xsl:variable name="tokens" select="tokenize($tail, ' ')"/>
                    <uri>
                        <xsl:value-of select="$tokens[1]"/>
                    </uri>
                    <discovery-path>
                        <xsl:value-of select="$tokens[2]"/>
                    </discovery-path>
                    <referer>
                        <xsl:value-of select="$tokens[3]"/>
                    </referer>
                    <content-type>
                        <xsl:value-of select="$tokens[4]"/>
                    </content-type>
                    <!-- Token 5 is deliberately not exported (presumably the worker thread id; see the log documentation) -->
                    <timestamp>
                        <xsl:value-of select="$tokens[6]"/>
                    </timestamp>
                    <sha1-digest>
                        <xsl:value-of select="$tokens[7]"/>
                    </sha1-digest>
                </entry>
            </xsl:for-each>
        </log>
    </xsl:template>
</xsl:stylesheet>