Download and convert the crawl log
This commit is contained in:
parent
6f64c7f8a9
commit
675ed04aba
|
@ -55,7 +55,7 @@
|
||||||
<!-- Next action: package -->
|
<!-- Next action: package -->
|
||||||
<p:processor name="oxf:pipeline">
|
<p:processor name="oxf:pipeline">
|
||||||
<p:input name="config" href="/data-access.xpl"/>
|
<p:input name="config" href="/data-access.xpl"/>
|
||||||
<p:input name="data" transform="oxf:xslt" href="aggregate('root', #data, #warc-dir-list)">
|
<p:input name="data" transform="oxf:xslt" href="aggregate('root', #data, #warc-dir-list, #heritrix-job)">
|
||||||
<config xsl:version="2.0">
|
<config xsl:version="2.0">
|
||||||
<relpath>queue.xml</relpath>
|
<relpath>queue.xml</relpath>
|
||||||
<operation>write</operation>
|
<operation>write</operation>
|
||||||
|
@ -78,6 +78,9 @@
|
||||||
<parameter name="warc-url" type="string">
|
<parameter name="warc-url" type="string">
|
||||||
<xsl:value-of select="/root/html/body/a[ends-with(., '.warc')][1]/@href"/>
|
<xsl:value-of select="/root/html/body/a[ends-with(., '.warc')][1]/@href"/>
|
||||||
</parameter>
|
</parameter>
|
||||||
|
<parameter name="log-url" type="string">
|
||||||
|
<xsl:value-of select="/root/job/configFiles/value[key='loggerModule.crawlLogPath'][1]/url"/>
|
||||||
|
</parameter>
|
||||||
</config>
|
</config>
|
||||||
</p:input>
|
</p:input>
|
||||||
<p:input name="param">
|
<p:input name="param">
|
||||||
|
@ -86,7 +89,7 @@ declare namespace util = "http://exist-db.org/xquery/util";
|
||||||
|
|
||||||
for $q in /queue return
|
for $q in /queue return
|
||||||
update
|
update
|
||||||
insert <action priority=$(priority) uuid="{util:uuid()}" type="package-heritrix-warc" url=$(url) directory=$(directory) heritrix-job-url=$(heritrix-job-url) warc-url=$(warc-url)/>
|
insert <action priority=$(priority) uuid="{util:uuid()}" type="package-heritrix-warc" url=$(url) directory=$(directory) heritrix-job-url=$(heritrix-job-url) warc-url=$(warc-url) log-url=$(log-url)/>
|
||||||
into $q,
|
into $q,
|
||||||
|
|
||||||
for $a in /queue/action where $a/@uuid = $(uuid) return
|
for $a in /queue/action where $a/@uuid = $(uuid) return
|
||||||
|
|
|
@ -43,7 +43,40 @@
|
||||||
<p:input name="data" href="#warc-xml"/>
|
<p:input name="data" href="#warc-xml"/>
|
||||||
</p:processor>
|
</p:processor>
|
||||||
|
|
||||||
<!-- Store it in a temp file -->
|
<!-- Download the log -->
|
||||||
|
<!--
  Downloads the crawl log as text.
  The URL comes from the log-url attribute of the incoming action (#data), and the
  request is authenticated with the heritrix credentials read from oxf:/config.xml.
  The downloaded text is exposed as the "log" output.
-->
<p:processor name="oxf:url-generator">
|
||||||
|
<!-- The config is computed by an inline XSLT 2.0 transform applied to #data -->
<p:input name="config" transform="oxf:xslt" href="#data">
|
||||||
|
<config xsl:version="2.0">
|
||||||
|
<url>
|
||||||
|
<xsl:value-of select="/action/@log-url"/>
|
||||||
|
</url>
|
||||||
|
<!-- Read the resource as plain text rather than parsing it as XML -->
<mode>text</mode>
|
||||||
|
<authentication>
|
||||||
|
<username>
|
||||||
|
<xsl:value-of select="doc('oxf:/config.xml')/config/heritrix/username"/>
|
||||||
|
</username>
|
||||||
|
<password>
|
||||||
|
<xsl:value-of select="doc('oxf:/config.xml')/config/heritrix/password"/>
|
||||||
|
</password>
|
||||||
|
<!-- Credentials are not sent preemptively -->
<preemptive>false</preemptive>
|
||||||
|
</authentication>
|
||||||
|
</config>
|
||||||
|
</p:input>
|
||||||
|
<!-- NOTE(review): debug="log" presumably traces the whole downloaded log; confirm it is wanted outside development -->
<p:output name="data" id="log" debug="log"/>
|
||||||
|
</p:processor>
|
||||||
|
|
||||||
|
<!--
  Converts the downloaded crawl log (#log) to XML by applying parse-log.xslt;
  the result is exposed as the "log-xml" output.
-->
<p:processor name="oxf:xslt">
|
||||||
|
<p:input name="data" href="#log"/>
|
||||||
|
<p:input name="config" href="parse-log.xslt"></p:input>
|
||||||
|
<!-- NOTE(review): debug="log-xml" presumably traces the converted document; confirm it is wanted outside development -->
<p:output name="data" id="log-xml" debug="log-xml"/>
|
||||||
|
</p:processor>
|
||||||
|
|
||||||
|
<!--
  Reads #log-xml and does nothing else with it.
  NOTE(review): presumably a temporary consumer so that the download and conversion
  steps are actually executed until the log XML is used for real; confirm.
-->
<p:processor name="oxf:null-serializer">
|
||||||
|
<p:input name="data" href="#log-xml"/>
|
||||||
|
</p:processor>
|
||||||
|
|
||||||
|
|
||||||
|
<!-- Store the WARC in a temp file -->
|
||||||
<p:processor name="oxf:file-serializer">
|
<p:processor name="oxf:file-serializer">
|
||||||
<p:input name="config">
|
<p:input name="config">
|
||||||
<config>
|
<config>
|
||||||
|
|
|
@ -0,0 +1,51 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:xd="http://www.oxygenxml.com/ns/doc/xsl" exclude-result-prefixes="xs xd"
|
||||||
|
version="2.0">
|
||||||
|
<xd:doc scope="stylesheet">
|
||||||
|
<xd:desc>
|
||||||
|
<xd:p><xd:b>Created on:</xd:b> Apr 26, 2012</xd:p>
|
||||||
|
<xd:p><xd:b>Author:</xd:b> vdv</xd:p>
|
||||||
|
<xd:p>See https://webarchive.jira.com/wiki/display/Heritrix/Logs</xd:p>
|
||||||
|
</xd:desc>
|
||||||
|
</xd:doc>
|
||||||
|
|
||||||
|
|
||||||
|
<!--
  Converts the plain-text crawl log held in /document into a <log> element
  containing one <entry> per log line.

  Each line is read as three fixed-width columns followed by single-space
  separated fields:
    columns  1-24  date-time
    columns 26-30  code (padded, trimmed with normalize-space)
    columns 32-41  size (padded, trimmed with normalize-space)
    column  43+    uri, discovery-path, referer, content-type, (token 5 skipped),
                   timestamp, sha1-digest

  The size column starts at 32, not 33: the code field ends at column 30 and the
  tail starts at column 43, so with one separator on each side the ten-character
  size field is 32-41. Reading from 33 dropped the first digit of ten-digit sizes.

  Lines are split on LF or CRLF, and blank or whitespace-only lines are ignored.
-->
<xsl:template match="/document">
    <log>
        <xsl:for-each select="tokenize(., '\r?\n')[normalize-space(.) != '']">
            <xsl:variable name="line" select="."/>
            <!-- Everything after the fixed-width columns, split on single spaces -->
            <xsl:variable name="tail" select="substring($line, 43)"/>
            <xsl:variable name="tokens" select="tokenize($tail, ' ')"/>
            <entry>
                <date-time>
                    <xsl:value-of select="substring($line, 1, 24)"/>
                </date-time>
                <code>
                    <xsl:value-of select="normalize-space(substring($line, 26, 5))"/>
                </code>
                <size>
                    <xsl:value-of select="normalize-space(substring($line, 32, 10))"/>
                </size>
                <uri>
                    <xsl:value-of select="$tokens[1]"/>
                </uri>
                <discovery-path>
                    <xsl:value-of select="$tokens[2]"/>
                </discovery-path>
                <referer>
                    <xsl:value-of select="$tokens[3]"/>
                </referer>
                <content-type>
                    <xsl:value-of select="$tokens[4]"/>
                </content-type>
                <!-- Token 5 is not emitted (presumably the worker thread id of the Heritrix crawl.log layout; confirm) -->
                <timestamp>
                    <xsl:value-of select="$tokens[6]"/>
                </timestamp>
                <sha1-digest>
                    <xsl:value-of select="$tokens[7]"/>
                </sha1-digest>
            </entry>
        </xsl:for-each>
    </log>
</xsl:template>
|
||||||
|
|
||||||
|
</xsl:stylesheet>
|
Loading…
Reference in New Issue