Download and convert the crawl log
This commit is contained in:
parent
6f64c7f8a9
commit
675ed04aba
|
@ -55,7 +55,7 @@
|
|||
<!-- Next action: package -->
|
||||
<p:processor name="oxf:pipeline">
|
||||
<p:input name="config" href="/data-access.xpl"/>
|
||||
<p:input name="data" transform="oxf:xslt" href="aggregate('root', #data, #warc-dir-list)">
|
||||
<p:input name="data" transform="oxf:xslt" href="aggregate('root', #data, #warc-dir-list, #heritrix-job)">
|
||||
<config xsl:version="2.0">
|
||||
<relpath>queue.xml</relpath>
|
||||
<operation>write</operation>
|
||||
|
@ -78,6 +78,9 @@
|
|||
<parameter name="warc-url" type="string">
|
||||
<xsl:value-of select="/root/html/body/a[ends-with(., '.warc')][1]/@href"/>
|
||||
</parameter>
|
||||
<parameter name="log-url" type="string">
|
||||
<xsl:value-of select="/root/job/configFiles/value[key='loggerModule.crawlLogPath'][1]/url"/>
|
||||
</parameter>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:input name="param">
|
||||
|
@ -86,7 +89,7 @@ declare namespace util = "http://exist-db.org/xquery/util";
|
|||
|
||||
for $q in /queue return
|
||||
update
|
||||
insert <action priority=$(priority) uuid="{util:uuid()}" type="package-heritrix-warc" url=$(url) directory=$(directory) heritrix-job-url=$(heritrix-job-url) warc-url=$(warc-url)/>
|
||||
insert <action priority=$(priority) uuid="{util:uuid()}" type="package-heritrix-warc" url=$(url) directory=$(directory) heritrix-job-url=$(heritrix-job-url) warc-url=$(warc-url) log-url=$(log-url)/>
|
||||
into $q,
|
||||
|
||||
for $a in /queue/action where $a/@uuid = $(uuid) return
|
||||
|
|
|
@ -34,16 +34,49 @@
|
|||
<p:output name="data" id="warc"/>
|
||||
</p:processor>
|
||||
|
||||
<p:processor name="owk:from-warc-converter">
|
||||
<p:input name="data" href="#warc"/>
|
||||
<p:output name="data" id="warc-xml" debug="warc-xml"/>
|
||||
</p:processor>
|
||||
<!-- Feeds the #warc output to owk:from-warc-converter and publishes the result as #warc-xml (debug label "warc-xml"). -->
<p:processor name="owk:from-warc-converter">
|
||||
<p:input name="data" href="#warc"/>
|
||||
<p:output name="data" id="warc-xml" debug="warc-xml"/>
|
||||
</p:processor>
|
||||
|
||||
<!-- Hands #warc-xml to oxf:null-serializer, which declares no output here. NOTE(review): presumably present only to force the converter above to run; confirm. -->
<p:processor name="oxf:null-serializer">
|
||||
<p:input name="data" href="#warc-xml"/>
|
||||
</p:processor>
|
||||
|
||||
<!-- Store it in a temp file -->
|
||||
<!--
    Download the crawl log.
    The generator configuration is built by an inline XSLT transformation of
    #data: the URL comes from the log-url attribute of the action element and
    the credentials come from the heritrix section of oxf:/config.xml.
    The log is read in text mode and exposed as #log (debug label "log").
-->
<p:processor name="oxf:url-generator">
    <p:input name="config" transform="oxf:xslt" href="#data">
        <config xsl:version="2.0">
            <!-- Looked up once; both credentials are read from this node -->
            <xsl:variable name="heritrix" select="doc('oxf:/config.xml')/config/heritrix"/>
            <url><xsl:value-of select="/action/@log-url"/></url>
            <mode>text</mode>
            <authentication>
                <username><xsl:value-of select="$heritrix/username"/></username>
                <password><xsl:value-of select="$heritrix/password"/></password>
                <preemptive>false</preemptive>
            </authentication>
        </config>
    </p:input>
    <p:output name="data" id="log" debug="log"/>
</p:processor>
|
||||
|
||||
<!-- Transform the downloaded log (#log) with parse-log.xslt; the result is exposed as #log-xml (debug label "log-xml") -->
<p:processor name="oxf:xslt">
    <p:input name="config" href="parse-log.xslt"/>
    <p:input name="data" href="#log"/>
    <p:output name="data" id="log-xml" debug="log-xml"/>
</p:processor>

<!-- Hand #log-xml to a null serializer. NOTE(review): presumably only there to force the transformation above to run; confirm. -->
<p:processor name="oxf:null-serializer">
    <p:input name="data" href="#log-xml"/>
</p:processor>
|
||||
|
||||
|
||||
<!-- Store the WARC in a temp file -->
|
||||
<p:processor name="oxf:file-serializer">
|
||||
<p:input name="config">
|
||||
<config>
|
||||
|
@ -94,7 +127,7 @@
|
|||
|
||||
</p:processor>
|
||||
|
||||
<!-- <p:choose href="#heritrix-job">
|
||||
<!-- <p:choose href="#heritrix-job">
|
||||
<p:when test="/job/crawlControllerState='FINISHED'">
|
||||
<!-\- The job is finished, we can get its archive... -\->
|
||||
<!-\- Scan the directory to find the name of the WARC file -\->
|
||||
|
|
|
@ -0,0 +1,51 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:xd="http://www.oxygenxml.com/ns/doc/xsl" exclude-result-prefixes="xs xd"
|
||||
version="2.0">
|
||||
<xd:doc scope="stylesheet">
|
||||
<xd:desc>
|
||||
<xd:p><xd:b>Created on:</xd:b> Apr 26, 2012</xd:p>
|
||||
<xd:p><xd:b>Author:</xd:b> vdv</xd:p>
|
||||
<xd:p>See https://webarchive.jira.com/wiki/display/Heritrix/Logs</xd:p>
|
||||
</xd:desc>
|
||||
</xd:doc>
|
||||
|
||||
|
||||
<!--
    Converts a crawl log, received as the string value of the /document
    element, into a log element containing one entry element per non-empty
    line.

    Each line is read as three fixed-width columns followed by
    space-separated fields:
      columns  1-24  date-time
      columns 26-30  code (padded, whitespace trimmed)
      columns 32-41  size (padded, whitespace trimmed)
      column  43 on  uri, discovery-path, referer, content-type,
                     an unused fifth field, timestamp, sha1-digest
    NOTE(review): the fifth field is presumably the worker thread id of the
    Heritrix log layout; it is deliberately not emitted. Confirm against the
    Heritrix log documentation.
-->
<xsl:template match="/document">
    <log>
        <!-- Accept both LF and CRLF line ends so no stray carriage return
             survives in a field or produces an entry for a blank line -->
        <xsl:for-each select="tokenize(., '\r?\n')[. != '']">
            <!-- Everything after the fixed-width columns, split on single spaces -->
            <xsl:variable name="tail" select="substring(., 43)"/>
            <xsl:variable name="tokens" select="tokenize($tail, ' ')"/>
            <entry>
                <date-time>
                    <xsl:value-of select="substring(., 1, 24)"/>
                </date-time>
                <code>
                    <xsl:value-of select="normalize-space(substring(., 26, 5))"/>
                </code>
                <size>
                    <!-- The 10-character size column starts at 32, right after
                         the separator that follows the code column (26-30);
                         starting at 33 would drop the first digit of a
                         10-digit size -->
                    <xsl:value-of select="normalize-space(substring(., 32, 10))"/>
                </size>
                <uri>
                    <xsl:value-of select="$tokens[1]"/>
                </uri>
                <discovery-path>
                    <xsl:value-of select="$tokens[2]"/>
                </discovery-path>
                <referer>
                    <xsl:value-of select="$tokens[3]"/>
                </referer>
                <content-type>
                    <xsl:value-of select="$tokens[4]"/>
                </content-type>
                <!-- $tokens[5] is intentionally skipped -->
                <timestamp>
                    <xsl:value-of select="$tokens[6]"/>
                </timestamp>
                <sha1-digest>
                    <xsl:value-of select="$tokens[7]"/>
                </sha1-digest>
            </entry>
        </xsl:for-each>
    </log>
</xsl:template>
|
||||
|
||||
</xsl:stylesheet>
|
Loading…
Reference in New Issue