Adding a basic squeleton to generate what should ultimately be a WARC archive
This commit is contained in:
parent
3d18e9d8a4
commit
0e7bdd1de4
|
@ -31,7 +31,10 @@
|
||||||
<parameter name="url" type="string">
|
<parameter name="url" type="string">
|
||||||
<xsl:value-of select="/action/@url"/>
|
<xsl:value-of select="/action/@url"/>
|
||||||
</parameter>
|
</parameter>
|
||||||
<parameter name="priority" type="string">
|
<parameter name="priority-resource" type="string">
|
||||||
|
<xsl:value-of select="/action/@priority + 2"/>
|
||||||
|
</parameter>
|
||||||
|
<parameter name="priority-package" type="string">
|
||||||
<xsl:value-of select="/action/@priority + 1"/>
|
<xsl:value-of select="/action/@priority + 1"/>
|
||||||
</parameter>
|
</parameter>
|
||||||
</config>
|
</config>
|
||||||
|
@ -48,7 +51,8 @@ declare namespace util = "http://exist-db.org/xquery/util";
|
||||||
|
|
||||||
for $q in /queue return
|
for $q in /queue return
|
||||||
update
|
update
|
||||||
insert <action priority=$(priority) uuid="{util:uuid()}" type="archive-resource" url=$(url) directory=$(directory) filename=$(filename)/>
|
insert (<action priority=$(priority-resource) uuid="{util:uuid()}" type="archive-resource" url=$(url) directory=$(directory) filename=$(filename)/>,
|
||||||
|
<action priority=$(priority-package) uuid="{util:uuid()}" type="package-archive" directory=$(directory)/>)
|
||||||
into $q,
|
into $q,
|
||||||
|
|
||||||
for $a in /queue/action where $a/@uuid = $(uuid) return
|
for $a in /queue/action where $a/@uuid = $(uuid) return
|
||||||
|
|
|
@ -0,0 +1,292 @@
|
||||||
|
<p:config xmlns:p="http://www.orbeon.com/oxf/pipeline" xmlns:oxf="http://www.orbeon.com/oxf/processors" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:saxon="http://saxon.sf.net/">
|
||||||
|
|
||||||
|
<p:param name="data" type="input"/>
|
||||||
|
|
||||||
|
<!-- Read the archive index -->
|
||||||
|
<p:processor name="oxf:pipeline">
|
||||||
|
<p:input name="config" href="/data-access.xpl"/>
|
||||||
|
<p:input name="data" transform="oxf:xslt" href="#data">
|
||||||
|
<config xsl:version="2.0">
|
||||||
|
<relpath>
|
||||||
|
<xsl:value-of select="/action/@directory"/>
|
||||||
|
<xsl:text>index.xml</xsl:text>
|
||||||
|
</relpath>
|
||||||
|
<operation>read</operation>
|
||||||
|
<type>document</type>
|
||||||
|
</config>
|
||||||
|
</p:input>
|
||||||
|
<p:input name="param">
|
||||||
|
<empty/>
|
||||||
|
</p:input>
|
||||||
|
<p:output name="data" id="index" debug="index"/>
|
||||||
|
</p:processor>
|
||||||
|
|
||||||
|
<!-- Create a WARC file -->
|
||||||
|
<p:processor name="oxf:file-serializer">
|
||||||
|
<p:input name="config">
|
||||||
|
<config>
|
||||||
|
<scope>request</scope>
|
||||||
|
</config>
|
||||||
|
</p:input>
|
||||||
|
<p:input name="data" transform="oxf:xslt" href="aggregate('root', #index, #data)">
|
||||||
|
<xsl:stylesheet version="2.0">
|
||||||
|
<xsl:import href="warc-lib.xsl"/>
|
||||||
|
<xsl:template match="/">
|
||||||
|
<xsl:variable name="content" as="node()*">
|
||||||
|
<version/>
|
||||||
|
<field>
|
||||||
|
<name>WARC-Type</name>
|
||||||
|
<value>warcinfo</value>
|
||||||
|
</field>
|
||||||
|
<field>
|
||||||
|
<name>WARC-Date</name>
|
||||||
|
<value>
|
||||||
|
<xsl:value-of select="current-dateTime()"/>
|
||||||
|
</value>
|
||||||
|
</field>
|
||||||
|
<field>
|
||||||
|
<name>WARC-Record-ID</name>
|
||||||
|
<value>
|
||||||
|
<xsl:text><urn:uuid:</xsl:text>
|
||||||
|
<xsl:value-of select="translate(substring(/root/action/@directory, 1, string-length(/root/action/@directory) - 1), '/', '-')"/>
|
||||||
|
<xsl:text>></xsl:text>
|
||||||
|
</value>
|
||||||
|
</field>
|
||||||
|
<field>
|
||||||
|
<name>Content-Type</name>
|
||||||
|
<value>application/warc-fields</value>
|
||||||
|
</field>
|
||||||
|
<!-- TODO: Content-Length: 381 -->
|
||||||
|
<CRLF/>
|
||||||
|
<field>
|
||||||
|
<name>software</name>
|
||||||
|
<value>Owark 0.3 http://owark.org</value>
|
||||||
|
</field>
|
||||||
|
<field>
|
||||||
|
<name>format</name>
|
||||||
|
<value>WARC file version 0.18</value>
|
||||||
|
</field>
|
||||||
|
<CRLF/>
|
||||||
|
<CRLF/>
|
||||||
|
<!--
|
||||||
|
|
||||||
|
|
||||||
|
software: Heritrix 1.12.0 http://crawler.archive.org
|
||||||
|
hostname: crawling017.archive.org
|
||||||
|
ip: 207.241.227.234
|
||||||
|
isPartOf: testcrawl-20050708
|
||||||
|
description: testcrawl with WARC output
|
||||||
|
operator: IA_Admin
|
||||||
|
http-header-user-agent:
|
||||||
|
Mozilla/5.0 (compatible; heritrix/1.4.0 +http://crawler.archive.org)
|
||||||
|
format: WARC file version 0.18
|
||||||
|
conformsTo:
|
||||||
|
http://www.archive.org/documents/WarcFileFormat-0.18.html-->
|
||||||
|
</xsl:variable>
|
||||||
|
<document xsl:version="2.0" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="xs:string" content-type="text/plain">
|
||||||
|
<xsl:apply-templates select="$content" mode="warc"/>
|
||||||
|
</document>
|
||||||
|
</xsl:template>
|
||||||
|
</xsl:stylesheet>
|
||||||
|
</p:input>
|
||||||
|
<p:output name="data" id="warc" debug="warc"/>
|
||||||
|
</p:processor>
|
||||||
|
|
||||||
|
<!-- Loop over the index to retrieve the documents -->
|
||||||
|
|
||||||
|
<p:for-each href="#index" select="/archive-set/archive" id="files" root="files">
|
||||||
|
|
||||||
|
<!-- Read the document -->
|
||||||
|
<p:processor name="oxf:pipeline">
|
||||||
|
<p:input name="config" href="/data-access.xpl"/>
|
||||||
|
<p:input name="data" transform="oxf:xslt" href="aggregate('root', #data, current())">
|
||||||
|
<config xsl:version="2.0">
|
||||||
|
<relpath>
|
||||||
|
<xsl:value-of select="/root/action/@directory"/>
|
||||||
|
<xsl:value-of select="/root/archive/@href"/>
|
||||||
|
</relpath>
|
||||||
|
<operation>read</operation>
|
||||||
|
<type>document</type>
|
||||||
|
</config>
|
||||||
|
</p:input>
|
||||||
|
<p:input name="param">
|
||||||
|
<empty/>
|
||||||
|
</p:input>
|
||||||
|
<p:output name="data" id="document" debug="document"/>
|
||||||
|
</p:processor>
|
||||||
|
|
||||||
|
<!-- Add the request and start of response records -->
|
||||||
|
<p:processor name="oxf:file-serializer">
|
||||||
|
<p:input name="config" transform="oxf:xslt" href="#warc">
|
||||||
|
<config xsl:version="2.0">
|
||||||
|
<file>
|
||||||
|
<xsl:value-of select="substring-after(/url, 'file:')"/>
|
||||||
|
</file>
|
||||||
|
<make-directories>false</make-directories>
|
||||||
|
<append>true</append>
|
||||||
|
</config>
|
||||||
|
</p:input>
|
||||||
|
<p:input name="data" transform="oxf:xslt" href="#document">
|
||||||
|
<xsl:stylesheet version="2.0">
|
||||||
|
<xsl:import href="warc-lib.xsl"/>
|
||||||
|
<xsl:template match="/">
|
||||||
|
<xsl:variable name="content" as="node()*">
|
||||||
|
<!-- Request -->
|
||||||
|
<version/>
|
||||||
|
<field>
|
||||||
|
<name>WARC-Type</name>
|
||||||
|
<value>request</value>
|
||||||
|
</field>
|
||||||
|
<field>
|
||||||
|
<name>WARC-Target-URI</name>
|
||||||
|
<value>
|
||||||
|
<xsl:value-of select="/archive/request/location"/>
|
||||||
|
</value>
|
||||||
|
</field>
|
||||||
|
<field>
|
||||||
|
<name>WARC-Date</name>
|
||||||
|
<value>
|
||||||
|
<!-- TODO: replace that by the archive sate -->
|
||||||
|
<xsl:value-of select="current-dateTime()"/>
|
||||||
|
</value>
|
||||||
|
</field>
|
||||||
|
<field>
|
||||||
|
<name>WARC-Record-ID</name>
|
||||||
|
<value>
|
||||||
|
<xsl:text><urn:uuid:</xsl:text>
|
||||||
|
<xsl:value-of select="translate(substring(/root/action/@directory, 1, string-length(/root/action/@directory) - 1), '/', '-')"/>
|
||||||
|
<xsl:text>></xsl:text>
|
||||||
|
</value>
|
||||||
|
</field>
|
||||||
|
<field>
|
||||||
|
<name>Content-Type</name>
|
||||||
|
<value>application/http;msgtype=request</value>
|
||||||
|
</field>
|
||||||
|
<!-- TODO: Content-Length: 381 -->
|
||||||
|
<CRLF/>
|
||||||
|
<xsl:apply-templates select="/archive/request" mode="warc"/>
|
||||||
|
<CRLF/>
|
||||||
|
<CRLF/>
|
||||||
|
<!-- Response -->
|
||||||
|
<version/>
|
||||||
|
<field>
|
||||||
|
<name>WARC-Type</name>
|
||||||
|
<value>response</value>
|
||||||
|
</field>
|
||||||
|
<field>
|
||||||
|
<name>WARC-Target-URI</name>
|
||||||
|
<value>
|
||||||
|
<xsl:value-of select="/archive/request/location"/>
|
||||||
|
</value>
|
||||||
|
</field>
|
||||||
|
<field>
|
||||||
|
<name>WARC-Date</name>
|
||||||
|
<value>
|
||||||
|
<!-- TODO: replace that by the archive sate -->
|
||||||
|
<xsl:value-of select="current-dateTime()"/>
|
||||||
|
</value>
|
||||||
|
</field>
|
||||||
|
<field>
|
||||||
|
<name>WARC-Record-ID</name>
|
||||||
|
<value>
|
||||||
|
<xsl:text><urn:uuid:</xsl:text>
|
||||||
|
<xsl:value-of select="translate(substring(/root/action/@directory, 1, string-length(/root/action/@directory) - 1), '/', '-')"/>
|
||||||
|
<xsl:text>></xsl:text>
|
||||||
|
</value>
|
||||||
|
</field>
|
||||||
|
<field>
|
||||||
|
<name>Content-Type</name>
|
||||||
|
<value>application/http;msgtype=response</value>
|
||||||
|
</field>
|
||||||
|
<!-- TODO: Content-Length: 381 -->
|
||||||
|
<CRLF/>
|
||||||
|
<xsl:apply-templates select="/archive/response" mode="warc"/>
|
||||||
|
<CRLF/>
|
||||||
|
|
||||||
|
</xsl:variable>
|
||||||
|
<document xsl:version="2.0" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="xs:string" content-type="text/plain">
|
||||||
|
<xsl:apply-templates select="$content" mode="warc"/>
|
||||||
|
</document>
|
||||||
|
</xsl:template>
|
||||||
|
</xsl:stylesheet>
|
||||||
|
</p:input>
|
||||||
|
</p:processor>
|
||||||
|
|
||||||
|
<!-- Add the response document to finalize the response record -->
|
||||||
|
<p:processor name="oxf:file-serializer">
|
||||||
|
<p:input name="config" transform="oxf:xslt" href="#warc">
|
||||||
|
<config xsl:version="2.0">
|
||||||
|
<file>
|
||||||
|
<xsl:value-of select="substring-after(/url, 'file:')"/>
|
||||||
|
</file>
|
||||||
|
<make-directories>false</make-directories>
|
||||||
|
<append>true</append>
|
||||||
|
</config>
|
||||||
|
</p:input>
|
||||||
|
<p:input name="data" href="#document#xpointer(/archive/response/document)"/>
|
||||||
|
</p:processor>
|
||||||
|
|
||||||
|
<p:processor name="oxf:identity">
|
||||||
|
<p:input name="data" href="current()"/>
|
||||||
|
<p:output name="data" ref="files"/>
|
||||||
|
</p:processor>
|
||||||
|
|
||||||
|
|
||||||
|
</p:for-each>
|
||||||
|
|
||||||
|
<p:processor name="oxf:null-serializer">
|
||||||
|
<p:input name="data" href="#files"/>
|
||||||
|
</p:processor>
|
||||||
|
|
||||||
|
<p:processor name="oxf:zip">
|
||||||
|
<p:input name="data" transform="oxf:xslt" href="#warc">
|
||||||
|
<files xsl:version="2.0" file-name="archive.zip">
|
||||||
|
<file name="archive.warc">
|
||||||
|
<xsl:value-of select="/url"/>
|
||||||
|
</file>
|
||||||
|
</files>
|
||||||
|
</p:input>
|
||||||
|
<p:output name="data" id="zip"/>
|
||||||
|
</p:processor>
|
||||||
|
|
||||||
|
<p:processor name="oxf:file-serializer">
|
||||||
|
<p:input name="config">
|
||||||
|
<config>
|
||||||
|
<file>/tmp/archive.zip</file>
|
||||||
|
</config>
|
||||||
|
</p:input>
|
||||||
|
<p:input name="data" href="#zip"/>
|
||||||
|
|
||||||
|
</p:processor>
|
||||||
|
|
||||||
|
<!--
|
||||||
|
<!-\- Update the queue -\->
|
||||||
|
<p:processor name="oxf:pipeline">
|
||||||
|
<p:input name="config" href="/data-access.xpl"/>
|
||||||
|
<p:input name="data" transform="oxf:xslt" href="#data">
|
||||||
|
<config xsl:version="2.0">
|
||||||
|
<relpath>queue.xml</relpath>
|
||||||
|
<operation>write</operation>
|
||||||
|
<type>xquery</type>
|
||||||
|
<parameter name="uuid" type="string">
|
||||||
|
<xsl:value-of select="/action/@uuid"/>
|
||||||
|
</parameter>
|
||||||
|
</config>
|
||||||
|
</p:input>
|
||||||
|
<p:input name="param">
|
||||||
|
<xquery><![CDATA[
|
||||||
|
|
||||||
|
for $a in /queue/action where $a/@uuid = $(uuid) return
|
||||||
|
update
|
||||||
|
delete $a
|
||||||
|
|
||||||
|
]]></xquery>
|
||||||
|
</p:input>
|
||||||
|
<p:output name="data" id="response4" debug="response"/>
|
||||||
|
</p:processor>
|
||||||
|
<p:processor name="oxf:null-serializer">
|
||||||
|
<p:input name="data" href="#response4"/>
|
||||||
|
</p:processor>-->
|
||||||
|
|
||||||
|
|
||||||
|
</p:config>
|
|
@ -0,0 +1,68 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:xd="http://www.oxygenxml.com/ns/doc/xsl" exclude-result-prefixes="xs xd"
|
||||||
|
version="2.0">
|
||||||
|
<xd:doc scope="stylesheet">
|
||||||
|
<xd:desc>
|
||||||
|
<xd:p><xd:b>Created on:</xd:b> Apr 13, 2012</xd:p>
|
||||||
|
<xd:p><xd:b>Author:</xd:b> vdv</xd:p>
|
||||||
|
<xd:p>Template library to produce WARC documents</xd:p>
|
||||||
|
</xd:desc>
|
||||||
|
</xd:doc>
|
||||||
|
|
||||||
|
<xsl:variable name="CRLF" select="' '"/>
|
||||||
|
<xsl:variable name="version">WARC/0.18</xsl:variable>
|
||||||
|
<xsl:template match="CRLF" mode="warc">
|
||||||
|
<xsl:value-of select="$CRLF"/>
|
||||||
|
</xsl:template>
|
||||||
|
<xsl:template match="version" mode="warc">
|
||||||
|
<xsl:value-of select="$version"/>
|
||||||
|
<xsl:value-of select="$CRLF"/>
|
||||||
|
</xsl:template>
|
||||||
|
<xsl:template match="field" mode="warc">
|
||||||
|
<xsl:value-of select="name"/>
|
||||||
|
<xsl:text>: </xsl:text>
|
||||||
|
<xsl:value-of select="value"/>
|
||||||
|
<xsl:value-of select="$CRLF"/>
|
||||||
|
</xsl:template>
|
||||||
|
<xsl:template match="line" mode="warc">
|
||||||
|
<xsl:value-of select="."/>
|
||||||
|
<xsl:value-of select="$CRLF"/>
|
||||||
|
</xsl:template>
|
||||||
|
|
||||||
|
<xsl:template match="request" mode="warc">
|
||||||
|
<line>
|
||||||
|
<xsl:value-of select="method"/>
|
||||||
|
<xsl:text> </xsl:text>
|
||||||
|
<xsl:value-of select="location"/>
|
||||||
|
<xsl:text> </xsl:text>
|
||||||
|
<!-- TODO: get the HTTP version -->
|
||||||
|
<xsl:text>HTTP/1.0</xsl:text>
|
||||||
|
</line>
|
||||||
|
<xsl:apply-templates select="header" mode="warc"/>
|
||||||
|
</xsl:template>
|
||||||
|
|
||||||
|
<xsl:template match="response" mode="warc">
|
||||||
|
<line>
|
||||||
|
<!-- TODO: get the HTTP version and status-->
|
||||||
|
<xsl:text>HTTP/1.1 </xsl:text>
|
||||||
|
<xsl:value-of select="code"/>
|
||||||
|
<xsl:text> OK</xsl:text>
|
||||||
|
</line>
|
||||||
|
<xsl:apply-templates select="header" mode="warc"/>
|
||||||
|
</xsl:template>
|
||||||
|
|
||||||
|
<xsl:template match="header" mode="warc">
|
||||||
|
<field>
|
||||||
|
<name>
|
||||||
|
<xsl:value-of select="@name"/>
|
||||||
|
</name>
|
||||||
|
<value>
|
||||||
|
<xsl:value-of select="."/>
|
||||||
|
</value>
|
||||||
|
</field>
|
||||||
|
</xsl:template>
|
||||||
|
|
||||||
|
<xsl:template match="text()" mode="warc"/>
|
||||||
|
|
||||||
|
|
||||||
|
</xsl:stylesheet>
|
Loading…
Reference in New Issue