Starting to support content lengths in warc archives
This commit is contained in:
parent
9d99928c60
commit
ba51ddfb0b
|
@ -33,41 +33,42 @@
|
|||
<xsl:import href="warc-lib.xsl"/>
|
||||
<xsl:template match="/">
|
||||
<xsl:variable name="content" as="node()*">
|
||||
<version/>
|
||||
<field>
|
||||
<name>WARC-Type</name>
|
||||
<value>warcinfo</value>
|
||||
</field>
|
||||
<field>
|
||||
<name>WARC-Date</name>
|
||||
<value>
|
||||
<xsl:value-of select="current-dateTime()"/>
|
||||
</value>
|
||||
</field>
|
||||
<field>
|
||||
<name>WARC-Record-ID</name>
|
||||
<value>
|
||||
<xsl:text><urn:uuid:</xsl:text>
|
||||
<xsl:value-of select="translate(substring(/root/action/@directory, 1, string-length(/root/action/@directory) - 1), '/', '-')"/>
|
||||
<xsl:text>></xsl:text>
|
||||
</value>
|
||||
</field>
|
||||
<field>
|
||||
<name>Content-Type</name>
|
||||
<value>application/warc-fields</value>
|
||||
</field>
|
||||
<!-- TODO: Content-Length: 381 -->
|
||||
<CRLF/>
|
||||
<field>
|
||||
<name>software</name>
|
||||
<value>Owark 0.3 http://owark.org</value>
|
||||
</field>
|
||||
<field>
|
||||
<name>format</name>
|
||||
<value>WARC file version 0.18</value>
|
||||
</field>
|
||||
<CRLF/>
|
||||
<CRLF/>
|
||||
<record>
|
||||
<header>
|
||||
<field>
|
||||
<name>WARC-Type</name>
|
||||
<value>warcinfo</value>
|
||||
</field>
|
||||
<field>
|
||||
<name>WARC-Date</name>
|
||||
<value>
|
||||
<xsl:value-of select="current-dateTime()"/>
|
||||
</value>
|
||||
</field>
|
||||
<field>
|
||||
<name>WARC-Record-ID</name>
|
||||
<value>
|
||||
<xsl:text><urn:uuid:</xsl:text>
|
||||
<xsl:value-of select="translate(substring(/root/action/@directory, 1, string-length(/root/action/@directory) - 1), '/', '-')"/>
|
||||
<xsl:text>></xsl:text>
|
||||
</value>
|
||||
</field>
|
||||
<field>
|
||||
<name>Content-Type</name>
|
||||
<value>application/warc-fields</value>
|
||||
</field>
|
||||
</header>
|
||||
<block>
|
||||
<field>
|
||||
<name>software</name>
|
||||
<value>Owark 0.3 http://owark.org</value>
|
||||
</field>
|
||||
<field>
|
||||
<name>format</name>
|
||||
<value>WARC file version 0.18</value>
|
||||
</field>
|
||||
</block>
|
||||
</record>
|
||||
<!--
|
||||
|
||||
|
||||
|
@ -130,82 +131,87 @@ conformsTo:
|
|||
<xsl:stylesheet version="2.0">
|
||||
<xsl:import href="warc-lib.xsl"/>
|
||||
<xsl:template match="/">
|
||||
<xsl:variable name="content" as="node()*">
|
||||
<xsl:variable name="request" as="node()*">
|
||||
<!-- Request -->
|
||||
<version/>
|
||||
<field>
|
||||
<name>WARC-Type</name>
|
||||
<value>request</value>
|
||||
</field>
|
||||
<field>
|
||||
<name>WARC-Target-URI</name>
|
||||
<value>
|
||||
<xsl:value-of select="/archive/request/location"/>
|
||||
</value>
|
||||
</field>
|
||||
<field>
|
||||
<name>WARC-Date</name>
|
||||
<value>
|
||||
<!-- TODO: replace that by the archive date -->
|
||||
<xsl:value-of select="current-dateTime()"/>
|
||||
</value>
|
||||
</field>
|
||||
<field>
|
||||
<name>WARC-Record-ID</name>
|
||||
<value>
|
||||
<xsl:text><urn:uuid:</xsl:text>
|
||||
<xsl:value-of select="translate(substring(/root/action/@directory, 1, string-length(/root/action/@directory) - 1), '/', '-')"/>
|
||||
<xsl:text>></xsl:text>
|
||||
</value>
|
||||
</field>
|
||||
<field>
|
||||
<name>Content-Type</name>
|
||||
<value>application/http;msgtype=request</value>
|
||||
</field>
|
||||
<!-- TODO: Content-Length: 381 -->
|
||||
<CRLF/>
|
||||
<xsl:apply-templates select="/archive/request" mode="warc"/>
|
||||
<CRLF/>
|
||||
<CRLF/>
|
||||
<!-- Response -->
|
||||
<version/>
|
||||
<field>
|
||||
<name>WARC-Type</name>
|
||||
<value>response</value>
|
||||
</field>
|
||||
<field>
|
||||
<name>WARC-Target-URI</name>
|
||||
<value>
|
||||
<xsl:value-of select="/archive/request/location"/>
|
||||
</value>
|
||||
</field>
|
||||
<field>
|
||||
<name>WARC-Date</name>
|
||||
<value>
|
||||
<!-- TODO: replace that by the archive date -->
|
||||
<xsl:value-of select="current-dateTime()"/>
|
||||
</value>
|
||||
</field>
|
||||
<field>
|
||||
<name>WARC-Record-ID</name>
|
||||
<value>
|
||||
<xsl:text><urn:uuid:</xsl:text>
|
||||
<xsl:value-of select="translate(substring(/root/action/@directory, 1, string-length(/root/action/@directory) - 1), '/', '-')"/>
|
||||
<xsl:text>></xsl:text>
|
||||
</value>
|
||||
</field>
|
||||
<field>
|
||||
<name>Content-Type</name>
|
||||
<value>application/http;msgtype=response</value>
|
||||
</field>
|
||||
<!-- TODO: Content-Length: 381 -->
|
||||
<CRLF/>
|
||||
<xsl:apply-templates select="/archive/response" mode="warc"/>
|
||||
<CRLF/>
|
||||
|
||||
<record>
|
||||
<header>
|
||||
<field>
|
||||
<name>WARC-Type</name>
|
||||
<value>request</value>
|
||||
</field>
|
||||
<field>
|
||||
<name>WARC-Target-URI</name>
|
||||
<value>
|
||||
<xsl:value-of select="/archive/request/location"/>
|
||||
</value>
|
||||
</field>
|
||||
<field>
|
||||
<name>WARC-Date</name>
|
||||
<value>
|
||||
<!-- TODO: replace that by the archive date -->
|
||||
<xsl:value-of select="current-dateTime()"/>
|
||||
</value>
|
||||
</field>
|
||||
<field>
|
||||
<name>WARC-Record-ID</name>
|
||||
<value>
|
||||
<xsl:text><urn:uuid:</xsl:text>
|
||||
<xsl:value-of select="translate(substring(/root/action/@directory, 1, string-length(/root/action/@directory) - 1), '/', '-')"/>
|
||||
<xsl:text>></xsl:text>
|
||||
</value>
|
||||
</field>
|
||||
<field>
|
||||
<name>Content-Type</name>
|
||||
<value>application/http;msgtype=request</value>
|
||||
</field>
|
||||
</header>
|
||||
<block>
|
||||
<xsl:apply-templates select="/archive/request" mode="warc-http"/>
|
||||
</block>
|
||||
</record>
|
||||
</xsl:variable>
|
||||
<!-- Response -->
|
||||
<xsl:variable name="response" as="node()*">
|
||||
<record>
|
||||
<header>
|
||||
<field>
|
||||
<name>WARC-Type</name>
|
||||
<value>response</value>
|
||||
</field>
|
||||
<field>
|
||||
<name>WARC-Target-URI</name>
|
||||
<value>
|
||||
<xsl:value-of select="/archive/request/location"/>
|
||||
</value>
|
||||
</field>
|
||||
<field>
|
||||
<name>WARC-Date</name>
|
||||
<value>
|
||||
<!-- TODO: replace that by the archive date -->
|
||||
<xsl:value-of select="current-dateTime()"/>
|
||||
</value>
|
||||
</field>
|
||||
<field>
|
||||
<name>WARC-Record-ID</name>
|
||||
<value>
|
||||
<xsl:text><urn:uuid:</xsl:text>
|
||||
<xsl:value-of select="translate(substring(/root/action/@directory, 1, string-length(/root/action/@directory) - 1), '/', '-')"/>
|
||||
<xsl:text>></xsl:text>
|
||||
</value>
|
||||
</field>
|
||||
<field>
|
||||
<name>Content-Type</name>
|
||||
<value>application/http;msgtype=response</value>
|
||||
</field>
|
||||
</header>
|
||||
<block>
|
||||
<xsl:apply-templates select="/archive/response" mode="warc-http"/>
|
||||
</block>
|
||||
</record>
|
||||
</xsl:variable>
|
||||
<document xsl:version="2.0" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="xs:string" content-type="text/plain">
|
||||
<xsl:apply-templates select="$content" mode="warc"/>
|
||||
<xsl:apply-templates select="$request" mode="warc"/>
|
||||
<xsl:apply-templates select="$response" mode="warc"/>
|
||||
</document>
|
||||
</xsl:template>
|
||||
</xsl:stylesheet>
|
||||
|
|
|
@ -11,10 +11,10 @@
|
|||
|
||||
<!--
  Line terminator written after every WARC header line and around record blocks.
  It is spelled with character references on purpose: literal carriage-return /
  line-feed characters inside an attribute value are normalized to spaces by the
  XML parser, which would silently turn every line break into blanks.
-->
<xsl:variable name="CRLF" select="'&#13;&#10;'"/>
<!-- Version token emitted as the first line of each record header. -->
<xsl:variable name="version">WARC/0.18</xsl:variable>
|
||||
<xsl:template match="CRLF" mode="warc">
|
||||
<xsl:template match="CRLF" name="CRLF" mode="warc">
|
||||
<xsl:value-of select="$CRLF"/>
|
||||
</xsl:template>
|
||||
<xsl:template match="version" mode="warc">
|
||||
<xsl:template match="version" name="version" mode="warc">
|
||||
<xsl:value-of select="$version"/>
|
||||
<xsl:value-of select="$CRLF"/>
|
||||
</xsl:template>
|
||||
|
@ -29,7 +29,36 @@
|
|||
<xsl:value-of select="$CRLF"/>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="request" mode="warc">
|
||||
<!--
  Serializes one <record> element in "warc" mode.

  Output order:
    1. the <header> child (rendered by the header template in this mode),
    2. a generated Content-Length field,
    3. an empty line (CRLF),
    4. the rendered <block> child,
    5. two CRLF closing the record.

  The block is rendered into a variable first because its size has to be known
  before the header section can be terminated.
-->
<xsl:template match="record" mode="warc">
  <!-- Render the payload once; it is both measured and emitted below. -->
  <xsl:variable name="block">
    <xsl:apply-templates select="block" mode="warc"/>
  </xsl:variable>
  <xsl:variable name="payload" select="string($block)"/>
  <!--
    Content-Length is a number of octets, not of characters, so string-length()
    under-reports as soon as the block holds a non-ASCII character.
    The count below is the UTF-8 encoded size of each code point.
    NOTE(review): this assumes the result is finally written out as UTF-8;
    confirm against the serialization settings of the calling pipeline.
    sum() over an empty sequence yields 0, so an empty block still gets
    "Content-Length: 0".
  -->
  <xsl:variable name="octets"
                select="sum(for $cp in string-to-codepoints($payload)
                            return if ($cp lt 128) then 1
                                   else if ($cp lt 2048) then 2
                                   else if ($cp lt 65536) then 3
                                   else 4)"/>
  <!-- Built as a <field> so it goes through the same formatting as the other header fields. -->
  <xsl:variable name="content-length">
    <field>
      <name>Content-Length</name>
      <value>
        <xsl:value-of select="$octets"/>
      </value>
    </field>
  </xsl:variable>
  <xsl:apply-templates select="header" mode="warc"/>
  <xsl:apply-templates select="$content-length" mode="warc"/>
  <!-- Blank line separating the header section from the block. -->
  <xsl:call-template name="CRLF"/>
  <xsl:value-of select="$payload"/>
  <!-- Two CRLF terminate the record. -->
  <xsl:call-template name="CRLF"/>
  <xsl:call-template name="CRLF"/>
</xsl:template>
|
||||
|
||||
<!--
  A <block> adds no framing of its own: every child node is handed on to the
  "warc" mode templates and their output is concatenated.
-->
<xsl:template match="block" mode="warc">
  <xsl:apply-templates select="node()" mode="warc"/>
</xsl:template>
|
||||
|
||||
<!--
  Renders a record <header> in "warc" mode: first the version line (via the
  named template "version"), then each child element in document order.
-->
<xsl:template match="header" mode="warc">
  <xsl:call-template name="version"/>
  <xsl:apply-templates select="child::*" mode="warc"/>
</xsl:template>
|
||||
|
||||
<xsl:template match="request" mode="warc-http">
|
||||
<line>
|
||||
<xsl:value-of select="method"/>
|
||||
<xsl:text> </xsl:text>
|
||||
|
@ -38,20 +67,20 @@
|
|||
<!-- TODO: get the HTTP version -->
|
||||
<xsl:text>HTTP/1.0</xsl:text>
|
||||
</line>
|
||||
<xsl:apply-templates select="header" mode="warc"/>
|
||||
<xsl:apply-templates select="header" mode="warc-http"/>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="response" mode="warc">
|
||||
<xsl:template match="response" mode="warc-http">
|
||||
<line>
|
||||
<!-- TODO: get the HTTP version and status-->
|
||||
<xsl:text>HTTP/1.1 </xsl:text>
|
||||
<xsl:value-of select="code"/>
|
||||
<xsl:text> OK</xsl:text>
|
||||
</line>
|
||||
<xsl:apply-templates select="header" mode="warc"/>
|
||||
<xsl:apply-templates select="header" mode="warc-http"/>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="header" mode="warc">
|
||||
<xsl:template match="header" mode="warc-http">
|
||||
<field>
|
||||
<name>
|
||||
<xsl:value-of select="@name"/>
|
||||
|
@ -62,7 +91,7 @@
|
|||
</field>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="text()" mode="warc"/>
|
||||
<xsl:template match="text()" mode="warc warc-http"/>
|
||||
|
||||
|
||||
</xsl:stylesheet>
|
||||
|
|
Loading…
Reference in New Issue