Starting to support content lengths in warc archives

This commit is contained in:
Eric van der Vlist 2012-04-14 22:32:33 +02:00
parent 9d99928c60
commit ba51ddfb0b
2 changed files with 151 additions and 116 deletions

View File

@ -33,41 +33,42 @@
<xsl:import href="warc-lib.xsl"/> <xsl:import href="warc-lib.xsl"/>
<xsl:template match="/"> <xsl:template match="/">
<xsl:variable name="content" as="node()*"> <xsl:variable name="content" as="node()*">
<version/> <record>
<field> <header>
<name>WARC-Type</name> <field>
<value>warcinfo</value> <name>WARC-Type</name>
</field> <value>warcinfo</value>
<field> </field>
<name>WARC-Date</name> <field>
<value> <name>WARC-Date</name>
<xsl:value-of select="current-dateTime()"/> <value>
</value> <xsl:value-of select="current-dateTime()"/>
</field> </value>
<field> </field>
<name>WARC-Record-ID</name> <field>
<value> <name>WARC-Record-ID</name>
<xsl:text>&lt;urn:uuid:</xsl:text> <value>
<xsl:value-of select="translate(substring(/root/action/@directory, 1, string-length(/root/action/@directory) - 1), '/', '-')"/> <xsl:text>&lt;urn:uuid:</xsl:text>
<xsl:text>></xsl:text> <xsl:value-of select="translate(substring(/root/action/@directory, 1, string-length(/root/action/@directory) - 1), '/', '-')"/>
</value> <xsl:text>></xsl:text>
</field> </value>
<field> </field>
<name>Content-Type</name> <field>
<value>application/warc-fields</value> <name>Content-Type</name>
</field> <value>application/warc-fields</value>
<!-- TODO: Content-Length: 381 --> </field>
<CRLF/> </header>
<field> <block>
<name>software</name> <field>
<value>Owark 0.3 http://owark.org</value> <name>software</name>
</field> <value>Owark 0.3 http://owark.org</value>
<field> </field>
<name>format</name> <field>
<value>WARC file version 0.18</value> <name>format</name>
</field> <value>WARC file version 0.18</value>
<CRLF/> </field>
<CRLF/> </block>
</record>
<!-- <!--
@ -130,82 +131,87 @@ conformsTo:
<xsl:stylesheet version="2.0"> <xsl:stylesheet version="2.0">
<xsl:import href="warc-lib.xsl"/> <xsl:import href="warc-lib.xsl"/>
<xsl:template match="/"> <xsl:template match="/">
<xsl:variable name="content" as="node()*"> <xsl:variable name="request" as="node()*">
<!-- Request --> <!-- Request -->
<version/> <record>
<field> <header>
<name>WARC-Type</name> <field>
<value>request</value> <name>WARC-Type</name>
</field> <value>request</value>
<field> </field>
<name>WARC-Target-URI</name> <field>
<value> <name>WARC-Target-URI</name>
<xsl:value-of select="/archive/request/location"/> <value>
</value> <xsl:value-of select="/archive/request/location"/>
</field> </value>
<field> </field>
<name>WARC-Date</name> <field>
<value> <name>WARC-Date</name>
<!-- TODO: replace that by the archive date --> <value>
<xsl:value-of select="current-dateTime()"/> <!-- TODO: replace that by the archive date -->
</value> <xsl:value-of select="current-dateTime()"/>
</field> </value>
<field> </field>
<name>WARC-Record-ID</name> <field>
<value> <name>WARC-Record-ID</name>
<xsl:text>&lt;urn:uuid:</xsl:text> <value>
<xsl:value-of select="translate(substring(/root/action/@directory, 1, string-length(/root/action/@directory) - 1), '/', '-')"/> <xsl:text>&lt;urn:uuid:</xsl:text>
<xsl:text>></xsl:text> <xsl:value-of select="translate(substring(/root/action/@directory, 1, string-length(/root/action/@directory) - 1), '/', '-')"/>
</value> <xsl:text>></xsl:text>
</field> </value>
<field> </field>
<name>Content-Type</name> <field>
<value>application/http;msgtype=request</value> <name>Content-Type</name>
</field> <value>application/http;msgtype=request</value>
<!-- TODO: Content-Length: 381 --> </field>
<CRLF/> </header>
<xsl:apply-templates select="/archive/request" mode="warc"/> <block>
<CRLF/> <xsl:apply-templates select="/archive/request" mode="warc-http"/>
<CRLF/> </block>
<!-- Response --> </record>
<version/> </xsl:variable>
<field> <!-- Response -->
<name>WARC-Type</name> <xsl:variable name="response" as="node()*">
<value>response</value> <record>
</field> <header>
<field> <field>
<name>WARC-Target-URI</name> <name>WARC-Type</name>
<value> <value>response</value>
<xsl:value-of select="/archive/request/location"/> </field>
</value> <field>
</field> <name>WARC-Target-URI</name>
<field> <value>
<name>WARC-Date</name> <xsl:value-of select="/archive/request/location"/>
<value> </value>
<!-- TODO: replace that by the archive date --> </field>
<xsl:value-of select="current-dateTime()"/> <field>
</value> <name>WARC-Date</name>
</field> <value>
<field> <!-- TODO: replace that by the archive sate -->
<name>WARC-Record-ID</name> <xsl:value-of select="current-dateTime()"/>
<value> </value>
<xsl:text>&lt;urn:uuid:</xsl:text> </field>
<xsl:value-of select="translate(substring(/root/action/@directory, 1, string-length(/root/action/@directory) - 1), '/', '-')"/> <field>
<xsl:text>></xsl:text> <name>WARC-Record-ID</name>
</value> <value>
</field> <xsl:text>&lt;urn:uuid:</xsl:text>
<field> <xsl:value-of select="translate(substring(/root/action/@directory, 1, string-length(/root/action/@directory) - 1), '/', '-')"/>
<name>Content-Type</name> <xsl:text>></xsl:text>
<value>application/http;msgtype=response</value> </value>
</field> </field>
<!-- TODO: Content-Length: 381 --> <field>
<CRLF/> <name>Content-Type</name>
<xsl:apply-templates select="/archive/response" mode="warc"/> <value>application/http;msgtype=response</value>
<CRLF/> </field>
</header>
<block>
<xsl:apply-templates select="/archive/response" mode="warc-http"/>
</block>
</record>
</xsl:variable> </xsl:variable>
<document xsl:version="2.0" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="xs:string" content-type="text/plain"> <document xsl:version="2.0" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="xs:string" content-type="text/plain">
<xsl:apply-templates select="$content" mode="warc"/> <xsl:apply-templates select="$request" mode="warc"/>
<xsl:apply-templates select="$response" mode="warc"/>
</document> </document>
</xsl:template> </xsl:template>
</xsl:stylesheet> </xsl:stylesheet>

View File

@ -11,10 +11,10 @@
<xsl:variable name="CRLF" select="'&#13;&#10;'"/> <xsl:variable name="CRLF" select="'&#13;&#10;'"/>
<xsl:variable name="version">WARC/0.18</xsl:variable> <xsl:variable name="version">WARC/0.18</xsl:variable>
<xsl:template match="CRLF" mode="warc"> <xsl:template match="CRLF" name="CRLF" mode="warc">
<xsl:value-of select="$CRLF"/> <xsl:value-of select="$CRLF"/>
</xsl:template> </xsl:template>
<xsl:template match="version" mode="warc"> <xsl:template match="version" name="version" mode="warc">
<xsl:value-of select="$version"/> <xsl:value-of select="$version"/>
<xsl:value-of select="$CRLF"/> <xsl:value-of select="$CRLF"/>
</xsl:template> </xsl:template>
@ -29,7 +29,36 @@
<xsl:value-of select="$CRLF"/> <xsl:value-of select="$CRLF"/>
</xsl:template> </xsl:template>
<xsl:template match="request" mode="warc"> <xsl:template match="record" mode="warc">
<xsl:apply-templates select="header" mode="warc"/>
<xsl:variable name="block">
<xsl:apply-templates select="block" mode="warc"/>
</xsl:variable>
<xsl:variable name="content-length">
<field>
<name>Content-Length</name>
<value>
<xsl:value-of select="string-length($block)"/>
</value>
</field>
</xsl:variable>
<xsl:apply-templates select="$content-length" mode="warc"/>
<xsl:call-template name="CRLF"/>
<xsl:value-of select="$block"/>
<xsl:call-template name="CRLF"/>
<xsl:call-template name="CRLF"/>
</xsl:template>
<xsl:template match="block" mode="warc">
<xsl:apply-templates mode="warc"/>
</xsl:template>
<xsl:template match="header" mode="warc">
<xsl:call-template name="version"/>
<xsl:apply-templates select="*" mode="warc"/>
</xsl:template>
<xsl:template match="request" mode="warc-http">
<line> <line>
<xsl:value-of select="method"/> <xsl:value-of select="method"/>
<xsl:text> </xsl:text> <xsl:text> </xsl:text>
@ -38,20 +67,20 @@
<!-- TODO: get the HTTP version --> <!-- TODO: get the HTTP version -->
<xsl:text>HTTP/1.0</xsl:text> <xsl:text>HTTP/1.0</xsl:text>
</line> </line>
<xsl:apply-templates select="header" mode="warc"/> <xsl:apply-templates select="header" mode="warc-http"/>
</xsl:template> </xsl:template>
<xsl:template match="response" mode="warc"> <xsl:template match="response" mode="warc-http">
<line> <line>
<!-- TODO: get the HTTP version and status--> <!-- TODO: get the HTTP version and status-->
<xsl:text>HTTP/1.1 </xsl:text> <xsl:text>HTTP/1.1 </xsl:text>
<xsl:value-of select="code"/> <xsl:value-of select="code"/>
<xsl:text> OK</xsl:text> <xsl:text> OK</xsl:text>
</line> </line>
<xsl:apply-templates select="header" mode="warc"/> <xsl:apply-templates select="header" mode="warc-http"/>
</xsl:template> </xsl:template>
<xsl:template match="header" mode="warc"> <xsl:template match="header" mode="warc-http">
<field> <field>
<name> <name>
<xsl:value-of select="@name"/> <xsl:value-of select="@name"/>
@ -62,7 +91,7 @@
</field> </field>
</xsl:template> </xsl:template>
<xsl:template match="text()" mode="warc"/> <xsl:template match="text()" mode="warc warc-http"/>
</xsl:stylesheet> </xsl:stylesheet>