Still a work in progress, but the WARC archive now validates with warc-tools' warcvalid.py...

This commit is contained in:
Eric van der Vlist 2012-04-15 00:12:29 +02:00
parent ba51ddfb0b
commit ad35672603
2 changed files with 12 additions and 4 deletions

View File

@ -211,7 +211,9 @@ conformsTo:
</xsl:variable> </xsl:variable>
<document xsl:version="2.0" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="xs:string" content-type="text/plain"> <document xsl:version="2.0" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="xs:string" content-type="text/plain">
<xsl:apply-templates select="$request" mode="warc"/> <xsl:apply-templates select="$request" mode="warc"/>
<xsl:apply-templates select="$response" mode="warc"/> <xsl:apply-templates select="$response" mode="warc">
<xsl:with-param name="document-length" as="xs:integer" select="string-length(translate(/archive/response/document, ' &#xa;&#xd;', '')) * 3 div 4" tunnel="yes"/>
</xsl:apply-templates>
</document> </document>
</xsl:template> </xsl:template>
</xsl:stylesheet> </xsl:stylesheet>

View File

@ -30,6 +30,9 @@
</xsl:template> </xsl:template>
<xsl:template match="record" mode="warc"> <xsl:template match="record" mode="warc">
<xsl:param name="document-length" as="xs:integer" select="0" tunnel="yes"/>
<xsl:call-template name="CRLF"/>
<xsl:call-template name="CRLF"/>
<xsl:apply-templates select="header" mode="warc"/> <xsl:apply-templates select="header" mode="warc"/>
<xsl:variable name="block"> <xsl:variable name="block">
<xsl:apply-templates select="block" mode="warc"/> <xsl:apply-templates select="block" mode="warc"/>
@ -38,15 +41,13 @@
<field> <field>
<name>Content-Length</name> <name>Content-Length</name>
<value> <value>
<xsl:value-of select="string-length($block)"/> <xsl:value-of select="string-length($block) + $document-length "/>
</value> </value>
</field> </field>
</xsl:variable> </xsl:variable>
<xsl:apply-templates select="$content-length" mode="warc"/> <xsl:apply-templates select="$content-length" mode="warc"/>
<xsl:call-template name="CRLF"/> <xsl:call-template name="CRLF"/>
<xsl:value-of select="$block"/> <xsl:value-of select="$block"/>
<xsl:call-template name="CRLF"/>
<xsl:call-template name="CRLF"/>
</xsl:template> </xsl:template>
<xsl:template match="block" mode="warc"> <xsl:template match="block" mode="warc">
@ -71,6 +72,11 @@
</xsl:template> </xsl:template>
<xsl:template match="response" mode="warc-http"> <xsl:template match="response" mode="warc-http">
<!--<xsl:message>
<xsl:value-of select="string-length(document)"/>
<xsl:text> - </xsl:text>
<xsl:value-of select="string-length(translate(document, ' &#xa;&#xd;', ''))"/>
</xsl:message>-->
<line> <line>
<!-- TODO: get the HTTP version and status--> <!-- TODO: get the HTTP version and status-->
<xsl:text>HTTP/1.1 </xsl:text> <xsl:text>HTTP/1.1 </xsl:text>