Rewriting links in HTML and CSS resources within WARC archives

This commit is contained in:
Eric van der Vlist 2012-04-27 18:29:15 +02:00
parent 5b162a64df
commit 9bce34f7c6
4 changed files with 187 additions and 28 deletions

View File

@ -0,0 +1,68 @@
<p:config xmlns:p="http://www.orbeon.com/oxf/pipeline"
          xmlns:oxf="http://www.orbeon.com/oxf/processors"
          xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
          xmlns:saxon="http://saxon.sf.net/">
    <!--
        Rewrites the url(...) references found in a CSS resource.

        Inputs:
          record       the record; /record/content/document holds the CSS to rewrite
          index-entry  the <resource> entry describing this CSS (its <uri> is the base URI)
          index        the index of resources, used to map URIs to local names
        Output:
          rewritten    the CSS text document with its url(...) references rewritten
    -->
    <p:param name="record" type="input"/>
    <p:param name="index-entry" type="input"/>
    <p:param name="index" type="input"/>
    <p:param name="rewritten" type="output"/>

    <!-- Store the document in a session-scoped file; the data output is read
         by the next step as the URL of that file. -->
    <p:processor name="oxf:file-serializer">
        <p:input name="config">
            <config>
                <scope>session</scope>
            </config>
        </p:input>
        <p:input name="data" href="#record#xpointer(/record/content/document)"/>
        <p:output name="data" id="url-written"/>
    </p:processor>

    <!-- And read it back as CSS text -->
    <p:processor name="oxf:url-generator">
        <p:input name="config" transform="oxf:xslt" href="#url-written">
            <config xsl:version="2.0">
                <url>
                    <xsl:value-of select="/*"/>
                </url>
                <content-type>text/css</content-type>
                <mode>text</mode>
            </config>
        </p:input>
        <p:output name="data" id="css" debug="css"/>
    </p:processor>

    <!-- Update the links -->
    <!-- TODO: @import "file.css" without url(...) is not rewritten -->
    <p:processor name="oxf:unsafe-xslt">
        <p:input name="data" href="#css"/>
        <p:input name="index-entry" href="#index-entry"/>
        <p:input name="index" href="#index"/>
        <p:input name="config">
            <xsl:stylesheet version="2.0">
                <xsl:variable name="index" select="doc('input:index')/*"/>
                <xsl:variable name="resource" select="doc('input:index-entry')/resource"/>
                <xsl:variable name="base" select="$resource/uri"/>
                <xsl:template match="/document">
                    <xsl:copy>
                        <xsl:copy-of select="@*"/>
                        <!-- Matches url(x), url('x') and url("x"); whitespace inside the
                             parentheses is tolerated and the function name is matched
                             case-insensitively, as CSS allows both. -->
                        <xsl:analyze-string select="." regex="url\(\s*[&quot;']?([^)'&quot;]+)[&quot;']?\s*\)" flags="i">
                            <xsl:matching-substring>
                                <!-- An unquoted reference may carry surrounding whitespace -->
                                <xsl:variable name="ref" select="replace(regex-group(1), '^\s+|\s+$', '')"/>
                                <xsl:variable name="resolved" select="resolve-uri($ref, $base)"/>
                                <!-- The index is looked up without the fragment identifier -->
                                <xsl:variable name="abs" select="substring-before(concat($resolved, '#'), '#')"/>
                                <!-- [1]: several entries may share a URI; emit a single name -->
                                <xsl:variable name="local" select="($index/resource[(uri, same-as) = $abs]/local-name)[1]"/>
                                <xsl:text>url(</xsl:text>
                                <!-- Fall back to the absolute URI when the target is not in the
                                     index, instead of emitting an empty url() that would lose the
                                     reference (data: URIs, resources that were not archived). -->
                                <xsl:value-of select="if (string($local) != '') then $local else $resolved"/>
                                <xsl:text>)</xsl:text>
                            </xsl:matching-substring>
                            <xsl:non-matching-substring>
                                <xsl:value-of select="."/>
                            </xsl:non-matching-substring>
                        </xsl:analyze-string>
                    </xsl:copy>
                </xsl:template>
            </xsl:stylesheet>
        </p:input>
        <p:output name="data" ref="rewritten" debug="rewritten"/>
    </p:processor>
</p:config>

View File

@ -0,0 +1,80 @@
<p:config xmlns:p="http://www.orbeon.com/oxf/pipeline"
          xmlns:oxf="http://www.orbeon.com/oxf/processors"
          xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
          xmlns:saxon="http://saxon.sf.net/">
    <!--
        Rewrites the links found in an HTML resource.

        Inputs:
          record       the record; /record/content/document holds the HTML to rewrite
          index-entry  the <resource> entry describing this page (its <uri> is the base URI)
          index        the index of resources, used to map URIs to local names
        Output:
          rewritten    the rewritten page, serialized by oxf:xml-converter
    -->
    <p:param name="record" type="input"/>
    <p:param name="index-entry" type="input"/>
    <p:param name="index" type="input"/>
    <p:param name="rewritten" type="output"/>

    <!-- Store the document in a session-scoped file; the data output is read
         by the next step as the URL of that file. -->
    <p:processor name="oxf:file-serializer">
        <p:input name="config">
            <config>
                <scope>session</scope>
            </config>
        </p:input>
        <p:input name="data" href="#record#xpointer(/record/content/document)"/>
        <p:output name="data" id="url-written"/>
    </p:processor>

    <!-- And read it back as HTML -->
    <p:processor name="oxf:url-generator">
        <p:input name="config" transform="oxf:xslt" href="#url-written">
            <config xsl:version="2.0">
                <url>
                    <xsl:value-of select="/*"/>
                </url>
                <mode>html</mode>
            </config>
        </p:input>
        <p:output name="data" id="html" debug="html"/>
    </p:processor>

    <!-- Update the links -->
    <!-- TODO: support links in inline CSS -->
    <!-- TODO: support iframes and objects -->
    <p:processor name="oxf:unsafe-xslt">
        <p:input name="data" href="#html"/>
        <p:input name="index-entry" href="#index-entry"/>
        <p:input name="index" href="#index"/>
        <p:input name="config">
            <xsl:stylesheet version="2.0">
                <xsl:variable name="index" select="doc('input:index')/*"/>
                <xsl:variable name="resource" select="doc('input:index-entry')/resource"/>
                <xsl:variable name="base" select="$resource/uri"/>

                <!-- Identity transformation for everything that is not a link -->
                <xsl:template match="@*|node()">
                    <xsl:copy>
                        <xsl:apply-templates select="@*|node()"/>
                    </xsl:copy>
                </xsl:template>

                <!-- Embedded resources (stylesheets, images, scripts): point to the local
                     name recorded in the index. @rel is a space-separated, case-insensitive
                     list of tokens, so it is tokenized rather than compared as a whole. -->
                <xsl:template match="link[tokenize(lower-case(@rel), '\s+') = 'stylesheet']/@href | img/@src | script/@src">
                    <!-- normalize-space: attribute values may carry stray whitespace -->
                    <xsl:variable name="resolved" select="resolve-uri(normalize-space(.), $base)"/>
                    <!-- The index is looked up without the fragment identifier -->
                    <xsl:variable name="abs" select="substring-before(concat($resolved, '#'), '#')"/>
                    <!-- [1]: several entries may share a URI; emit a single name -->
                    <xsl:variable name="local" select="($index/resource[(uri, same-as) = $abs]/local-name)[1]"/>
                    <xsl:attribute name="{name(.)}">
                        <!-- Fall back to the absolute URI when the target is not in the index,
                             instead of writing an empty attribute that would lose the link. -->
                        <xsl:value-of select="if (string($local) != '') then $local else $resolved"/>
                    </xsl:attribute>
                </xsl:template>

                <!-- Other links: make them absolute. not(...) also covers <link> elements
                     without any @rel, which "@rel != 'stylesheet'" would never match. -->
                <xsl:template match="link[not(tokenize(lower-case(@rel), '\s+') = 'stylesheet')]/@href | a/@href">
                    <xsl:attribute name="{name(.)}">
                        <xsl:value-of select="resolve-uri(normalize-space(.), $base)"/>
                    </xsl:attribute>
                </xsl:template>
            </xsl:stylesheet>
        </p:input>
        <p:output name="data" id="html-rewritten" debug="rewritten"/>
    </p:processor>

    <!-- Serialize the rewritten document. -->
    <!-- NOTE(review): the page is serialized as application/xml, not as HTML; empty
         elements such as script may then be written self-closed. Confirm that the
         consumer of the "rewritten" output expects this. -->
    <p:processor name="oxf:xml-converter">
        <p:input name="config">
            <config>
                <content-type>application/xml</content-type>
                <encoding>utf-8</encoding>
                <version>1.0</version>
            </config>
        </p:input>
        <p:input name="data" href="#html-rewritten"/>
        <p:output name="data" ref="rewritten"/>
    </p:processor>
</p:config>

View File

@ -93,12 +93,31 @@
<p:output name="data" id="index-entry" debug="index-entry"/>
</p:processor>
<p:choose href="#index-entry">
<p:when test="/entry/embeds">
<p:when test="/resource/embeds">
<!-- The resource has embedded content and must be rewritten -->
<p:processor name="oxf:identity">
<p:input name="data" href="current()#xpointer(/record/content/document)"/>
<p:output name="data" id="document"/>
<!-- Call the corresponding pipeline -->
<!-- Builds the URL oxf:/actions/mediatypes/warc-[type].xpl from /resource/type of the
     index entry and reads that document; its content is exposed as "pipeline".
     The type is restricted to letters, digits and "+", ".", "_", "-" so that a value
     containing "/" or other URL syntax cannot point the URL at a different path.
     NOTE(review): the type appears to be derived from the crawled resource's
     content-type, i.e. from external input; confirm against the index stylesheet. -->
<p:processor name="oxf:url-generator">
    <p:input name="config" transform="oxf:xslt" href="#index-entry">
        <config xsl:version="2.0">
            <url>
                <xsl:text>oxf:/actions/mediatypes/warc-</xsl:text>
                <xsl:value-of select="replace(/resource/type[1], '[^A-Za-z0-9+._\-]', '')"/>
                <xsl:text>.xpl</xsl:text>
            </url>
        </config>
    </p:input>
    <p:output name="data" id="pipeline"/>
</p:processor>
<!-- Runs the pipeline document read into "pipeline" as a sub-pipeline. It receives
     current() as "record" together with the "index" and "index-entry" documents, and
     its "rewritten" output becomes the "document" id of this branch. -->
<p:processor name="oxf:pipeline">
<p:input name="config" href="#pipeline"/>
<p:input name="record" href="current()"/>
<p:input name="index" href="#index"/>
<p:input name="index-entry" href="#index-entry"/>
<p:output name="rewritten" id="document" debug="rewritten"/>
</p:processor>
</p:when>
<p:otherwise>
<!-- The resource can be stored -->
@ -123,10 +142,6 @@
</p:processor>
</p:for-each>
<p:processor name="oxf:null-serializer">
<p:input name="data" href="#loop" debug="loop"/>
</p:processor>
<!-- Store the WARC in a temp file -->
@ -141,30 +156,16 @@
</p:processor>
<p:processor name="oxf:zip">
<p:input name="data" transform="oxf:unsafe-xslt" href="aggregate('root', #warc-location)">
<p:input name="data" transform="oxf:unsafe-xslt" href="aggregate('root', #warc-location, #loop)">
<files xsl:version="2.0" file-name="archive.zip">
<file name="archive.warc">
<xsl:value-of select="/root/url"/>
</file>
<!--<xsl:for-each select="/root/files/file[url]">
<xsl:choose>
<xsl:when test="position()=1">
<!-\- TODO: support non HTML documents... -\->
<file name="rewritten/index.html">
<xsl:value-of select="url"/>
</file>
</xsl:when>
<xsl:otherwise>
<xsl:variable name="tokens" select="tokenize(archive/@url, '/')"/>
<xsl:variable name="last-token" select="$tokens[last()]"/>
<xsl:variable name="tokens2" select="tokenize($last-token, '\.')"/>
<xsl:variable name="extension" select="$tokens2[last()]"/>
<file name="rewritten/{saxon:string-to-hexBinary(substring(archive/@url, 1, string-length(archive/@url) - string-length($extension) - 1), 'utf-8')}.{$extension}">
<xsl:value-of select="url"/>
</file>
</xsl:otherwise>
</xsl:choose>
</xsl:for-each>-->
<xsl:for-each select="/root/root/doc">
<file name="rewritten/{resource/local-name}">
<xsl:value-of select="url"/>
</file>
</xsl:for-each>
</files>
</p:input>
<p:output name="data" id="zip"/>

View File

@ -58,6 +58,16 @@
<local-name>
<xsl:value-of select="owk:unique-local-name(.)"/>
</local-name>
<type>
    <!-- Short type name derived from the media type: html for HTML and XHTML,
         text for plain text, otherwise the subtype (the part after "/").
         The media type is normalized first (parameters such as "; charset=utf-8"
         dropped, whitespace trimmed, lower-cased) so that the comparisons below
         and the resulting name do not depend on how the content-type was written. -->
    <xsl:variable name="media-type"
                  select="lower-case(normalize-space(substring-before(concat(content-type[1], ';'), ';')))"/>
    <xsl:choose>
        <xsl:when test="$media-type = ('text/html', 'application/xhtml+xml')">html</xsl:when>
        <xsl:when test="$media-type = 'text/plain'">text</xsl:when>
        <xsl:otherwise>
            <xsl:value-of select="substring-after($media-type, '/')"/>
        </xsl:otherwise>
    </xsl:choose>
</type>
<xsl:apply-templates select="." mode="redirect"/>
<xsl:apply-templates select="/log/entry[referer = current()/uri and ends-with(discovery-path, 'E')]" mode="embedding"/>
</resource>