Rewriting links in HTML and CSS resources within WARC archives
This commit is contained in:
parent
5b162a64df
commit
9bce34f7c6
|
@ -0,0 +1,68 @@
|
|||
<p:config xmlns:p="http://www.orbeon.com/oxf/pipeline" xmlns:oxf="http://www.orbeon.com/oxf/processors" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:saxon="http://saxon.sf.net/">
|
||||
|
||||
<p:param name="record" type="input"/>
|
||||
<p:param name="index-entry" type="input"/>
|
||||
<p:param name="index" type="input"/>
|
||||
<p:param name="rewritten" type="output"/>
|
||||
|
||||
|
||||
<!--
    Store the document carried by the WARC record.
    The serializer is configured with session scope only; its "data" output
    (id "url-written") is read by the next step as the URL to load.
-->
<p:processor name="oxf:file-serializer">
    <p:input name="config">
        <config>
            <scope>session</scope>
        </config>
    </p:input>
    <p:input name="data" href="#record#xpointer(/record/content/document)"/>
    <p:output name="data" id="url-written"/>
</p:processor>
|
||||
|
||||
<!--
    Read the stored document back as CSS text.
    The generator configuration is computed by an inline XSLT over
    "url-written": the string value of its root element becomes the URL,
    and the content is loaded in text mode with a text/css content type.
-->
<p:processor name="oxf:url-generator">
    <p:input name="config" transform="oxf:xslt" href="#url-written">
        <config xsl:version="2.0">
            <url><xsl:value-of select="/*"/></url>
            <content-type>text/css</content-type>
            <mode>text</mode>
        </config>
    </p:input>
    <p:output name="data" id="css" debug="css"/>
</p:processor>
|
||||
|
||||
|
||||
<!--
    Update the links: rewrite every url(...) reference found in the CSS text.

    Inputs
      data         the CSS text wrapped in a "document" root element
      index-entry  the index entry of this resource; its "uri" child is the
                   base URI used to resolve relative references
      index        the full index; a "resource" whose "uri" or "same-as"
                   equals the resolved reference supplies the "local-name"
                   written in its place
    Output
      rewritten    the "document" element with rewritten url(...) references
-->
<p:processor name="oxf:unsafe-xslt">
    <p:input name="data" href="#css"/>
    <p:input name="index-entry" href="#index-entry"/>
    <p:input name="index" href="#index"/>
    <p:input name="config">
        <xsl:stylesheet version="2.0">
            <xsl:variable name="index" select="doc('input:index')/*"/>
            <xsl:variable name="resource" select="doc('input:index-entry')/resource"/>
            <xsl:variable name="base" select="$resource/uri"/>
            <xsl:template match="/document">
                <xsl:copy>
                    <xsl:copy-of select="@*"/>
                    <!-- The double quotes of the pattern are written as &quot; so that the attribute stays well-formed. -->
                    <xsl:analyze-string select="." regex="url\([&quot;']?([^)'&quot;]+)[&quot;']?\)" flags="">
                        <xsl:matching-substring>
                            <!-- Absolute form of the reference, fragment included. -->
                            <xsl:variable name="target" select="resolve-uri(regex-group(1), $base)"/>
                            <!-- Same URI without its fragment: this is the key looked up in the index. -->
                            <xsl:variable name="abs" select="substring-before(concat($target, '#'), '#')"/>
                            <!-- First matching entry only, so duplicates cannot yield several names. -->
                            <xsl:variable name="local" select="($index/resource[(uri, same-as) = $abs]/local-name)[1]"/>
                            <xsl:text>url(</xsl:text>
                            <!-- Fall back to the absolute URI when the target is not in the index
                                 (e.g. data: URIs or resources that were not captured) instead of emitting an empty url(). -->
                            <xsl:value-of select="if (string($local) != '') then $local else $target"/>
                            <xsl:text>)</xsl:text>
                        </xsl:matching-substring>
                        <xsl:non-matching-substring>
                            <xsl:copy-of select="."/>
                        </xsl:non-matching-substring>
                    </xsl:analyze-string>
                </xsl:copy>
            </xsl:template>
        </xsl:stylesheet>
    </p:input>
    <p:output name="data" ref="rewritten" debug="rewritten"/>
</p:processor>
|
||||
|
||||
|
||||
|
||||
</p:config>
|
|
@ -0,0 +1,80 @@
|
|||
<p:config xmlns:p="http://www.orbeon.com/oxf/pipeline" xmlns:oxf="http://www.orbeon.com/oxf/processors" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:saxon="http://saxon.sf.net/">
|
||||
|
||||
<p:param name="record" type="input"/>
|
||||
<p:param name="index-entry" type="input"/>
|
||||
<p:param name="index" type="input"/>
|
||||
<p:param name="rewritten" type="output"/>
|
||||
|
||||
|
||||
<!--
    Store the document carried by the WARC record.
    The serializer is configured with session scope only; its "data" output
    (id "url-written") is read by the next step as the URL to load.
-->
<p:processor name="oxf:file-serializer">
    <p:input name="config">
        <config>
            <scope>session</scope>
        </config>
    </p:input>
    <p:input name="data" href="#record#xpointer(/record/content/document)"/>
    <p:output name="data" id="url-written"/>
</p:processor>
|
||||
|
||||
<!--
    Read the stored document back in HTML mode.
    The generator configuration is computed by an inline XSLT over
    "url-written": the string value of its root element becomes the URL.
-->
<p:processor name="oxf:url-generator">
    <p:input name="config" transform="oxf:xslt" href="#url-written">
        <config xsl:version="2.0">
            <url><xsl:value-of select="/*"/></url>
            <mode>html</mode>
        </config>
    </p:input>
    <p:output name="data" id="html" debug="html"/>
</p:processor>
|
||||
|
||||
<!--
    Update the links of the parsed HTML document.

    Inputs
      data         the parsed HTML tree
      index-entry  the index entry of this resource; its "uri" child is the
                   base URI used to resolve relative references
      index        the full index; a "resource" whose "uri" or "same-as"
                   equals the resolved reference supplies the "local-name"
                   written in its place
    Output
      html-rewritten  the HTML tree with rewritten href/src attributes
-->
<!-- TODO: support links in inline CSS -->
<!-- TODO: support iframes and objects -->
<p:processor name="oxf:unsafe-xslt">
    <p:input name="data" href="#html"/>
    <p:input name="index-entry" href="#index-entry"/>
    <p:input name="index" href="#index"/>
    <p:input name="config">
        <xsl:stylesheet version="2.0">
            <xsl:variable name="index" select="doc('input:index')/*"/>
            <xsl:variable name="resource" select="doc('input:index-entry')/resource"/>
            <xsl:variable name="base" select="$resource/uri"/>

            <!-- Identity template: everything not handled below is copied unchanged. -->
            <xsl:template match="@*|node()">
                <xsl:copy>
                    <xsl:apply-templates select="@*|node()"/>
                </xsl:copy>
            </xsl:template>

            <!-- Embedded resources (stylesheets, images, scripts): point to the local copy. -->
            <xsl:template match="link[@rel='stylesheet']/@href|img/@src|script/@src">
                <!-- Absolute form of the reference, fragment included. -->
                <xsl:variable name="target" select="resolve-uri(., $base)"/>
                <!-- Same URI without its fragment: this is the key looked up in the index. -->
                <xsl:variable name="abs" select="substring-before(concat($target, '#'), '#')"/>
                <!-- First matching entry only, so duplicates cannot yield several names. -->
                <xsl:variable name="local" select="($index/resource[(uri, same-as) = $abs]/local-name)[1]"/>
                <xsl:attribute name="{name(.)}">
                    <!-- Fall back to the absolute URI when the target is not in the index,
                         rather than writing an empty attribute value. -->
                    <xsl:value-of select="if (string($local) != '') then $local else $target"/>
                </xsl:attribute>
            </xsl:template>

            <!-- Other links: make them absolute.
                 not(@rel='stylesheet') also covers link elements that carry no rel attribute,
                 which a plain @rel!='stylesheet' comparison would never match. -->
            <xsl:template match="link[not(@rel='stylesheet')]/@href|a/@href">
                <xsl:attribute name="{name(.)}">
                    <xsl:value-of select="resolve-uri(., $base)"/>
                </xsl:attribute>
            </xsl:template>
        </xsl:stylesheet>
    </p:input>
    <p:output name="data" id="html-rewritten" debug="rewritten"/>
</p:processor>
|
||||
|
||||
<!--
    Convert the rewritten HTML tree with the XML converter
    (application/xml, utf-8, XML 1.0) and expose the result as the
    "rewritten" output of this pipeline.
-->
<p:processor name="oxf:xml-converter">
    <p:input name="config">
        <config>
            <content-type>application/xml</content-type>
            <encoding>utf-8</encoding>
            <version>1.0</version>
        </config>
    </p:input>
    <p:input name="data" href="#html-rewritten"/>
    <p:output name="data" ref="rewritten"/>
</p:processor>
|
||||
|
||||
</p:config>
|
|
@ -93,12 +93,31 @@
|
|||
<p:output name="data" id="index-entry" debug="index-entry"/>
|
||||
</p:processor>
|
||||
<p:choose href="#index-entry">
|
||||
<p:when test="/entry/embeds">
|
||||
<p:when test="/resource/embeds">
|
||||
<!-- The resource has embedded content and must be rewritten -->
|
||||
<p:processor name="oxf:identity">
|
||||
<p:input name="data" href="current()#xpointer(/record/content/document)"/>
|
||||
<p:output name="data" id="document"/>
|
||||
|
||||
<!--
    Call the corresponding pipeline.
    The URL of the pipeline to load is computed from the index entry:
    oxf:/actions/mediatypes/warc- followed by the value of /resource/type
    and the .xpl extension. The loaded document is exposed as "pipeline".
-->
<p:processor name="oxf:url-generator">
    <p:input name="config" transform="oxf:xslt" href="#index-entry">
        <config xsl:version="2.0">
            <url>
                <xsl:text>oxf:/actions/mediatypes/warc-</xsl:text>
                <xsl:value-of select="/resource/type"/>
                <xsl:text>.xpl</xsl:text>
            </url>
        </config>
    </p:input>
    <p:output name="data" id="pipeline"/>
</p:processor>
|
||||
|
||||
<!--
    Run the pipeline loaded above, passing it the current record, the index
    and the index entry; its "rewritten" output becomes "document".
-->
<p:processor name="oxf:pipeline">
    <p:input name="config" href="#pipeline"/>
    <p:input name="record" href="current()"/>
    <p:input name="index" href="#index"/>
    <p:input name="index-entry" href="#index-entry"/>
    <p:output name="rewritten" id="document" debug="rewritten"/>
</p:processor>
|
||||
|
||||
</p:when>
|
||||
<p:otherwise>
|
||||
<!-- The resource can be stored -->
|
||||
|
@ -123,10 +142,6 @@
|
|||
</p:processor>
|
||||
</p:for-each>
|
||||
|
||||
<p:processor name="oxf:null-serializer">
|
||||
<p:input name="data" href="#loop" debug="loop"/>
|
||||
</p:processor>
|
||||
|
||||
|
||||
|
||||
<!-- Store the WARC in a temp file -->
|
||||
|
@ -141,30 +156,16 @@
|
|||
</p:processor>
|
||||
|
||||
<p:processor name="oxf:zip">
|
||||
<p:input name="data" transform="oxf:unsafe-xslt" href="aggregate('root', #warc-location)">
|
||||
<p:input name="data" transform="oxf:unsafe-xslt" href="aggregate('root', #warc-location, #loop)">
|
||||
<files xsl:version="2.0" file-name="archive.zip">
|
||||
<file name="archive.warc">
|
||||
<xsl:value-of select="/root/url"/>
|
||||
</file>
|
||||
<!--<xsl:for-each select="/root/files/file[url]">
|
||||
<xsl:choose>
|
||||
<xsl:when test="position()=1">
|
||||
<!-\- TODO: support non HTML documents... -\->
|
||||
<file name="rewritten/index.html">
|
||||
<xsl:for-each select="/root/root/doc">
|
||||
<file name="rewritten/{resource/local-name}">
|
||||
<xsl:value-of select="url"/>
|
||||
</file>
|
||||
</xsl:when>
|
||||
<xsl:otherwise>
|
||||
<xsl:variable name="tokens" select="tokenize(archive/@url, '/')"/>
|
||||
<xsl:variable name="last-token" select="$tokens[last()]"/>
|
||||
<xsl:variable name="tokens2" select="tokenize($last-token, '\.')"/>
|
||||
<xsl:variable name="extension" select="$tokens2[last()]"/>
|
||||
<file name="rewritten/{saxon:string-to-hexBinary(substring(archive/@url, 1, string-length(archive/@url) - string-length($extension) - 1), 'utf-8')}.{$extension}">
|
||||
<xsl:value-of select="url"/>
|
||||
</file>
|
||||
</xsl:otherwise>
|
||||
</xsl:choose>
|
||||
</xsl:for-each>-->
|
||||
</xsl:for-each>
|
||||
</files>
|
||||
</p:input>
|
||||
<p:output name="data" id="zip"/>
|
||||
|
|
|
@ -58,6 +58,16 @@
|
|||
<local-name>
|
||||
<xsl:value-of select="owk:unique-local-name(.)"/>
|
||||
</local-name>
|
||||
<!--
    Short type name derived from the content type of the resource:
    "html" for text/html and application/xhtml+xml, "text" for text/plain,
    otherwise the subtype (the part after the slash).
    The content type is normalised first: parameters such as "; charset=utf-8"
    are dropped and the value is lower-cased, so that "text/html; charset=utf-8"
    or "Text/HTML" are still classified as "html".
-->
<type>
    <xsl:variable name="media-type" select="lower-case(normalize-space(substring-before(concat(content-type[1], ';'), ';')))"/>
    <xsl:choose>
        <xsl:when test="$media-type = 'text/html'">html</xsl:when>
        <xsl:when test="$media-type = 'application/xhtml+xml'">html</xsl:when>
        <xsl:when test="$media-type = 'text/plain'">text</xsl:when>
        <xsl:otherwise>
            <xsl:value-of select="substring-after($media-type, '/')"/>
        </xsl:otherwise>
    </xsl:choose>
</type>
|
||||
<xsl:apply-templates select="." mode="redirect"/>
|
||||
<xsl:apply-templates select="/log/entry[referer = current()/uri and ends-with(discovery-path, 'E')]" mode="embedding"/>
|
||||
</resource>
|
||||
|
|
Loading…
Reference in New Issue