Adding a mechanism to avoid archiving the same resource multiple times for a single archive set.

This commit is contained in:
Eric van der Vlist 2012-04-13 13:05:25 +02:00
parent cf97a98416
commit 3d18e9d8a4
1 changed files with 234 additions and 169 deletions

View File

@ -2,161 +2,222 @@
<p:param name="data" type="input"/> <p:param name="data" type="input"/>
<!-- Fetch the resource --> <!-- Check whether the resource has already been archived for that set -->
<p:processor name="oxf:url-generator">
<p:input name="config" transform="oxf:xslt" href="#data">
<config xsl:version="2.0">
<url>
<xsl:value-of select="/action/@url"/>
</url>
<header>
<name>User-Agent</name>
<value>
<xsl:value-of select="doc('oxf:/config.xml')/config/user-agent"/>
</value>
</header>
<mode>archive</mode>
</config>
</p:input>
<p:output name="data" id="archive" debug="archive"/>
</p:processor>
<!-- Store the archive in the database -->
<p:processor name="oxf:pipeline"> <p:processor name="oxf:pipeline">
<p:input name="config" href="/data-access.xpl"/> <p:input name="config" href="/data-access.xpl"/>
<p:input name="data" transform="oxf:xslt" href="#data"> <p:input name="data" transform="oxf:xslt" href="#data">
<config xsl:version="2.0"> <config xsl:version="2.0">
<relpath> <relpath>
<xsl:value-of select="/action/@directory"/> <xsl:value-of select="/action/@directory"/>
<xsl:value-of select="/action/@filename"/> <xsl:text>index.xml</xsl:text>
</relpath> </relpath>
<operation>write</operation> <operation>read</operation>
<type>document</type> <type>xquery</type>
<parameter name="url" type="string">
<xsl:value-of select="/action/@url"/>
</parameter>
</config> </config>
</p:input> </p:input>
<p:input name="param" href="#archive"/> <p:input name="param">
<p:output name="data" id="response2"/> <xquery><![CDATA[
boolean(//archive[@url = $(url)])
]]></xquery>
</p:input>
<p:output name="data" id="duplicate" debug="duplicate"/>
</p:processor> </p:processor>
<p:processor name="oxf:null-serializer"> <p:choose href="#duplicate">
<p:input name="data" href="#response2"/>
</p:processor>
<p:when test="/*/* = 'true'">
<!-- Test the type of document to see if it needs to be rewritten --> <!-- Already archived, nothing to do -->
<p:choose href="#archive"> <!-- Update the queue -->
<p:processor name="oxf:pipeline">
<!-- HTML document : need to update the links... --> <p:input name="config" href="/data-access.xpl"/>
<p:when test="/archive/response/document/@content-type=('text/html', 'text/css')"> <p:input name="data" transform="oxf:xslt" href="#data">
<!-- Call the corresponding pipeline to extract the links and rewrite them -->
<p:processor name="oxf:url-generator">
<p:input name="config" transform="oxf:xslt" href="#archive">
<config xsl:version="2.0"> <config xsl:version="2.0">
<url> <relpath>queue.xml</relpath>
<xsl:text>oxf:/actions/mediatypes/</xsl:text> <operation>write</operation>
<xsl:value-of select="substring-after(/archive/response/document/@content-type, '/')"/> <type>xquery</type>
<xsl:text>.xpl</xsl:text> <parameter name="uuid" type="string">
</url> <xsl:value-of select="/action/@uuid"/>
</parameter>
</config> </config>
</p:input> </p:input>
<p:output name="data" id="pipeline"/> <p:input name="param">
<xquery><![CDATA[
for $a in /queue/action where $a/@uuid = $(uuid) return
update
delete $a
]]></xquery>
</p:input>
<p:output name="data" id="response4" debug="response"/>
</p:processor> </p:processor>
<p:processor name="oxf:null-serializer">
<p:input name="data" href="#response4"/>
</p:processor>
</p:when>
<p:processor name="oxf:pipeline"> <p:otherwise>
<p:input name="config" href="#pipeline"/> <!-- Otherwise, archive the resource... -->
<p:input name="archive" href="#archive"/> <!-- Fetch the resource -->
<p:output name="rewritten" id="rewritten"/> <p:processor name="oxf:url-generator">
<p:output name="links" id="links"/> <p:input name="config" transform="oxf:xslt" href="#data">
<config xsl:version="2.0">
<url>
<xsl:value-of select="/action/@url"/>
</url>
<header>
<name>User-Agent</name>
<value>
<xsl:value-of select="doc('oxf:/config.xml')/config/user-agent"/>
</value>
</header>
<mode>archive</mode>
</config>
</p:input>
<p:output name="data" id="archive" debug="archive"/>
</p:processor> </p:processor>
<!-- Store the rewritten document in the database --> <!-- Store the archive in the database -->
<p:processor name="oxf:pipeline"> <p:processor name="oxf:pipeline">
<p:input name="config" href="/data-access.xpl"/> <p:input name="config" href="/data-access.xpl"/>
<p:input name="data" transform="oxf:xslt" href="#data"> <p:input name="data" transform="oxf:xslt" href="#data">
<config xsl:version="2.0"> <config xsl:version="2.0">
<relpath> <relpath>
<xsl:value-of select="/action/@directory"/> <xsl:value-of select="/action/@directory"/>
<xsl:text>rewritten-</xsl:text>
<xsl:value-of select="/action/@filename"/> <xsl:value-of select="/action/@filename"/>
</relpath> </relpath>
<operation>write</operation> <operation>write</operation>
<type>document</type> <type>document</type>
</config> </config>
</p:input> </p:input>
<p:input name="param" href="#rewritten"/> <p:input name="param" href="#archive"/>
<p:output name="data" id="response3"/> <p:output name="data" id="response2"/>
</p:processor> </p:processor>
<p:processor name="oxf:null-serializer"> <p:processor name="oxf:null-serializer">
<p:input name="data" href="#response3"/> <p:input name="data" href="#response2"/>
</p:processor> </p:processor>
<!-- Test the type of document to see if it needs to be rewritten -->
<p:choose href="#archive">
<!-- Update the archive index --> <!-- HTML document : need to update the links... -->
<p:processor name="oxf:pipeline"> <p:when test="/archive/response/document/@content-type=('text/html', 'text/css')">
<p:input name="config" href="/data-access.xpl"/>
<p:input name="data" transform="oxf:xslt" href="#data"> <!-- Call the corresponding pipeline to extract the links and rewrite them -->
<config xsl:version="2.0"> <p:processor name="oxf:url-generator">
<relpath> <p:input name="config" transform="oxf:xslt" href="#archive">
<xsl:value-of select="/action/@directory"/> <config xsl:version="2.0">
<xsl:text>index.xml</xsl:text> <url>
</relpath> <xsl:text>oxf:/actions/mediatypes/</xsl:text>
<operation>write</operation> <xsl:value-of select="substring-after(/archive/response/document/@content-type, '/')"/>
<type>xquery</type> <xsl:text>.xpl</xsl:text>
<parameter name="url" type="string"> </url>
<xsl:value-of select="/action/@url"/> </config>
</parameter> </p:input>
<parameter name="filename" type="string"> <p:output name="data" id="pipeline"/>
<xsl:value-of select="/action/@filename"/> </p:processor>
</parameter>
<parameter name="filename-rewritten" type="string"> <p:processor name="oxf:pipeline">
<xsl:text>rewritten-</xsl:text> <p:input name="config" href="#pipeline"/>
<xsl:value-of select="/action/@filename"/> <p:input name="archive" href="#archive"/>
</parameter> <p:output name="rewritten" id="rewritten"/>
</config> <p:output name="links" id="links"/>
</p:input> </p:processor>
<p:input name="param">
<xquery><![CDATA[
<!-- Store the rewritten document in the database -->
<p:processor name="oxf:pipeline">
<p:input name="config" href="/data-access.xpl"/>
<p:input name="data" transform="oxf:xslt" href="#data">
<config xsl:version="2.0">
<relpath>
<xsl:value-of select="/action/@directory"/>
<xsl:text>rewritten-</xsl:text>
<xsl:value-of select="/action/@filename"/>
</relpath>
<operation>write</operation>
<type>document</type>
</config>
</p:input>
<p:input name="param" href="#rewritten"/>
<p:output name="data" id="response3"/>
</p:processor>
<p:processor name="oxf:null-serializer">
<p:input name="data" href="#response3"/>
</p:processor>
<!-- Update the archive index -->
<p:processor name="oxf:pipeline">
<p:input name="config" href="/data-access.xpl"/>
<p:input name="data" transform="oxf:xslt" href="#data">
<config xsl:version="2.0">
<relpath>
<xsl:value-of select="/action/@directory"/>
<xsl:text>index.xml</xsl:text>
</relpath>
<operation>write</operation>
<type>xquery</type>
<parameter name="url" type="string">
<xsl:value-of select="/action/@url"/>
</parameter>
<parameter name="filename" type="string">
<xsl:value-of select="/action/@filename"/>
</parameter>
<parameter name="filename-rewritten" type="string">
<xsl:text>rewritten-</xsl:text>
<xsl:value-of select="/action/@filename"/>
</parameter>
</config>
</p:input>
<p:input name="param">
<xquery><![CDATA[
for $as in /archive-set for $as in /archive-set
return return
update update
insert <archive url=$(url) href=$(filename) href-rewritten=$(filename-rewritten) dateTime="{current-dateTime()}"/> insert <archive url=$(url) href=$(filename) href-rewritten=$(filename-rewritten) dateTime="{current-dateTime()}"/>
into $as into $as
]]></xquery> ]]></xquery>
</p:input> </p:input>
<p:output name="data" id="response1"/> <p:output name="data" id="response1"/>
</p:processor> </p:processor>
<p:processor name="oxf:null-serializer"> <p:processor name="oxf:null-serializer">
<p:input name="data" href="#response1"/> <p:input name="data" href="#response1"/>
</p:processor> </p:processor>
<!-- Update the queue --> <!-- Update the queue -->
<p:processor name="oxf:pipeline"> <p:processor name="oxf:pipeline">
<p:input name="config" href="/data-access.xpl"/> <p:input name="config" href="/data-access.xpl"/>
<p:input name="data" transform="oxf:xslt" href="aggregate('root', #data, #links)"> <p:input name="data" transform="oxf:xslt" href="aggregate('root', #data, #links)">
<config xsl:version="2.0"> <config xsl:version="2.0">
<relpath>queue.xml</relpath> <relpath>queue.xml</relpath>
<operation>write</operation> <operation>write</operation>
<type>xquery</type> <type>xquery</type>
<parameter name="directory" type="string"> <parameter name="directory" type="string">
<xsl:value-of select="/root/action/@directory"/> <xsl:value-of select="/root/action/@directory"/>
</parameter> </parameter>
<parameter name="uuid" type="string"> <parameter name="uuid" type="string">
<xsl:value-of select="/root/action/@uuid"/> <xsl:value-of select="/root/action/@uuid"/>
</parameter> </parameter>
<parameter name="priority" type="string"> <parameter name="priority" type="string">
<xsl:value-of select="/root/action/@priority"/> <xsl:value-of select="/root/action/@priority"/>
</parameter> </parameter>
<parameter name="links" type="node-set"> <parameter name="links" type="node-set">
<xsl:copy-of select="/root/links"/> <xsl:copy-of select="/root/links"/>
</parameter> </parameter>
</config> </config>
</p:input> </p:input>
<p:input name="param"> <p:input name="param">
<xquery><![CDATA[ <xquery><![CDATA[
declare namespace util = "http://exist-db.org/xquery/util"; declare namespace util = "http://exist-db.org/xquery/util";
declare variable $links := $(links); declare variable $links := $(links);
@ -174,82 +235,86 @@ for $a in /queue/action where $a/@uuid = $(uuid) return
delete $a delete $a
]]></xquery> ]]></xquery>
</p:input> </p:input>
<p:output name="data" id="response4" debug="response"/> <p:output name="data" id="response4" debug="response"/>
</p:processor> </p:processor>
<p:processor name="oxf:null-serializer"> <p:processor name="oxf:null-serializer">
<p:input name="data" href="#response4"/> <p:input name="data" href="#response4"/>
</p:processor> </p:processor>
</p:when> </p:when>
<!-- Otherwise: no need to rewrite --> <!-- Otherwise: no need to rewrite -->
<p:otherwise> <p:otherwise>
<!-- Update the archive index --> <!-- Update the archive index -->
<p:processor name="oxf:pipeline"> <p:processor name="oxf:pipeline">
<p:input name="config" href="/data-access.xpl"/> <p:input name="config" href="/data-access.xpl"/>
<p:input name="data" transform="oxf:xslt" href="#data"> <p:input name="data" transform="oxf:xslt" href="#data">
<config xsl:version="2.0"> <config xsl:version="2.0">
<relpath> <relpath>
<xsl:value-of select="/action/@directory"/> <xsl:value-of select="/action/@directory"/>
<xsl:text>index.xml</xsl:text> <xsl:text>index.xml</xsl:text>
</relpath> </relpath>
<operation>write</operation> <operation>write</operation>
<type>xquery</type> <type>xquery</type>
<parameter name="url" type="string"> <parameter name="url" type="string">
<xsl:value-of select="/action/@url"/> <xsl:value-of select="/action/@url"/>
</parameter> </parameter>
<parameter name="filename" type="string"> <parameter name="filename" type="string">
<xsl:value-of select="/action/@filename"/> <xsl:value-of select="/action/@filename"/>
</parameter> </parameter>
</config> </config>
</p:input> </p:input>
<p:input name="param"> <p:input name="param">
<xquery><![CDATA[ <xquery><![CDATA[
for $as in /archive-set for $as in /archive-set
return return
update update
insert <archive url=$(url) href=$(filename) dateTime="{current-dateTime()}"/> insert <archive url=$(url) href=$(filename) dateTime="{current-dateTime()}"/>
into $as into $as
]]></xquery> ]]></xquery>
</p:input> </p:input>
<p:output name="data" id="response1"/> <p:output name="data" id="response1"/>
</p:processor> </p:processor>
<p:processor name="oxf:null-serializer"> <p:processor name="oxf:null-serializer">
<p:input name="data" href="#response1"/> <p:input name="data" href="#response1"/>
</p:processor> </p:processor>
<!-- Update the queue -->
<p:processor name="oxf:pipeline">
<p:input name="config" href="/data-access.xpl"/>
<p:input name="data" transform="oxf:xslt" href="#data">
<config xsl:version="2.0">
<relpath>queue.xml</relpath>
<operation>write</operation>
<type>xquery</type>
<parameter name="uuid" type="string">
<xsl:value-of select="/action/@uuid"/>
</parameter>
</config>
</p:input>
<p:input name="param">
<xquery><![CDATA[
<!-- Update the queue -->
<p:processor name="oxf:pipeline">
<p:input name="config" href="/data-access.xpl"/>
<p:input name="data" transform="oxf:xslt" href="#data">
<config xsl:version="2.0">
<relpath>queue.xml</relpath>
<operation>write</operation>
<type>xquery</type>
<parameter name="uuid" type="string">
<xsl:value-of select="/action/@uuid"/>
</parameter>
</config>
</p:input>
<p:input name="param">
<xquery><![CDATA[
for $a in /queue/action where $a/@uuid = $(uuid) return for $a in /queue/action where $a/@uuid = $(uuid) return
update update
delete $a delete $a
]]></xquery> ]]></xquery>
</p:input> </p:input>
<p:output name="data" id="response4" debug="response"/> <p:output name="data" id="response4" debug="response"/>
</p:processor> </p:processor>
<p:processor name="oxf:null-serializer"> <p:processor name="oxf:null-serializer">
<p:input name="data" href="#response4"/> <p:input name="data" href="#response4"/>
</p:processor> </p:processor>
</p:otherwise>
</p:choose>
</p:otherwise> </p:otherwise>
</p:choose> </p:choose>
</p:config> </p:config>