From 9bce34f7c6debb03e9419e0145f114eb4817a493 Mon Sep 17 00:00:00 2001 From: Eric van der Vlist Date: Fri, 27 Apr 2012 18:29:15 +0200 Subject: [PATCH] Rewriting links in HTML and CSS resources within WARC archives --- .../pipelines/actions/mediatypes/warc-css.xpl | 68 ++++++++++++++++ .../actions/mediatypes/warc-html.xpl | 80 +++++++++++++++++++ .../actions/package-heritrix-warc.xpl | 57 ++++++------- .../pipelines/actions/resource-index.xslt | 10 +++ 4 files changed, 187 insertions(+), 28 deletions(-) create mode 100644 archiver/pipelines/actions/mediatypes/warc-css.xpl create mode 100644 archiver/pipelines/actions/mediatypes/warc-html.xpl diff --git a/archiver/pipelines/actions/mediatypes/warc-css.xpl b/archiver/pipelines/actions/mediatypes/warc-css.xpl new file mode 100644 index 0000000..dde3f50 --- /dev/null +++ b/archiver/pipelines/actions/mediatypes/warc-css.xpl @@ -0,0 +1,68 @@ + + + + + + + + + + + + + session + + + + + + + + + + + + + + text/css + text + + + + + + + + + + + + + + + + + + + + + + url( + + + ) + + + + + + + + + + + + + + + diff --git a/archiver/pipelines/actions/mediatypes/warc-html.xpl b/archiver/pipelines/actions/mediatypes/warc-html.xpl new file mode 100644 index 0000000..7d30e57 --- /dev/null +++ b/archiver/pipelines/actions/mediatypes/warc-html.xpl @@ -0,0 +1,80 @@ + + + + + + + + + + + + + session + + + + + + + + + + + + + + html + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + application/xml + utf-8 + 1.0 + + + + + + + diff --git a/archiver/pipelines/actions/package-heritrix-warc.xpl b/archiver/pipelines/actions/package-heritrix-warc.xpl index e999dad..194cd3d 100644 --- a/archiver/pipelines/actions/package-heritrix-warc.xpl +++ b/archiver/pipelines/actions/package-heritrix-warc.xpl @@ -93,12 +93,31 @@ - + - - - + + + + + + + oxf:/actions/mediatypes/warc- + + .xpl + + + + + + + + + + + + + @@ -123,10 +142,6 @@ - - - - @@ -141,30 +156,16 @@ - + - + + + + + diff --git a/archiver/pipelines/actions/resource-index.xslt b/archiver/pipelines/actions/resource-index.xslt index a6dadc9..82d7036 100644 --- a/archiver/pipelines/actions/resource-index.xslt +++ b/archiver/pipelines/actions/resource-index.xslt @@ -58,6 +58,16 @@ + + + html + html + text + + + + +