From 675ed04abace92a528304466a1861b74947b92db Mon Sep 17 00:00:00 2001 From: Eric van der Vlist Date: Thu, 26 Apr 2012 17:08:28 +0200 Subject: [PATCH] Download and convert the crawl log --- .../pipelines/actions/get-heritrix-warc.xpl | 7 ++- .../actions/package-heritrix-warc.xpl | 53 +++++++++++++++---- archiver/pipelines/actions/parse-log.xslt | 51 ++++++++++++++++++ 3 files changed, 99 insertions(+), 12 deletions(-) create mode 100644 archiver/pipelines/actions/parse-log.xslt diff --git a/archiver/pipelines/actions/get-heritrix-warc.xpl b/archiver/pipelines/actions/get-heritrix-warc.xpl index e28089d..a44d344 100644 --- a/archiver/pipelines/actions/get-heritrix-warc.xpl +++ b/archiver/pipelines/actions/get-heritrix-warc.xpl @@ -55,7 +55,7 @@ - + queue.xml write @@ -78,6 +78,9 @@ + + + @@ -86,7 +89,7 @@ declare namespace util = "http://exist-db.org/xquery/util"; for $q in /queue return update - insert + insert into $q, for $a in /queue/action where $a/@uuid = $(uuid) return diff --git a/archiver/pipelines/actions/package-heritrix-warc.xpl b/archiver/pipelines/actions/package-heritrix-warc.xpl index b9a11e4..51dbea7 100644 --- a/archiver/pipelines/actions/package-heritrix-warc.xpl +++ b/archiver/pipelines/actions/package-heritrix-warc.xpl @@ -34,16 +34,49 @@ - - - - + + + + - - + + + + + + + + + text + + + + + + + + false + + + + + + + + + + + + + + + + + + @@ -83,7 +116,7 @@ - + @@ -91,10 +124,10 @@ - + - -