From 6f64c7f8a97f959f8389960c466fe54245ea83b9 Mon Sep 17 00:00:00 2001 From: Eric van der Vlist Date: Thu, 26 Apr 2012 14:13:24 +0200 Subject: [PATCH] Handling payload content types --- .../org/owark/orbeon/FromWarcConverter.java | 29 ++++++++++++++ .../src/org/owark/warc/WarcRecordContent.java | 40 ++++++++++++++++++- .../test/org/owark/warc/WarcParserTest.java | 15 +++++-- 3 files changed, 80 insertions(+), 4 deletions(-) diff --git a/archiver/java/src/org/owark/orbeon/FromWarcConverter.java b/archiver/java/src/org/owark/orbeon/FromWarcConverter.java index ef3b672..493d98b 100644 --- a/archiver/java/src/org/owark/orbeon/FromWarcConverter.java +++ b/archiver/java/src/org/owark/orbeon/FromWarcConverter.java @@ -19,12 +19,19 @@ import org.orbeon.oxf.pipeline.api.XMLReceiver; import org.orbeon.oxf.processor.ProcessorImpl; import org.orbeon.oxf.processor.ProcessorInputOutputInfo; import org.orbeon.oxf.processor.ProcessorOutput; +import org.orbeon.oxf.processor.ProcessorUtils; import org.orbeon.oxf.processor.serializer.BinaryTextXMLReceiver; import org.orbeon.oxf.util.NetUtils; import org.orbeon.oxf.xml.ContentHandlerHelper; +import org.orbeon.oxf.xml.XMLConstants; +import org.orbeon.oxf.xml.XMLUtils; import org.owark.warc.*; +import org.xml.sax.Attributes; +import org.xml.sax.helpers.AttributesImpl; +import java.io.BufferedInputStream; import java.io.IOException; +import java.io.InputStreamReader; /** * This processor converts a WARC archive into an XML representation @@ -99,6 +106,28 @@ public class FromWarcConverter extends ProcessorImpl { } helper.endElement(); } + if (! content.endOfContent()) { + helper.startPrefixMapping("xsi", "http://www.w3.org/2001/XMLSchema-instance"); + helper.startPrefixMapping("xs", "http://www.w3.org/2001/XMLSchema"); + String contentType = content.getPayloadContentType(); + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute("", "content-type", "content-type", "CDATA", contentType); + if (contentType.startsWith("text/") || contentType.matches(".*application/[^;]*xml.*")) { + attributes.addAttribute(XMLConstants.XSI_URI, "type", "xsi:type", "CDATA", "xs:string"); + String encoding = content.getPayloadEncoding(); + if (encoding == null) { + encoding = "utf-8"; + } + helper.startElement(ProcessorUtils.DEFAULT_TEXT_DOCUMENT_ELEMENT, attributes); + XMLUtils.readerToCharacters(new InputStreamReader(content, encoding), xmlReceiver); + helper.endElement(); + } else { + attributes.addAttribute(XMLConstants.XSI_URI, "type", "xsi:type", "CDATA", "xs:base64Binary"); + helper.startElement(ProcessorUtils.DEFAULT_BINARY_DOCUMENT_ELEMENT, attributes); + XMLUtils.inputStreamToBase64Characters(new BufferedInputStream(content), xmlReceiver); + helper.endElement(); + } + } record.skipToEnd(); helper.endElement(); helper.endElement(); diff --git a/archiver/java/src/org/owark/warc/WarcRecordContent.java b/archiver/java/src/org/owark/warc/WarcRecordContent.java index eb60634..ec0b841 100644 --- a/archiver/java/src/org/owark/warc/WarcRecordContent.java +++ b/archiver/java/src/org/owark/warc/WarcRecordContent.java @@ -17,6 +17,8 @@ package org.owark.warc; import java.io.IOException; import java.io.InputStream; import java.util.Iterator; +import java.util.regex.Matcher; +import java.util.regex.Pattern; /** * Created by IntelliJ IDEA. @@ -30,6 +32,7 @@ public class WarcRecordContent extends InputStream implements IteratorApache Tomcat/6.0.24 - Rapport d'erreur", line.substring(0, line.indexOf("