Handling payload content types

This commit is contained in:
Eric van der Vlist 2012-04-26 14:13:24 +02:00
parent be1a361ab9
commit 6f64c7f8a9
3 changed files with 80 additions and 4 deletions

View File

@ -19,12 +19,19 @@ import org.orbeon.oxf.pipeline.api.XMLReceiver;
import org.orbeon.oxf.processor.ProcessorImpl;
import org.orbeon.oxf.processor.ProcessorInputOutputInfo;
import org.orbeon.oxf.processor.ProcessorOutput;
import org.orbeon.oxf.processor.ProcessorUtils;
import org.orbeon.oxf.processor.serializer.BinaryTextXMLReceiver;
import org.orbeon.oxf.util.NetUtils;
import org.orbeon.oxf.xml.ContentHandlerHelper;
import org.orbeon.oxf.xml.XMLConstants;
import org.orbeon.oxf.xml.XMLUtils;
import org.owark.warc.*;
import org.xml.sax.Attributes;
import org.xml.sax.helpers.AttributesImpl;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
/**
* This processor converts a WARC archive into an XML representation
@ -99,6 +106,28 @@ public class FromWarcConverter extends ProcessorImpl {
}
helper.endElement();
}
if (! content.endOfContent()) {
helper.startPrefixMapping("xsi", "http://www.w3.org/2001/XMLSchema-instance");
helper.startPrefixMapping("xs", "http://www.w3.org/2001/XMLSchema");
String contentType = content.getPayloadContentType();
AttributesImpl attributes = new AttributesImpl();
attributes.addAttribute("", "content-type", "content-type", "CDATA", contentType);
if (contentType.startsWith("text/") || contentType.matches(".*application/[^;]*xml.*")) {
attributes.addAttribute(XMLConstants.XSI_URI, "type", "xsi:type", "CDATA", "xs:string");
String encoding = content.getPayloadEncoding();
if (encoding == null) {
encoding = "utf-8";
}
helper.startElement(ProcessorUtils.DEFAULT_TEXT_DOCUMENT_ELEMENT, attributes);
XMLUtils.readerToCharacters(new InputStreamReader(content, encoding), xmlReceiver);
helper.endElement();
} else {
attributes.addAttribute(XMLConstants.XSI_URI, "type", "xsi:type", "CDATA", "xs:base64Binary");
helper.startElement(ProcessorUtils.DEFAULT_BINARY_DOCUMENT_ELEMENT, attributes);
XMLUtils.inputStreamToBase64Characters(new BufferedInputStream(content), xmlReceiver);
helper.endElement();
}
}
record.skipToEnd();
helper.endElement();
helper.endElement();

View File

@ -17,6 +17,8 @@ package org.owark.warc;
import java.io.IOException;
import java.io.InputStream;
import java.util.Iterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Created by IntelliJ IDEA.
@ -30,6 +32,7 @@ public class WarcRecordContent extends InputStream implements Iterator<WarcField
private WarcRecord warcRecord;
private Exception e;
private String line;
private String payloadContentType;
public WarcRecordContent(WarcRecord warcRecord) {
this.warcRecord = warcRecord;
@ -61,6 +64,9 @@ public class WarcRecordContent extends InputStream implements Iterator<WarcField
return null;
}
WarcField field = new WarcField(line);
if (field.getKey().equals("Content-Type")) {
this.payloadContentType = field.getValue();
}
line = null;
return field;
}
@ -94,7 +100,7 @@ public class WarcRecordContent extends InputStream implements Iterator<WarcField
return isHTTP() && isRequest();
}
public Object endOfContent() {
public boolean endOfContent() {
return warcRecord.isLimitReached();
}
@ -106,6 +112,38 @@ public class WarcRecordContent extends InputStream implements Iterator<WarcField
return warcRecord.getContentLength();
}
/**
 * Returns the media type of the payload without any parameters.
 *
 * <p>The value is derived from {@link #getPayloadContentHeader()}: everything from the first
 * {@code ';'} onwards (for instance {@code ;charset=utf-8}) is dropped, and surrounding
 * whitespace is removed so that a header such as {@code "text/html ; charset=utf-8"} yields
 * {@code "text/html"} rather than {@code "text/html "}.
 *
 * @return the bare media type, or {@code null} when {@link #getPayloadContentHeader()}
 *         returns {@code null}
 */
public String getPayloadContentType() {
    String header = getPayloadContentHeader();
    if (header == null) {
        return null;
    }
    int separator = header.indexOf(';');
    String mediaType = separator < 0 ? header : header.substring(0, separator);
    // Optional whitespace is allowed around ';' in a Content-Type header; do not leak it to callers.
    return mediaType.trim();
}
/**
 * Returns the raw Content-Type header value that describes the payload of this record,
 * parameters included.
 *
 * <ul>
 *   <li>For {@code application/warc-fields} and {@code application/http; msgtype=request}
 *       records the result is {@code null}.</li>
 *   <li>For {@code application/http; msgtype=response} records the result is the value of the
 *       {@code Content-Type} field captured while iterating over the record's header fields
 *       ({@code null} if no such field has been read so far).</li>
 *   <li>For any other record the WARC record's own content type is returned unchanged.</li>
 * </ul>
 *
 * <p>The WARC content type is compared ignoring case and whitespace, so both
 * {@code application/http; msgtype=response} and {@code application/http;msgtype=response}
 * are recognised.
 *
 * @return the payload Content-Type header value, or {@code null} when none applies or the
 *         record declares no content type
 */
public String getPayloadContentHeader() {
    String recordContentType = warcRecord.getContentType();
    if (recordContentType == null) {
        // A record without a Content-Type has no payload type to report.
        return null;
    }
    // Normalise only for comparison; the value handed back to callers is left untouched.
    String normalized = recordContentType.replaceAll("\\s+", "").toLowerCase(java.util.Locale.ROOT);
    if (normalized.equals("application/warc-fields")
            || normalized.equals("application/http;msgtype=request")) {
        return null;
    }
    if (normalized.equals("application/http;msgtype=response")) {
        return this.payloadContentType;
    }
    return recordContentType;
}
/**
 * Matches the {@code charset} parameter of a Content-Type header value. The parameter name is
 * matched case-insensitively and optional quotes around the value are left out of group 1.
 */
private static final Pattern CHARSET_PARAMETER =
        Pattern.compile(";\\s*charset\\s*=\\s*[\"']?([^\"';\\s]+)[\"']?", Pattern.CASE_INSENSITIVE);

/**
 * Extracts the character encoding declared by the {@code charset} parameter of the payload
 * Content-Type header (see {@link #getPayloadContentHeader()}).
 *
 * <p>The returned name is lower-cased with {@link java.util.Locale#ROOT} and stripped of
 * surrounding quotes and whitespace, so {@code text/html; Charset="UTF-8"} yields
 * {@code "utf-8"}.
 *
 * @return the lower-cased charset name, or {@code null} when there is no payload header or it
 *         carries no {@code charset} parameter
 */
public String getPayloadEncoding() {
    String header = getPayloadContentHeader();
    if (header == null) {
        return null;
    }
    Matcher matcher = CHARSET_PARAMETER.matcher(header);
    if (!matcher.find()) {
        return null;
    }
    // Locale.ROOT keeps the result stable under locales with special casing rules (e.g. Turkish).
    return matcher.group(1).toLowerCase(java.util.Locale.ROOT);
}
public class HttpStatusLine {

View File

@ -34,7 +34,7 @@ public class WarcParserTest {
WarcParser warcParser = new WarcParser(new FileInputStream(file));
Assert.assertEquals(true, warcParser.hasNext());
// RECORD
// RECORD (warcinfo)
WarcRecord record = warcParser.next();
Assert.assertEquals("WARC/1.0", warcParser.getMagic());
@ -97,9 +97,12 @@ public class WarcParserTest {
Assert.assertEquals("http-header-user-agent", field.getKey());
Assert.assertEquals("Mozilla/5.0 (compatible; heritrix/3.1.0 +http://owark.org)", field.getValue());
Assert.assertEquals(false, content.hasNext());
Assert.assertNull(content.getPayloadContentType());
Assert.assertNull(content.getPayloadContentHeader());
Assert.assertNull(content.getPayloadEncoding());
Assert.assertEquals(true, content.endOfContent());
// Next record
// Next record (DNS response)
Assert.assertEquals(true, warcParser.hasNext());
record = warcParser.next();
@ -134,9 +137,12 @@ public class WarcParserTest {
Assert.assertEquals("dyomedea.com.\t\t1800\tIN\tA\t95.142.167.137", line);
line = reader.readLine();
Assert.assertEquals(true, content.endOfContent());
Assert.assertEquals("text/dns", content.getPayloadContentType());
Assert.assertEquals("text/dns", content.getPayloadContentHeader());
Assert.assertNull(content.getPayloadEncoding());
Assert.assertNull(line);
// Next record
// Next record (HTTP response)
Assert.assertEquals(true, warcParser.hasNext());
record = warcParser.next();
@ -190,6 +196,9 @@ public class WarcParserTest {
Assert.assertEquals("<html><head><title>Apache Tomcat/6.0.24 - Rapport d'erreur</title>", line.substring(0, line.indexOf("<style>")));
line = reader.readLine();
Assert.assertNull(line);
Assert.assertEquals("text/html", content.getPayloadContentType());
Assert.assertEquals("text/html;charset=utf-8", content.getPayloadContentHeader());
Assert.assertEquals("utf-8", content.getPayloadEncoding());
Assert.assertEquals(true, content.endOfContent());