Handling payload content types
This commit is contained in:
parent
be1a361ab9
commit
6f64c7f8a9
|
@ -19,12 +19,19 @@ import org.orbeon.oxf.pipeline.api.XMLReceiver;
|
||||||
import org.orbeon.oxf.processor.ProcessorImpl;
|
import org.orbeon.oxf.processor.ProcessorImpl;
|
||||||
import org.orbeon.oxf.processor.ProcessorInputOutputInfo;
|
import org.orbeon.oxf.processor.ProcessorInputOutputInfo;
|
||||||
import org.orbeon.oxf.processor.ProcessorOutput;
|
import org.orbeon.oxf.processor.ProcessorOutput;
|
||||||
|
import org.orbeon.oxf.processor.ProcessorUtils;
|
||||||
import org.orbeon.oxf.processor.serializer.BinaryTextXMLReceiver;
|
import org.orbeon.oxf.processor.serializer.BinaryTextXMLReceiver;
|
||||||
import org.orbeon.oxf.util.NetUtils;
|
import org.orbeon.oxf.util.NetUtils;
|
||||||
import org.orbeon.oxf.xml.ContentHandlerHelper;
|
import org.orbeon.oxf.xml.ContentHandlerHelper;
|
||||||
|
import org.orbeon.oxf.xml.XMLConstants;
|
||||||
|
import org.orbeon.oxf.xml.XMLUtils;
|
||||||
import org.owark.warc.*;
|
import org.owark.warc.*;
|
||||||
|
import org.xml.sax.Attributes;
|
||||||
|
import org.xml.sax.helpers.AttributesImpl;
|
||||||
|
|
||||||
|
import java.io.BufferedInputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.io.InputStreamReader;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This processor converts a WARC archive into an XML representation
|
* This processor converts a WARC archive into an XML representation
|
||||||
|
@ -99,6 +106,28 @@ public class FromWarcConverter extends ProcessorImpl {
|
||||||
}
|
}
|
||||||
helper.endElement();
|
helper.endElement();
|
||||||
}
|
}
|
||||||
|
if (! content.endOfContent()) {
|
||||||
|
helper.startPrefixMapping("xsi", "http://www.w3.org/2001/XMLSchema-instance");
|
||||||
|
helper.startPrefixMapping("xs", "http://www.w3.org/2001/XMLSchema");
|
||||||
|
String contentType = content.getPayloadContentType();
|
||||||
|
AttributesImpl attributes = new AttributesImpl();
|
||||||
|
attributes.addAttribute("", "content-type", "content-type", "CDATA", contentType);
|
||||||
|
if (contentType.startsWith("text/") || contentType.matches(".*application/[^;]*xml.*")) {
|
||||||
|
attributes.addAttribute(XMLConstants.XSI_URI, "type", "xsi:type", "CDATA", "xs:string");
|
||||||
|
String encoding = content.getPayloadEncoding();
|
||||||
|
if (encoding == null) {
|
||||||
|
encoding = "utf-8";
|
||||||
|
}
|
||||||
|
helper.startElement(ProcessorUtils.DEFAULT_TEXT_DOCUMENT_ELEMENT, attributes);
|
||||||
|
XMLUtils.readerToCharacters(new InputStreamReader(content, encoding), xmlReceiver);
|
||||||
|
helper.endElement();
|
||||||
|
} else {
|
||||||
|
attributes.addAttribute(XMLConstants.XSI_URI, "type", "xsi:type", "CDATA", "xs:base64Binary");
|
||||||
|
helper.startElement(ProcessorUtils.DEFAULT_BINARY_DOCUMENT_ELEMENT, attributes);
|
||||||
|
XMLUtils.inputStreamToBase64Characters(new BufferedInputStream(content), xmlReceiver);
|
||||||
|
helper.endElement();
|
||||||
|
}
|
||||||
|
}
|
||||||
record.skipToEnd();
|
record.skipToEnd();
|
||||||
helper.endElement();
|
helper.endElement();
|
||||||
helper.endElement();
|
helper.endElement();
|
||||||
|
|
|
@ -17,6 +17,8 @@ package org.owark.warc;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
|
import java.util.regex.Matcher;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Created by IntelliJ IDEA.
|
* Created by IntelliJ IDEA.
|
||||||
|
@ -30,6 +32,7 @@ public class WarcRecordContent extends InputStream implements Iterator<WarcField
|
||||||
private WarcRecord warcRecord;
|
private WarcRecord warcRecord;
|
||||||
private Exception e;
|
private Exception e;
|
||||||
private String line;
|
private String line;
|
||||||
|
private String payloadContentType;
|
||||||
|
|
||||||
public WarcRecordContent(WarcRecord warcRecord) {
|
public WarcRecordContent(WarcRecord warcRecord) {
|
||||||
this.warcRecord = warcRecord;
|
this.warcRecord = warcRecord;
|
||||||
|
@ -61,6 +64,9 @@ public class WarcRecordContent extends InputStream implements Iterator<WarcField
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
WarcField field = new WarcField(line);
|
WarcField field = new WarcField(line);
|
||||||
|
if (field.getKey().equals("Content-Type")) {
|
||||||
|
this.payloadContentType = field.getValue();
|
||||||
|
}
|
||||||
line = null;
|
line = null;
|
||||||
return field;
|
return field;
|
||||||
}
|
}
|
||||||
|
@ -94,7 +100,7 @@ public class WarcRecordContent extends InputStream implements Iterator<WarcField
|
||||||
return isHTTP() && isRequest();
|
return isHTTP() && isRequest();
|
||||||
}
|
}
|
||||||
|
|
||||||
public Object endOfContent() {
|
public boolean endOfContent() {
|
||||||
return warcRecord.isLimitReached();
|
return warcRecord.isLimitReached();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -106,6 +112,38 @@ public class WarcRecordContent extends InputStream implements Iterator<WarcField
|
||||||
return warcRecord.getContentLength();
|
return warcRecord.getContentLength();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public String getPayloadContentType() {
|
||||||
|
String contentType = getPayloadContentHeader();
|
||||||
|
if (contentType != null && contentType.contains(";")) {
|
||||||
|
contentType = contentType.substring(0, contentType.indexOf(";"));
|
||||||
|
}
|
||||||
|
return contentType;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getPayloadContentHeader() {
|
||||||
|
String contentType = warcRecord.getContentType();
|
||||||
|
if (contentType.equals("application/warc-fields") || contentType.equals("application/http; msgtype=request")) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
if (contentType.equals("application/http; msgtype=response")) {
|
||||||
|
contentType = this.payloadContentType;
|
||||||
|
}
|
||||||
|
return contentType;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getPayloadEncoding() {
|
||||||
|
String contentType = getPayloadContentHeader();
|
||||||
|
if (contentType == null) {
|
||||||
|
return contentType;
|
||||||
|
}
|
||||||
|
Pattern pattern = Pattern.compile(".*;\\s*charset\\s*=\\s*([^;]+).*");
|
||||||
|
Matcher matcher = pattern.matcher(contentType);
|
||||||
|
if (matcher.matches()) {
|
||||||
|
return matcher.group(1).toLowerCase();
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public class HttpStatusLine {
|
public class HttpStatusLine {
|
||||||
|
|
||||||
|
|
|
@ -34,7 +34,7 @@ public class WarcParserTest {
|
||||||
WarcParser warcParser = new WarcParser(new FileInputStream(file));
|
WarcParser warcParser = new WarcParser(new FileInputStream(file));
|
||||||
Assert.assertEquals(true, warcParser.hasNext());
|
Assert.assertEquals(true, warcParser.hasNext());
|
||||||
|
|
||||||
// RECORD
|
// RECORD (warcinfo)
|
||||||
|
|
||||||
WarcRecord record = warcParser.next();
|
WarcRecord record = warcParser.next();
|
||||||
Assert.assertEquals("WARC/1.0", warcParser.getMagic());
|
Assert.assertEquals("WARC/1.0", warcParser.getMagic());
|
||||||
|
@ -97,9 +97,12 @@ public class WarcParserTest {
|
||||||
Assert.assertEquals("http-header-user-agent", field.getKey());
|
Assert.assertEquals("http-header-user-agent", field.getKey());
|
||||||
Assert.assertEquals("Mozilla/5.0 (compatible; heritrix/3.1.0 +http://owark.org)", field.getValue());
|
Assert.assertEquals("Mozilla/5.0 (compatible; heritrix/3.1.0 +http://owark.org)", field.getValue());
|
||||||
Assert.assertEquals(false, content.hasNext());
|
Assert.assertEquals(false, content.hasNext());
|
||||||
|
Assert.assertNull(content.getPayloadContentType());
|
||||||
|
Assert.assertNull(content.getPayloadContentHeader());
|
||||||
|
Assert.assertNull(content.getPayloadEncoding());
|
||||||
Assert.assertEquals(true, content.endOfContent());
|
Assert.assertEquals(true, content.endOfContent());
|
||||||
|
|
||||||
// Next record
|
// Next record (DNS response)
|
||||||
|
|
||||||
Assert.assertEquals(true, warcParser.hasNext());
|
Assert.assertEquals(true, warcParser.hasNext());
|
||||||
record = warcParser.next();
|
record = warcParser.next();
|
||||||
|
@ -134,9 +137,12 @@ public class WarcParserTest {
|
||||||
Assert.assertEquals("dyomedea.com.\t\t1800\tIN\tA\t95.142.167.137", line);
|
Assert.assertEquals("dyomedea.com.\t\t1800\tIN\tA\t95.142.167.137", line);
|
||||||
line = reader.readLine();
|
line = reader.readLine();
|
||||||
Assert.assertEquals(true, content.endOfContent());
|
Assert.assertEquals(true, content.endOfContent());
|
||||||
|
Assert.assertEquals("text/dns", content.getPayloadContentType());
|
||||||
|
Assert.assertEquals("text/dns", content.getPayloadContentHeader());
|
||||||
|
Assert.assertNull(content.getPayloadEncoding());
|
||||||
Assert.assertNull(line);
|
Assert.assertNull(line);
|
||||||
|
|
||||||
// Next record
|
// Next record (HTTP response)
|
||||||
|
|
||||||
Assert.assertEquals(true, warcParser.hasNext());
|
Assert.assertEquals(true, warcParser.hasNext());
|
||||||
record = warcParser.next();
|
record = warcParser.next();
|
||||||
|
@ -190,6 +196,9 @@ public class WarcParserTest {
|
||||||
Assert.assertEquals("<html><head><title>Apache Tomcat/6.0.24 - Rapport d'erreur</title>", line.substring(0, line.indexOf("<style>")));
|
Assert.assertEquals("<html><head><title>Apache Tomcat/6.0.24 - Rapport d'erreur</title>", line.substring(0, line.indexOf("<style>")));
|
||||||
line = reader.readLine();
|
line = reader.readLine();
|
||||||
Assert.assertNull(line);
|
Assert.assertNull(line);
|
||||||
|
Assert.assertEquals("text/html", content.getPayloadContentType());
|
||||||
|
Assert.assertEquals("text/html;charset=utf-8", content.getPayloadContentHeader());
|
||||||
|
Assert.assertEquals("utf-8", content.getPayloadEncoding());
|
||||||
Assert.assertEquals(true, content.endOfContent());
|
Assert.assertEquals(true, content.endOfContent());
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue