147 lines
7.7 KiB
Java
147 lines
7.7 KiB
Java
/**
|
|
* Copyright (C) 2012 Eric van der Vlist.
|
|
*
|
|
* This program is free software; you can redistribute it and/or modify it under the terms of the
|
|
* GNU Lesser General Public License as published by the Free Software Foundation; either version
|
|
* 2.1 of the License, or (at your option) any later version.
|
|
*
|
|
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
|
|
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
|
* See the GNU Lesser General Public License for more details.
|
|
*
|
|
* The full text of the license is available at http://www.gnu.org/copyleft/lesser.html
|
|
*/
|
|
package org.owark.orbeon;
|
|
|
|
import org.apache.commons.fileupload.FileItem;
|
|
import org.orbeon.oxf.pipeline.api.PipelineContext;
|
|
import org.orbeon.oxf.pipeline.api.XMLReceiver;
|
|
import org.orbeon.oxf.processor.ProcessorImpl;
|
|
import org.orbeon.oxf.processor.ProcessorInputOutputInfo;
|
|
import org.orbeon.oxf.processor.ProcessorOutput;
|
|
import org.orbeon.oxf.processor.ProcessorUtils;
|
|
import org.orbeon.oxf.processor.serializer.BinaryTextXMLReceiver;
|
|
import org.orbeon.oxf.util.NetUtils;
|
|
import org.orbeon.oxf.xml.ContentHandlerHelper;
|
|
import org.orbeon.oxf.xml.XMLConstants;
|
|
import org.orbeon.oxf.xml.XMLUtils;
|
|
import org.owark.warc.*;
|
|
import org.xml.sax.Attributes;
|
|
import org.xml.sax.helpers.AttributesImpl;
|
|
|
|
import java.io.BufferedInputStream;
|
|
import java.io.IOException;
|
|
import java.io.InputStreamReader;
|
|
|
|
/**
|
|
* This processor converts a WARC archive into an XML representation
|
|
*/
|
|
|
|
public class FromWarcConverter extends ProcessorImpl {
|
|
|
|
static public String WARC_ELEMENT_ROOT_NAME = "warc";
|
|
static public String RECORD_ELEMENT_NAME = "record";
|
|
static public String HEADERS_ELEMENT_NAME = "headers";
|
|
static public String HEADER_ELEMENT_NAME = "header";
|
|
static public String NAME_ATTRIBUTE_NAME = "name";
|
|
static public String CONTENT_ELEMENT_NAME = "content";
|
|
|
|
public FromWarcConverter() {
|
|
addInputInfo(new ProcessorInputOutputInfo(INPUT_DATA));
|
|
addOutputInfo(new ProcessorInputOutputInfo(OUTPUT_DATA));
|
|
}
|
|
|
|
@Override
|
|
public ProcessorOutput createOutput(String outputName) {
|
|
final ProcessorOutput output = new ProcessorOutputImpl(FromWarcConverter.this,outputName) {
|
|
|
|
@Override
|
|
protected void readImpl(PipelineContext pipelineContext, XMLReceiver xmlReceiver) {
|
|
// Get FileItem
|
|
try {
|
|
ContentHandlerHelper helper = new ContentHandlerHelper(xmlReceiver);
|
|
helper.startDocument();
|
|
helper.startElement(WARC_ELEMENT_ROOT_NAME);
|
|
final FileItem fileItem = NetUtils.prepareFileItem(NetUtils.REQUEST_SCOPE);
|
|
// Read to OutputStream
|
|
readInputAsSAX(pipelineContext, INPUT_DATA, new BinaryTextXMLReceiver(null, fileItem.getOutputStream(), true, false, null, false, false, null, false));
|
|
// as an archive
|
|
final WarcParser warcParser = new WarcParser(fileItem.getInputStream());
|
|
while (warcParser.hasNext()) {
|
|
helper.startElement(RECORD_ELEMENT_NAME);
|
|
helper.startElement(HEADERS_ELEMENT_NAME);
|
|
WarcRecord record = warcParser.next();
|
|
WarcRecordHeader recordHeader = record.getHeader();
|
|
while (recordHeader.hasNext()) {
|
|
WarcField field = recordHeader.next();
|
|
helper.startElement(HEADER_ELEMENT_NAME, new String[] {NAME_ATTRIBUTE_NAME, field.getKey()});
|
|
helper.text(field.getValue());
|
|
helper.endElement();
|
|
}
|
|
helper.endElement();
|
|
helper.startElement(CONTENT_ELEMENT_NAME);
|
|
WarcRecordContent content = record.getContent();
|
|
if (content.hasRequestLine()) {
|
|
helper.startElement("request");
|
|
WarcRecordContent.HttpRequestLine request = content.getRequestLine();
|
|
helper.element("method", request.getMethod());
|
|
helper.element("uri", request.getUri());
|
|
helper.element("version", request.getVersion());
|
|
helper.endElement();
|
|
} else if (content.hasStatusLine()) {
|
|
helper.startElement("status");
|
|
WarcRecordContent.HttpStatusLine status = content.getStatusLine();
|
|
helper.element("version", status.getVersion());
|
|
helper.element("status", status.getStatus());
|
|
helper.element("reason", status.getReason());
|
|
helper.endElement();
|
|
}
|
|
if (content.hasFields()) {
|
|
helper.startElement(HEADERS_ELEMENT_NAME);
|
|
while (content.hasNext()) {
|
|
WarcField field = content.next();
|
|
helper.startElement(HEADER_ELEMENT_NAME, new String[] {NAME_ATTRIBUTE_NAME, field.getKey()});
|
|
helper.text(field.getValue());
|
|
helper.endElement();
|
|
}
|
|
helper.endElement();
|
|
}
|
|
if (! content.endOfContent()) {
|
|
helper.startPrefixMapping("xsi", "http://www.w3.org/2001/XMLSchema-instance");
|
|
helper.startPrefixMapping("xs", "http://www.w3.org/2001/XMLSchema");
|
|
String contentType = content.getPayloadContentType();
|
|
AttributesImpl attributes = new AttributesImpl();
|
|
attributes.addAttribute("", "content-type", "content-type", "CDATA", contentType);
|
|
if (contentType.startsWith("text/") || contentType.matches(".*application/[^;]*xml.*")) {
|
|
attributes.addAttribute(XMLConstants.XSI_URI, "type", "xsi:type", "CDATA", "xs:string");
|
|
String encoding = content.getPayloadEncoding();
|
|
if (encoding == null) {
|
|
encoding = "utf-8";
|
|
}
|
|
helper.startElement(ProcessorUtils.DEFAULT_TEXT_DOCUMENT_ELEMENT, attributes);
|
|
XMLUtils.readerToCharacters(new InputStreamReader(content, encoding), xmlReceiver);
|
|
helper.endElement();
|
|
} else {
|
|
attributes.addAttribute(XMLConstants.XSI_URI, "type", "xsi:type", "CDATA", "xs:base64Binary");
|
|
helper.startElement(ProcessorUtils.DEFAULT_BINARY_DOCUMENT_ELEMENT, attributes);
|
|
XMLUtils.inputStreamToBase64Characters(new BufferedInputStream(content), xmlReceiver);
|
|
helper.endElement();
|
|
}
|
|
}
|
|
record.skipToEnd();
|
|
helper.endElement();
|
|
helper.endElement();
|
|
}
|
|
helper.endElement();
|
|
helper.endDocument();
|
|
} catch (Exception e) {
|
|
e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
|
|
}
|
|
|
|
}
|
|
};
|
|
addOutput(outputName, output);
|
|
return output;
|
|
}
|
|
}
|