/**
 * Copyright (C) 2012 Eric van der Vlist.
 *
 * This program is free software; you can redistribute it and/or modify it under the terms of the
 * GNU Lesser General Public License as published by the Free Software Foundation; either version
 * 2.1 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU Lesser General Public License for more details.
 *
 * The full text of the license is available at http://www.gnu.org/copyleft/lesser.html
 */
package org.owark.orbeon;

import org.apache.commons.fileupload.FileItem;
import org.orbeon.oxf.pipeline.api.PipelineContext;
import org.orbeon.oxf.pipeline.api.XMLReceiver;
import org.orbeon.oxf.processor.ProcessorImpl;
import org.orbeon.oxf.processor.ProcessorInputOutputInfo;
import org.orbeon.oxf.processor.ProcessorOutput;
import org.orbeon.oxf.processor.serializer.BinaryTextXMLReceiver;
import org.orbeon.oxf.util.NetUtils;
import org.orbeon.oxf.xml.ContentHandlerHelper;
import org.owark.warc.*;

import java.io.IOException;
import java.io.InputStream;

/**
 * This processor converts a WARC archive into an XML representation.
 *
 * <p>The generated document has the following shape (element names come from the
 * constants declared below):</p>
 * <pre>
 * &lt;warc&gt;
 *   &lt;record&gt;
 *     &lt;headers&gt;&lt;header name="..."&gt;...&lt;/header&gt;...&lt;/headers&gt;
 *     &lt;content&gt;
 *       &lt;request&gt;...&lt;/request&gt; or &lt;status&gt;...&lt;/status&gt;   (optional)
 *       &lt;headers&gt;...&lt;/headers&gt;                              (optional)
 *     &lt;/content&gt;
 *   &lt;/record&gt;
 *   ...
 * &lt;/warc&gt;
 * </pre>
 */
public class FromWarcConverter extends ProcessorImpl {

    // NOTE(review): these are public and non-final, so they can be reassigned from outside;
    // left as-is to keep the class's public surface unchanged.
    static public String WARC_ELEMENT_ROOT_NAME = "warc";
    static public String RECORD_ELEMENT_NAME = "record";
    static public String HEADERS_ELEMENT_NAME = "headers";
    static public String HEADER_ELEMENT_NAME = "header";
    static public String NAME_ATTRIBUTE_NAME = "name";
    static public String CONTENT_ELEMENT_NAME = "content";

    /**
     * Declares the single {@code data} input and the single {@code data} output of the processor.
     */
    public FromWarcConverter() {
        addInputInfo(new ProcessorInputOutputInfo(INPUT_DATA));
        addOutputInfo(new ProcessorInputOutputInfo(OUTPUT_DATA));
    }

    /**
     * Creates and registers the output that, when read, serializes the WARC archive received on
     * the data input as XML.
     *
     * @param outputName name of the output to create
     * @return the newly created output, already registered through {@code addOutput}
     */
    @Override
    public ProcessorOutput createOutput(String outputName) {
        final ProcessorOutput output = new ProcessorOutputImpl(FromWarcConverter.this, outputName) {
            /**
             * Spools the data input into a temporary file item, parses it as a WARC archive and
             * emits one {@code record} element per WARC record.
             *
             * @throws RuntimeException if anything fails while reading or converting the archive;
             *                          the original exception is kept as the cause
             */
            @Override
            protected void readImpl(PipelineContext pipelineContext, XMLReceiver xmlReceiver) {
                try {
                    final ContentHandlerHelper helper = new ContentHandlerHelper(xmlReceiver);
                    helper.startDocument();
                    helper.startElement(WARC_ELEMENT_ROOT_NAME);

                    // Temporary storage for the binary content of the input document
                    final FileItem fileItem = NetUtils.prepareFileItem(NetUtils.REQUEST_SCOPE);

                    // Decode the input document into the file item's output stream.
                    // NOTE(review): the boolean/null arguments are passed through unchanged;
                    // confirm their meaning against the BinaryTextXMLReceiver constructor.
                    readInputAsSAX(pipelineContext, INPUT_DATA,
                            new BinaryTextXMLReceiver(null, fileItem.getOutputStream(), true, false, null, false, false, null, false));

                    // Re-read what was just written, this time as a WARC archive
                    final InputStream archiveStream = fileItem.getInputStream();
                    try {
                        final WarcParser warcParser = new WarcParser(archiveStream);
                        while (warcParser.hasNext()) {
                            helper.startElement(RECORD_ELEMENT_NAME);
                            emitWarcRecord(helper, warcParser.next());
                            helper.endElement();
                        }
                    } finally {
                        // The stream was never closed before; release it once parsing is over
                        archiveStream.close();
                    }

                    helper.endElement();
                    helper.endDocument();
                } catch (RuntimeException e) {
                    throw e;
                } catch (Exception e) {
                    // Previously the exception was only printed, which left a truncated document
                    // downstream and hid the failure from the pipeline.
                    throw new RuntimeException("Unable to convert WARC archive to XML", e);
                }
            }
        };
        addOutput(outputName, output);
        return output;
    }

    /**
     * Writes the children of a {@code record} element: the record headers followed by the
     * {@code content} element, then skips to the end of the record.
     *
     * Declared with {@code throws Exception} because the checked exceptions of the WARC API are
     * not visible from this file; the caller wraps whatever is thrown.
     */
    private static void emitWarcRecord(ContentHandlerHelper helper, WarcRecord record) throws Exception {
        helper.startElement(HEADERS_ELEMENT_NAME);
        final WarcRecordHeader recordHeader = record.getHeader();
        while (recordHeader.hasNext()) {
            emitWarcField(helper, recordHeader.next());
        }
        helper.endElement();

        helper.startElement(CONTENT_ELEMENT_NAME);
        final WarcRecordContent content = record.getContent();
        emitHttpStartLine(helper, content);
        if (content.hasFields()) {
            helper.startElement(HEADERS_ELEMENT_NAME);
            while (content.hasNext()) {
                emitWarcField(helper, content.next());
            }
            helper.endElement();
        }
        // Position the record at its end before the parser is asked for the next one
        record.skipToEnd();
        helper.endElement();
    }

    /**
     * Writes a {@code request} element when the content has a request line, otherwise a
     * {@code status} element when it has a status line, otherwise nothing.
     */
    private static void emitHttpStartLine(ContentHandlerHelper helper, WarcRecordContent content) throws Exception {
        if (content.hasRequestLine()) {
            helper.startElement("request");
            final WarcRecordContent.HttpRequestLine request = content.getRequestLine();
            helper.element("method", request.getMethod());
            helper.element("uri", request.getUri());
            helper.element("version", request.getVersion());
            helper.endElement();
        } else if (content.hasStatusLine()) {
            helper.startElement("status");
            final WarcRecordContent.HttpStatusLine status = content.getStatusLine();
            helper.element("version", status.getVersion());
            helper.element("status", status.getStatus());
            helper.element("reason", status.getReason());
            helper.endElement();
        }
    }

    /**
     * Writes one field as a {@code header} element whose {@code name} attribute is the field key
     * and whose text is the field value.
     */
    private static void emitWarcField(ContentHandlerHelper helper, WarcField field) throws Exception {
        helper.startElement(HEADER_ELEMENT_NAME, new String[] {NAME_ATTRIBUTE_NAME, field.getKey()});
        helper.text(field.getValue());
        helper.endElement();
    }
}