Implementing yet another WARC parser (the Heritrix one didn't work well with Orbeon due to HTTP client library conflicts).
This commit is contained in:
parent
307b6d2a72
commit
be1a361ab9
|
@ -0,0 +1,45 @@
|
|||
<project name="owark" default="dist" basedir=".">
    <description>
        Owark build file
    </description>

    <!-- set global properties for this build -->
    <property name="src" location="java/src"/>
    <property name="build" location="build"/>
    <property name="dist" location="dist"/>
    <!-- Directory holding the Orbeon Forms jars needed at compile time.
         The default is the location used by the original build; override it on
         other machines with: ant -Dorbeon.lib=/path/to/orbeon-war/WEB-INF/lib -->
    <property name="orbeon.lib" location="/home/vdv/projects/orbeon-forms/build/orbeon-war/WEB-INF/lib"/>

    <target name="init">
        <!-- Create the time stamp -->
        <tstamp/>
        <!-- Create the build directory structure used by compile -->
        <mkdir dir="${build}"/>
    </target>

    <target name="compile" depends="init"
            description="compile the source " >
        <!-- Compile the java code from ${src} into ${build} -->
        <javac srcdir="${src}" destdir="${build}">
            <classpath>
                <pathelement location="java/lib/heritrix-commons-3.1.0.jar"/>
                <pathelement location="java/lib/archive-overlay-commons-httpclient-3.1.jar"/>
                <pathelement location="${orbeon.lib}/commons-fileupload-1.2.2.jar"/>
                <pathelement location="${orbeon.lib}/orbeon.jar"/>
            </classpath>
        </javac>
    </target>

    <target name="dist" depends="compile"
            description="generate the distribution" >
        <!-- Create the distribution directory -->
        <mkdir dir="${dist}/lib"/>

        <!-- Put everything in ${build} into ${dist}/lib/owark.jar -->
        <jar jarfile="${dist}/lib/owark.jar" basedir="${build}"/>
    </target>

    <target name="clean"
            description="clean up" >
        <!-- Delete the ${build} and ${dist} directory trees -->
        <delete dir="${build}"/>
        <delete dir="${dist}"/>
    </target>
</project>
|
|
@ -0,0 +1,117 @@
|
|||
/**
|
||||
* Copyright (C) 2012 Eric van der Vlist.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it under the terms of the
|
||||
* GNU Lesser General Public License as published by the Free Software Foundation; either version
|
||||
* 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
|
||||
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See the GNU Lesser General Public License for more details.
|
||||
*
|
||||
* The full text of the license is available at http://www.gnu.org/copyleft/lesser.html
|
||||
*/
|
||||
package org.owark.orbeon;
|
||||
|
||||
import org.apache.commons.fileupload.FileItem;
|
||||
import org.orbeon.oxf.pipeline.api.PipelineContext;
|
||||
import org.orbeon.oxf.pipeline.api.XMLReceiver;
|
||||
import org.orbeon.oxf.processor.ProcessorImpl;
|
||||
import org.orbeon.oxf.processor.ProcessorInputOutputInfo;
|
||||
import org.orbeon.oxf.processor.ProcessorOutput;
|
||||
import org.orbeon.oxf.processor.serializer.BinaryTextXMLReceiver;
|
||||
import org.orbeon.oxf.util.NetUtils;
|
||||
import org.orbeon.oxf.xml.ContentHandlerHelper;
|
||||
import org.owark.warc.*;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* This processor converts a WARC archive into an XML representation
|
||||
*/
|
||||
|
||||
public class FromWarcConverter extends ProcessorImpl {
|
||||
|
||||
static public String WARC_ELEMENT_ROOT_NAME = "warc";
|
||||
static public String RECORD_ELEMENT_NAME = "record";
|
||||
static public String HEADERS_ELEMENT_NAME = "headers";
|
||||
static public String HEADER_ELEMENT_NAME = "header";
|
||||
static public String NAME_ATTRIBUTE_NAME = "name";
|
||||
static public String CONTENT_ELEMENT_NAME = "content";
|
||||
|
||||
public FromWarcConverter() {
|
||||
addInputInfo(new ProcessorInputOutputInfo(INPUT_DATA));
|
||||
addOutputInfo(new ProcessorInputOutputInfo(OUTPUT_DATA));
|
||||
}
|
||||
|
||||
@Override
|
||||
public ProcessorOutput createOutput(String outputName) {
|
||||
final ProcessorOutput output = new ProcessorOutputImpl(FromWarcConverter.this,outputName) {
|
||||
|
||||
@Override
|
||||
protected void readImpl(PipelineContext pipelineContext, XMLReceiver xmlReceiver) {
|
||||
// Get FileItem
|
||||
try {
|
||||
ContentHandlerHelper helper = new ContentHandlerHelper(xmlReceiver);
|
||||
helper.startDocument();
|
||||
helper.startElement(WARC_ELEMENT_ROOT_NAME);
|
||||
final FileItem fileItem = NetUtils.prepareFileItem(NetUtils.REQUEST_SCOPE);
|
||||
// Read to OutputStream
|
||||
readInputAsSAX(pipelineContext, INPUT_DATA, new BinaryTextXMLReceiver(null, fileItem.getOutputStream(), true, false, null, false, false, null, false));
|
||||
// as an archive
|
||||
final WarcParser warcParser = new WarcParser(fileItem.getInputStream());
|
||||
while (warcParser.hasNext()) {
|
||||
helper.startElement(RECORD_ELEMENT_NAME);
|
||||
helper.startElement(HEADERS_ELEMENT_NAME);
|
||||
WarcRecord record = warcParser.next();
|
||||
WarcRecordHeader recordHeader = record.getHeader();
|
||||
while (recordHeader.hasNext()) {
|
||||
WarcField field = recordHeader.next();
|
||||
helper.startElement(HEADER_ELEMENT_NAME, new String[] {NAME_ATTRIBUTE_NAME, field.getKey()});
|
||||
helper.text(field.getValue());
|
||||
helper.endElement();
|
||||
}
|
||||
helper.endElement();
|
||||
helper.startElement(CONTENT_ELEMENT_NAME);
|
||||
WarcRecordContent content = record.getContent();
|
||||
if (content.hasRequestLine()) {
|
||||
helper.startElement("request");
|
||||
WarcRecordContent.HttpRequestLine request = content.getRequestLine();
|
||||
helper.element("method", request.getMethod());
|
||||
helper.element("uri", request.getUri());
|
||||
helper.element("version", request.getVersion());
|
||||
helper.endElement();
|
||||
} else if (content.hasStatusLine()) {
|
||||
helper.startElement("status");
|
||||
WarcRecordContent.HttpStatusLine status = content.getStatusLine();
|
||||
helper.element("version", status.getVersion());
|
||||
helper.element("status", status.getStatus());
|
||||
helper.element("reason", status.getReason());
|
||||
helper.endElement();
|
||||
}
|
||||
if (content.hasFields()) {
|
||||
helper.startElement(HEADERS_ELEMENT_NAME);
|
||||
while (content.hasNext()) {
|
||||
WarcField field = content.next();
|
||||
helper.startElement(HEADER_ELEMENT_NAME, new String[] {NAME_ATTRIBUTE_NAME, field.getKey()});
|
||||
helper.text(field.getValue());
|
||||
helper.endElement();
|
||||
}
|
||||
helper.endElement();
|
||||
}
|
||||
record.skipToEnd();
|
||||
helper.endElement();
|
||||
helper.endElement();
|
||||
}
|
||||
helper.endElement();
|
||||
helper.endDocument();
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
|
||||
}
|
||||
|
||||
}
|
||||
};
|
||||
addOutput(outputName, output);
|
||||
return output;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,49 @@
|
|||
/**
|
||||
* Copyright (C) 2012 Eric van der Vlist.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it under the terms of the
|
||||
* GNU Lesser General Public License as published by the Free Software Foundation; either version
|
||||
* 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
|
||||
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See the GNU Lesser General Public License for more details.
|
||||
*
|
||||
* The full text of the license is available at http://www.gnu.org/copyleft/lesser.html
|
||||
*/
|
||||
package org.owark.warc;
|
||||
|
||||
/**
 * A single {@code name: value} field, as found in WARC record headers,
 * {@code application/warc-fields} payloads and HTTP header blocks.
 *
 * <p>The line is split on its first colon; both parts are trimmed. A line
 * without any colon is kept as a key with an empty value instead of failing.</p>
 */
public class WarcField {

    private final String line;
    private final String key;
    private final String value;

    /**
     * Parses a field line.
     *
     * @param line the raw line, without its line terminator
     * @throws NullPointerException if {@code line} is {@code null}
     */
    public WarcField(String line) {
        if (line == null) {
            throw new NullPointerException("line must not be null");
        }
        this.line = line;
        int sep = line.indexOf(':');
        if (sep < 0) {
            // Malformed field (no separator): keep the text as the key rather than
            // failing with StringIndexOutOfBoundsException on substring(0, -1)
            this.key = line.trim();
            this.value = "";
        } else {
            this.key = line.substring(0, sep).trim();
            this.value = line.substring(sep + 1).trim();
        }
    }

    /** @return the trimmed text before the first colon (the whole trimmed line if there is no colon) */
    public String getKey() {
        return key;
    }

    /** @return the raw line this field was parsed from */
    public String getLine() {
        return line;
    }

    /** @return the trimmed text after the first colon, or an empty string if there is no colon */
    public String getValue() {
        return value;
    }

}
|
|
@ -0,0 +1,123 @@
|
|||
/**
|
||||
* Copyright (C) 2012 Eric van der Vlist.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it under the terms of the
|
||||
* GNU Lesser General Public License as published by the Free Software Foundation; either version
|
||||
* 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
|
||||
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See the GNU Lesser General Public License for more details.
|
||||
*
|
||||
* The full text of the license is available at http://www.gnu.org/copyleft/lesser.html
|
||||
*/
|
||||
package org.owark.warc;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.Iterator;
|
||||
|
||||
/**
|
||||
* Read WARC files
|
||||
*/
|
||||
public class WarcParser implements Iterator<WarcRecord> {
|
||||
|
||||
public static int BUFFER_SIZE = 1024;
|
||||
public static String CRLF = "\r\n";
|
||||
public static String CRLFCRLF = CRLF + CRLF;
|
||||
public static String MAGIC = "WARC/";
|
||||
private InputStream is;
|
||||
private byte[] buffer = new byte[BUFFER_SIZE];
|
||||
private int index = 0;
|
||||
private int limit = -1;
|
||||
private String magic;
|
||||
private int recordCount;
|
||||
|
||||
|
||||
public WarcParser(InputStream is) {
|
||||
this.is = is;
|
||||
resetBuffer();
|
||||
}
|
||||
|
||||
public String getMagic() throws IOException, WarcException {
|
||||
return this.magic;
|
||||
}
|
||||
|
||||
private void resetBuffer() {
|
||||
index = 0;
|
||||
}
|
||||
|
||||
private void readUntil(String stringPattern) throws IOException, WarcException {
|
||||
boolean matches = true;
|
||||
for (int i=0; i< stringPattern.length() && limit != 0; i++) {
|
||||
int c = read();
|
||||
buffer[index ++] = (byte) c;
|
||||
if (stringPattern.codePointAt(i) != c) {
|
||||
matches = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (matches) {
|
||||
return;
|
||||
}
|
||||
readUntil(stringPattern);
|
||||
}
|
||||
|
||||
protected String readLine() throws IOException, WarcException {
|
||||
readUntil(CRLF);
|
||||
String line = new String(buffer, 0, index - CRLF.length(), "UTF-8");
|
||||
resetBuffer();
|
||||
return line;
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
limit = -1;
|
||||
do {
|
||||
try {
|
||||
magic = readLine();
|
||||
} catch (Exception e) {
|
||||
return false;
|
||||
}
|
||||
} while (! magic.startsWith(MAGIC));
|
||||
return true;
|
||||
}
|
||||
|
||||
public WarcRecord next() {
|
||||
recordCount ++;
|
||||
return new WarcRecord(this);
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
}
|
||||
|
||||
public void setLimit(int limit) {
|
||||
this.limit = limit;
|
||||
}
|
||||
|
||||
public boolean isLimitReached() {
|
||||
return limit == 0;
|
||||
}
|
||||
|
||||
public int read() throws IOException {
|
||||
if (limit == 0) {
|
||||
return -1;
|
||||
}
|
||||
if (limit > 0) {
|
||||
limit--;
|
||||
}
|
||||
int c = is.read();
|
||||
//System.out.print((char) c);
|
||||
return c;
|
||||
}
|
||||
|
||||
public int getRecordCount() {
|
||||
return recordCount;
|
||||
}
|
||||
|
||||
class WarcException extends Exception {}
|
||||
class BufferOverflowException extends WarcException {}
|
||||
class BadMagicException extends WarcException {}
|
||||
|
||||
}
|
|
@ -0,0 +1,84 @@
|
|||
/**
|
||||
* Copyright (C) 2012 Eric van der Vlist.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it under the terms of the
|
||||
* GNU Lesser General Public License as published by the Free Software Foundation; either version
|
||||
* 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
|
||||
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See the GNU Lesser General Public License for more details.
|
||||
*
|
||||
* The full text of the license is available at http://www.gnu.org/copyleft/lesser.html
|
||||
*/
|
||||
package org.owark.warc;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: vdv
|
||||
* Date: 25 avr. 2012
|
||||
* Time: 17:29:35
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class WarcRecord {
|
||||
|
||||
private WarcParser warcParser;
|
||||
private WarcRecordHeader header;
|
||||
private WarcRecordContent content;
|
||||
|
||||
public WarcRecord(WarcParser warcParser) {
|
||||
this.warcParser = warcParser;
|
||||
}
|
||||
|
||||
public Object getMagic() throws IOException, WarcParser.WarcException {
|
||||
return warcParser.getMagic();
|
||||
}
|
||||
|
||||
public WarcRecordHeader getHeader() {
|
||||
if (header == null) {
|
||||
header = new WarcRecordHeader(this);
|
||||
}
|
||||
return header;
|
||||
}
|
||||
|
||||
public String readLine() throws IOException, WarcParser.WarcException {
|
||||
return warcParser.readLine();
|
||||
}
|
||||
|
||||
public String getType() {
|
||||
return header.getType();
|
||||
}
|
||||
|
||||
public String getContentType() {
|
||||
return header.getContentType();
|
||||
}
|
||||
|
||||
public WarcRecordContent getContent() {
|
||||
if (content == null) {
|
||||
warcParser.setLimit(getContentLength());
|
||||
content = new WarcRecordContent(this);
|
||||
}
|
||||
return content;
|
||||
}
|
||||
|
||||
public int getContentLength() {
|
||||
return header.getContentLength();
|
||||
}
|
||||
|
||||
public boolean isLimitReached() {
|
||||
return warcParser.isLimitReached();
|
||||
}
|
||||
|
||||
public int read() throws IOException {
|
||||
return warcParser.read();
|
||||
}
|
||||
|
||||
public void skipToEnd() throws IOException {
|
||||
getHeader();
|
||||
header.skipToEnd();
|
||||
getContent();
|
||||
content.skip(getContentLength());
|
||||
}
|
||||
}
|
|
@ -0,0 +1,177 @@
|
|||
/**
|
||||
* Copyright (C) 2012 Eric van der Vlist.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it under the terms of the
|
||||
* GNU Lesser General Public License as published by the Free Software Foundation; either version
|
||||
* 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
|
||||
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See the GNU Lesser General Public License for more details.
|
||||
*
|
||||
* The full text of the license is available at http://www.gnu.org/copyleft/lesser.html
|
||||
*/
|
||||
|
||||
package org.owark.warc;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.Iterator;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: vdv
|
||||
* Date: 25 avr. 2012
|
||||
* Time: 19:00:47
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class WarcRecordContent extends InputStream implements Iterator<WarcField> {
|
||||
|
||||
private WarcRecord warcRecord;
|
||||
private Exception e;
|
||||
private String line;
|
||||
|
||||
public WarcRecordContent(WarcRecord warcRecord) {
|
||||
this.warcRecord = warcRecord;
|
||||
}
|
||||
|
||||
public boolean hasFields() {
|
||||
return warcRecord.getContentType().equals("application/warc-fields") || isHTTP();
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
try {
|
||||
line = warcRecord.readLine();
|
||||
} catch (Exception e) {
|
||||
this.e = e;
|
||||
}
|
||||
return ! (warcRecord.isLimitReached() || line.equals(""));
|
||||
}
|
||||
|
||||
public WarcField next() {
|
||||
if (line == null) {
|
||||
try {
|
||||
line = warcRecord.readLine();
|
||||
} catch (Exception e) {
|
||||
this.e = e;
|
||||
}
|
||||
}
|
||||
if (line.equals("")) {
|
||||
line = null;
|
||||
return null;
|
||||
}
|
||||
WarcField field = new WarcField(line);
|
||||
line = null;
|
||||
return field;
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
//To change body of implemented methods use File | Settings | File Templates.
|
||||
}
|
||||
|
||||
@Override
|
||||
public int read() throws IOException {
|
||||
return warcRecord.read();
|
||||
}
|
||||
|
||||
public boolean isHTTP() {
|
||||
return warcRecord.getContentType().startsWith("application/http");
|
||||
}
|
||||
|
||||
public boolean isRequest() {
|
||||
return warcRecord.getType().equals("request");
|
||||
}
|
||||
|
||||
public HttpStatusLine getStatusLine() throws IOException, WarcParser.WarcException {
|
||||
return new HttpStatusLine(warcRecord.readLine());
|
||||
}
|
||||
|
||||
public boolean hasStatusLine() {
|
||||
return isHTTP() && ! isRequest();
|
||||
}
|
||||
|
||||
public boolean hasRequestLine() {
|
||||
return isHTTP() && isRequest();
|
||||
}
|
||||
|
||||
public Object endOfContent() {
|
||||
return warcRecord.isLimitReached();
|
||||
}
|
||||
|
||||
public HttpRequestLine getRequestLine() throws IOException, WarcParser.WarcException {
|
||||
return new HttpRequestLine(warcRecord.readLine());
|
||||
}
|
||||
|
||||
public long getContentLength() {
|
||||
return warcRecord.getContentLength();
|
||||
}
|
||||
|
||||
|
||||
public class HttpStatusLine {
|
||||
|
||||
private String line;
|
||||
private String version;
|
||||
private String status;
|
||||
private String reason;
|
||||
|
||||
|
||||
public String getLine() {
|
||||
return line;
|
||||
}
|
||||
|
||||
public String getVersion() {
|
||||
return version;
|
||||
}
|
||||
|
||||
public String getStatus() {
|
||||
return status;
|
||||
}
|
||||
|
||||
public String getReason() {
|
||||
return reason;
|
||||
}
|
||||
|
||||
|
||||
protected HttpStatusLine(String line) {
|
||||
this.line = line;
|
||||
String[] tokens = line.split(" ", 3);
|
||||
this.version = tokens[0];
|
||||
this.status = tokens[1];
|
||||
this.reason = tokens[2];
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public class HttpRequestLine {
|
||||
|
||||
private String line;
|
||||
private String version;
|
||||
private String method;
|
||||
private String uri;
|
||||
|
||||
public String getLine() {
|
||||
return line;
|
||||
}
|
||||
|
||||
public String getVersion() {
|
||||
return version;
|
||||
}
|
||||
|
||||
public String getMethod() {
|
||||
return method;
|
||||
}
|
||||
|
||||
public String getUri() {
|
||||
return uri;
|
||||
}
|
||||
|
||||
public HttpRequestLine(String line) {
|
||||
this.line = line;
|
||||
String[] tokens = line.split(" ", 3);
|
||||
this.method = tokens[0];
|
||||
this.uri = tokens[1];
|
||||
this.version = tokens[2];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,103 @@
|
|||
/**
|
||||
* Copyright (C) 2012 Eric van der Vlist.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it under the terms of the
|
||||
* GNU Lesser General Public License as published by the Free Software Foundation; either version
|
||||
* 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
|
||||
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See the GNU Lesser General Public License for more details.
|
||||
*
|
||||
* The full text of the license is available at http://www.gnu.org/copyleft/lesser.html
|
||||
*/
|
||||
package org.owark.warc;
|
||||
|
||||
import java.util.Hashtable;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: vdv
|
||||
* Date: 25 avr. 2012
|
||||
* Time: 17:50:01
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class WarcRecordHeader implements Iterator<WarcField> {
|
||||
|
||||
|
||||
public static String WARC_TYPE = "WARC-Type";
|
||||
public static String CONTENT_TYPE = "Content-Type";
|
||||
public static String CONTENT_LENGTH = "Content-Length";
|
||||
|
||||
private WarcRecord warcRecord;
|
||||
private String line;
|
||||
private Exception e;
|
||||
private Map<String,String> headers;
|
||||
private boolean endOfHeader = false;
|
||||
|
||||
|
||||
public WarcRecordHeader(WarcRecord warcRecord) {
|
||||
this.warcRecord = warcRecord;
|
||||
headers = new Hashtable<String, String>();
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
if (endOfHeader) {
|
||||
return false;
|
||||
}
|
||||
if (line == null) {
|
||||
try {
|
||||
line = warcRecord.readLine();
|
||||
} catch (Exception e) {
|
||||
this.e = e;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (line.equals("")) {
|
||||
endOfHeader = true;
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
public WarcField next() {
|
||||
if (endOfHeader) {
|
||||
return null;
|
||||
}
|
||||
if (line == null) {
|
||||
try {
|
||||
line = warcRecord.readLine();
|
||||
} catch (Exception e) {
|
||||
this.e = e;
|
||||
return null;
|
||||
}
|
||||
}
|
||||
WarcField item = new WarcField(line);
|
||||
line = null;
|
||||
headers.put(item.getKey(), item.getValue());
|
||||
return item;
|
||||
}
|
||||
|
||||
public String getType() {
|
||||
return headers.get(WARC_TYPE);
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
}
|
||||
|
||||
public String getContentType() {
|
||||
return headers.get(CONTENT_TYPE);
|
||||
}
|
||||
|
||||
public int getContentLength() {
|
||||
return Integer.parseInt(headers.get(CONTENT_LENGTH));
|
||||
}
|
||||
|
||||
public void skipToEnd() {
|
||||
while (hasNext()) {
|
||||
next();
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,297 @@
|
|||
/**
|
||||
* Copyright (C) 2012 Eric van der Vlist.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it under the terms of the
|
||||
* GNU Lesser General Public License as published by the Free Software Foundation; either version
|
||||
* 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
|
||||
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See the GNU Lesser General Public License for more details.
|
||||
*
|
||||
* The full text of the license is available at http://www.gnu.org/copyleft/lesser.html
|
||||
*/
|
||||
package org.owark.warc;
|
||||
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.*;
|
||||
|
||||
/**
|
||||
* Test cases for WarcParser
|
||||
*/
|
||||
public class WarcParserTest {
|
||||
|
||||
private static WarcParser warcParser;
|
||||
|
||||
@Test
|
||||
public void testDyomedea() throws IOException, WarcParser.WarcException {
|
||||
|
||||
// WARC
|
||||
|
||||
File file = new File("/home/vdv/projects/owark/archiver/java/test/org/owark/warc/dyomedea.warc");
|
||||
WarcParser warcParser = new WarcParser(new FileInputStream(file));
|
||||
Assert.assertEquals(true, warcParser.hasNext());
|
||||
|
||||
// RECORD
|
||||
|
||||
WarcRecord record = warcParser.next();
|
||||
Assert.assertEquals("WARC/1.0", warcParser.getMagic());
|
||||
Assert.assertNotNull(record);
|
||||
Assert.assertEquals("WARC/1.0", record.getMagic());
|
||||
|
||||
// HEADER
|
||||
|
||||
WarcRecordHeader header = record.getHeader();
|
||||
Assert.assertNotNull(header);
|
||||
Assert.assertNull(header.getType());
|
||||
Assert.assertEquals(true, header.hasNext());
|
||||
WarcField headerItem = header.next();
|
||||
Assert.assertNotNull(headerItem);
|
||||
Assert.assertEquals(WarcRecordHeader.WARC_TYPE, headerItem.getKey());
|
||||
Assert.assertEquals("warcinfo", headerItem.getValue());
|
||||
Assert.assertEquals("warcinfo", header.getType());
|
||||
Assert.assertEquals("warcinfo", record.getType());
|
||||
Assert.assertEquals(true, header.hasNext());
|
||||
headerItem = header.next();
|
||||
Assert.assertNotNull(headerItem);
|
||||
Assert.assertEquals("WARC-Date", headerItem.getKey());
|
||||
Assert.assertEquals("2012-04-23T10:05:24Z", headerItem.getValue());
|
||||
headerItem = header.next();
|
||||
headerItem = header.next();
|
||||
headerItem = header.next();
|
||||
headerItem = header.next();
|
||||
Assert.assertNotNull(headerItem);
|
||||
Assert.assertEquals("Content-Length", headerItem.getKey());
|
||||
Assert.assertEquals("369", headerItem.getValue());
|
||||
Assert.assertEquals(false, header.hasNext());
|
||||
headerItem = header.next();
|
||||
Assert.assertNull(headerItem);
|
||||
Assert.assertEquals("application/warc-fields", record.getContentType());
|
||||
Assert.assertEquals(369, record.getContentLength());
|
||||
|
||||
// Content
|
||||
|
||||
WarcRecordContent content = record.getContent();
|
||||
Assert.assertNotNull(content);
|
||||
Assert.assertEquals(true, content.hasFields());
|
||||
Assert.assertEquals(false, content.isHTTP());
|
||||
Assert.assertEquals(false, content.hasStatusLine());
|
||||
Assert.assertEquals(false, content.hasRequestLine());
|
||||
Assert.assertEquals(true, content.hasNext());
|
||||
WarcField field = content.next();
|
||||
Assert.assertEquals(false, content.endOfContent());
|
||||
Assert.assertNotNull(field);
|
||||
Assert.assertEquals("software", field.getKey());
|
||||
Assert.assertEquals("Heritrix/3.1.0 http://crawler.archive.org", field.getValue());
|
||||
field = content.next();
|
||||
field = content.next();
|
||||
field = content.next();
|
||||
field = content.next();
|
||||
field = content.next();
|
||||
field = content.next();
|
||||
field = content.next();
|
||||
field = content.next();
|
||||
Assert.assertNotNull(field);
|
||||
Assert.assertEquals("http-header-user-agent", field.getKey());
|
||||
Assert.assertEquals("Mozilla/5.0 (compatible; heritrix/3.1.0 +http://owark.org)", field.getValue());
|
||||
Assert.assertEquals(false, content.hasNext());
|
||||
Assert.assertEquals(true, content.endOfContent());
|
||||
|
||||
// Next record
|
||||
|
||||
Assert.assertEquals(true, warcParser.hasNext());
|
||||
record = warcParser.next();
|
||||
Assert.assertNotNull(record);
|
||||
|
||||
// Header
|
||||
|
||||
header = record.getHeader();
|
||||
Assert.assertNotNull(header);
|
||||
Assert.assertNull(header.getType());
|
||||
Assert.assertEquals(true, header.hasNext());
|
||||
headerItem = header.next();
|
||||
Assert.assertNotNull(headerItem);
|
||||
Assert.assertEquals(WarcRecordHeader.WARC_TYPE, headerItem.getKey());
|
||||
Assert.assertEquals("response", headerItem.getValue());
|
||||
|
||||
header.skipToEnd();
|
||||
|
||||
// Content
|
||||
|
||||
content = record.getContent();
|
||||
Assert.assertNotNull(content);
|
||||
Assert.assertEquals(false, content.hasFields());
|
||||
Assert.assertEquals(false, content.isHTTP());
|
||||
Assert.assertEquals(false, content.hasStatusLine());
|
||||
Assert.assertEquals(false, content.hasRequestLine());
|
||||
Assert.assertEquals(false, content.endOfContent());
|
||||
BufferedReader reader = new BufferedReader(new InputStreamReader(content, "UTF-8"));
|
||||
String line = reader.readLine();
|
||||
Assert.assertEquals("20120423100524", line);
|
||||
line = reader.readLine();
|
||||
Assert.assertEquals("dyomedea.com.\t\t1800\tIN\tA\t95.142.167.137", line);
|
||||
line = reader.readLine();
|
||||
Assert.assertEquals(true, content.endOfContent());
|
||||
Assert.assertNull(line);
|
||||
|
||||
// Next record
|
||||
|
||||
Assert.assertEquals(true, warcParser.hasNext());
|
||||
record = warcParser.next();
|
||||
Assert.assertNotNull(record);
|
||||
|
||||
// Header
|
||||
|
||||
header = record.getHeader();
|
||||
Assert.assertNotNull(header);
|
||||
Assert.assertNull(header.getType());
|
||||
Assert.assertEquals(true, header.hasNext());
|
||||
headerItem = header.next();
|
||||
Assert.assertNotNull(headerItem);
|
||||
Assert.assertEquals(WarcRecordHeader.WARC_TYPE, headerItem.getKey());
|
||||
Assert.assertEquals("response", headerItem.getValue());
|
||||
|
||||
header.skipToEnd();
|
||||
|
||||
// Content
|
||||
|
||||
content = record.getContent();
|
||||
Assert.assertNotNull(content);
|
||||
Assert.assertEquals(true, content.hasFields());
|
||||
Assert.assertEquals(true, content.isHTTP());
|
||||
Assert.assertEquals(false, content.isRequest());
|
||||
Assert.assertEquals(true, content.hasStatusLine());
|
||||
Assert.assertEquals(false, content.hasRequestLine());
|
||||
WarcRecordContent.HttpStatusLine status = content.getStatusLine();
|
||||
Assert.assertNotNull(status);
|
||||
Assert.assertEquals("HTTP/1.1 404 Introuvable", status.getLine());
|
||||
Assert.assertEquals("HTTP/1.1", status.getVersion());
|
||||
Assert.assertEquals("404", status.getStatus());
|
||||
Assert.assertEquals("Introuvable", status.getReason());
|
||||
field = content.next();
|
||||
Assert.assertNotNull(field);
|
||||
Assert.assertEquals("Date", field.getKey());
|
||||
Assert.assertEquals("Mon, 23 Apr 2012 10:05:27 GMT", field.getValue());
|
||||
field = content.next();
|
||||
field = content.next();
|
||||
field = content.next();
|
||||
field = content.next();
|
||||
field = content.next();
|
||||
field = content.next();
|
||||
Assert.assertNotNull(field);
|
||||
Assert.assertEquals("Connection", field.getKey());
|
||||
Assert.assertEquals("close", field.getValue());
|
||||
Assert.assertEquals(false, content.hasNext());
|
||||
Assert.assertEquals(false, content.endOfContent());
|
||||
reader = new BufferedReader(new InputStreamReader(content, "UTF-8"));
|
||||
line = reader.readLine();
|
||||
Assert.assertEquals("<html><head><title>Apache Tomcat/6.0.24 - Rapport d'erreur</title>", line.substring(0, line.indexOf("<style>")));
|
||||
line = reader.readLine();
|
||||
Assert.assertNull(line);
|
||||
Assert.assertEquals(true, content.endOfContent());
|
||||
|
||||
|
||||
// Next record
|
||||
|
||||
Assert.assertEquals(true, warcParser.hasNext());
|
||||
record = warcParser.next();
|
||||
Assert.assertNotNull(record);
|
||||
|
||||
// Header
|
||||
|
||||
header = record.getHeader();
|
||||
Assert.assertNotNull(header);
|
||||
Assert.assertNull(header.getType());
|
||||
Assert.assertEquals(true, header.hasNext());
|
||||
headerItem = header.next();
|
||||
Assert.assertNotNull(headerItem);
|
||||
Assert.assertEquals(WarcRecordHeader.WARC_TYPE, headerItem.getKey());
|
||||
Assert.assertEquals("request", headerItem.getValue());
|
||||
|
||||
header.skipToEnd();
|
||||
|
||||
// Content
|
||||
|
||||
content = record.getContent();
|
||||
Assert.assertNotNull(content);
|
||||
Assert.assertEquals(true, content.hasFields());
|
||||
Assert.assertEquals(true, content.isHTTP());
|
||||
Assert.assertEquals(true, content.isRequest());
|
||||
Assert.assertEquals(false, content.hasStatusLine());
|
||||
Assert.assertEquals(true, content.hasRequestLine());
|
||||
WarcRecordContent.HttpRequestLine request = content.getRequestLine();
|
||||
Assert.assertEquals("GET /robots.txt HTTP/1.0", request.getLine());
|
||||
Assert.assertEquals("GET", request.getMethod());
|
||||
Assert.assertEquals("/robots.txt", request.getUri());
|
||||
Assert.assertEquals("HTTP/1.0", request.getVersion());
|
||||
field = content.next();
|
||||
Assert.assertNotNull(field);
|
||||
Assert.assertEquals("User-Agent", field.getKey());
|
||||
Assert.assertEquals("Mozilla/5.0 (compatible; heritrix/3.1.0 +http://owark.org)", field.getValue());
|
||||
field = content.next();
|
||||
field = content.next();
|
||||
field = content.next();
|
||||
Assert.assertNotNull(field);
|
||||
Assert.assertEquals("Host", field.getKey());
|
||||
Assert.assertEquals("dyomedea.com", field.getValue());
|
||||
Assert.assertEquals(false, content.hasNext());
|
||||
Assert.assertEquals(true, content.endOfContent());
|
||||
|
||||
|
||||
// Skip record
|
||||
|
||||
Assert.assertEquals(true, warcParser.hasNext());
|
||||
record = warcParser.next();
|
||||
Assert.assertNotNull(record);
|
||||
record.skipToEnd();
|
||||
Assert.assertEquals(true, warcParser.hasNext());
|
||||
record = warcParser.next();
|
||||
|
||||
// Header
|
||||
|
||||
header = record.getHeader();
|
||||
Assert.assertNotNull(header);
|
||||
Assert.assertNull(header.getType());
|
||||
Assert.assertEquals(true, header.hasNext());
|
||||
headerItem = header.next();
|
||||
Assert.assertNotNull(headerItem);
|
||||
Assert.assertEquals(WarcRecordHeader.WARC_TYPE, headerItem.getKey());
|
||||
Assert.assertEquals("response", headerItem.getValue());
|
||||
record.skipToEnd();
|
||||
|
||||
// Go to last record
|
||||
|
||||
while (warcParser.hasNext()) {
|
||||
record = warcParser.next();
|
||||
Assert.assertNotNull(record);
|
||||
record.skipToEnd();
|
||||
}
|
||||
|
||||
Assert.assertEquals(69, warcParser.getRecordCount());
|
||||
Assert.assertEquals("metadata", record.getType());
|
||||
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void skipToEnd() throws IOException, WarcParser.WarcException {
|
||||
File file = new File("/home/vdv/projects/owark/archiver/java/test/org/owark/warc/dyomedea.warc");
|
||||
WarcParser warcParser = new WarcParser(new FileInputStream(file));
|
||||
Assert.assertEquals(true, warcParser.hasNext());
|
||||
WarcRecord record = warcParser.next();
|
||||
WarcRecordHeader header = record.getHeader();
|
||||
while (header.hasNext()) {
|
||||
Assert.assertNotNull(header.next());
|
||||
}
|
||||
WarcRecordContent content = record.getContent();
|
||||
while (content.hasNext()) {
|
||||
Assert.assertNotNull(content.next());
|
||||
}
|
||||
record.skipToEnd();
|
||||
|
||||
|
||||
}
|
||||
|
||||
} ;
|
File diff suppressed because it is too large
Load Diff
|
@ -7,7 +7,7 @@
|
|||
|
||||
<p:config xmlns:p="http://www.orbeon.com/oxf/pipeline" xmlns:oxf="http://www.orbeon.com/oxf/processors" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:xforms="http://www.w3.org/2002/xforms"
|
||||
xmlns:xxforms="http://orbeon.org/oxf/xml/xforms" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:exist="http://exist.sourceforge.net/NS/exist" xmlns:saxon="http://saxon.sf.net/"
|
||||
xmlns:pipeline="java:org.orbeon.oxf.processor.pipeline.PipelineFunctionLibrary">
|
||||
xmlns:pipeline="java:org.orbeon.oxf.processor.pipeline.PipelineFunctionLibrary" xmlns:owk="http://owark.org/orbeon/processors">
|
||||
|
||||
<p:param name="data" type="input"/>
|
||||
|
||||
|
@ -33,6 +33,15 @@
|
|||
</p:input>
|
||||
<p:output name="data" id="warc"/>
|
||||
</p:processor>
|
||||
|
||||
<p:processor name="owk:from-warc-converter">
|
||||
<p:input name="data" href="#warc"/>
|
||||
<p:output name="data" id="warc-xml" debug="warc-xml"/>
|
||||
</p:processor>
|
||||
|
||||
<p:processor name="oxf:null-serializer">
|
||||
<p:input name="data" href="#warc-xml"/>
|
||||
</p:processor>
|
||||
|
||||
<!-- Store it in a temp file -->
|
||||
<p:processor name="oxf:file-serializer">
|
||||
|
|
|
@ -0,0 +1,6 @@
|
|||
<processors xmlns:owk="http://owark.org/orbeon/processors">
|
||||
<!-- Maps the owk:from-warc-converter processor name to its Java implementation class -->
<processor name="owk:from-warc-converter">
|
||||
<class name="org.owark.orbeon.FromWarcConverter"/>
|
||||
</processor>
|
||||
</processors>
|
||||
|
Loading…
Reference in New Issue