Implementing yet another WARC parser (the Heritrix one didn't work well with Orbeon due to HTTP client library conflicts).

This commit is contained in:
Eric van der Vlist 2012-04-26 09:48:43 +02:00
parent 307b6d2a72
commit be1a361ab9
11 changed files with 3974 additions and 1 deletions

45
archiver/build.xml Normal file
View File

@ -0,0 +1,45 @@
<project name="owark" default="dist" basedir=".">
    <description>
        Owark build file
    </description>
    <!-- set global properties for this build -->
    <property name="src" location="java/src"/>
    <property name="build" location="build"/>
    <property name="dist" location="dist"/>
    <!-- Directory holding the Orbeon Forms jars needed at compile time.
         The default is the original developer's checkout; on another machine
         override it with: ant -Dorbeon.lib=/path/to/orbeon-war/WEB-INF/lib -->
    <property name="orbeon.lib" location="/home/vdv/projects/orbeon-forms/build/orbeon-war/WEB-INF/lib"/>
    <target name="init">
        <!-- Create the time stamp -->
        <tstamp/>
        <!-- Create the build directory structure used by compile -->
        <mkdir dir="${build}"/>
    </target>
    <target name="compile" depends="init"
            description="compile the source " >
        <!-- Compile the java code from ${src} into ${build} -->
        <javac srcdir="${src}" destdir="${build}">
            <classpath>
                <pathelement location="java/lib/heritrix-commons-3.1.0.jar"/>
                <pathelement location="java/lib/archive-overlay-commons-httpclient-3.1.jar"/>
                <pathelement location="${orbeon.lib}/commons-fileupload-1.2.2.jar"/>
                <pathelement location="${orbeon.lib}/orbeon.jar"/>
            </classpath>
        </javac>
    </target>
    <target name="dist" depends="compile"
            description="generate the distribution" >
        <!-- Create the distribution directory -->
        <mkdir dir="${dist}/lib"/>
        <!-- Put everything in ${build} into ${dist}/lib/owark.jar -->
        <jar jarfile="${dist}/lib/owark.jar" basedir="${build}"/>
    </target>
    <target name="clean"
            description="clean up" >
        <!-- Delete the ${build} and ${dist} directory trees -->
        <delete dir="${build}"/>
        <delete dir="${dist}"/>
    </target>
</project>

View File

@ -0,0 +1,117 @@
/**
* Copyright (C) 2012 Eric van der Vlist.
*
* This program is free software; you can redistribute it and/or modify it under the terms of the
* GNU Lesser General Public License as published by the Free Software Foundation; either version
* 2.1 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU Lesser General Public License for more details.
*
* The full text of the license is available at http://www.gnu.org/copyleft/lesser.html
*/
package org.owark.orbeon;
import org.apache.commons.fileupload.FileItem;
import org.orbeon.oxf.pipeline.api.PipelineContext;
import org.orbeon.oxf.pipeline.api.XMLReceiver;
import org.orbeon.oxf.processor.ProcessorImpl;
import org.orbeon.oxf.processor.ProcessorInputOutputInfo;
import org.orbeon.oxf.processor.ProcessorOutput;
import org.orbeon.oxf.processor.serializer.BinaryTextXMLReceiver;
import org.orbeon.oxf.util.NetUtils;
import org.orbeon.oxf.xml.ContentHandlerHelper;
import org.owark.warc.*;
import java.io.IOException;
/**
 * Orbeon processor that converts a WARC archive into an XML representation.
 *
 * <p>The document read from the <code>data</code> input is written to a {@link FileItem}
 * obtained from {@link NetUtils#prepareFileItem}, parsed with {@link WarcParser} and sent
 * to the <code>data</code> output with the following shape:</p>
 * <pre>
 * &lt;warc&gt;
 *   &lt;record&gt;
 *     &lt;headers&gt;&lt;header name="..."&gt;...&lt;/header&gt; ...&lt;/headers&gt;
 *     &lt;content&gt;
 *       &lt;request&gt; or &lt;status&gt;   (HTTP records only)
 *       &lt;headers&gt;...&lt;/headers&gt;   (records whose content carries fields)
 *     &lt;/content&gt;
 *   &lt;/record&gt;
 *   ...
 * &lt;/warc&gt;
 * </pre>
 * <p>Whatever follows the fields in a record's content is skipped, not serialized.</p>
 */
public class FromWarcConverter extends ProcessorImpl {

    public static final String WARC_ELEMENT_ROOT_NAME = "warc";
    public static final String RECORD_ELEMENT_NAME = "record";
    public static final String HEADERS_ELEMENT_NAME = "headers";
    public static final String HEADER_ELEMENT_NAME = "header";
    public static final String NAME_ATTRIBUTE_NAME = "name";
    public static final String CONTENT_ELEMENT_NAME = "content";

    /**
     * Declares the single <code>data</code> input and the single <code>data</code> output.
     */
    public FromWarcConverter() {
        addInputInfo(new ProcessorInputOutputInfo(INPUT_DATA));
        addOutputInfo(new ProcessorInputOutputInfo(OUTPUT_DATA));
    }

    /**
     * Creates the output that performs the WARC to XML conversion when it is read.
     *
     * @param outputName name of the output to create
     * @return the newly registered output
     */
    @Override
    public ProcessorOutput createOutput(String outputName) {
        final ProcessorOutput output = new ProcessorOutputImpl(FromWarcConverter.this, outputName) {
            @Override
            protected void readImpl(PipelineContext pipelineContext, XMLReceiver xmlReceiver) {
                java.io.InputStream archive = null;
                try {
                    final ContentHandlerHelper helper = new ContentHandlerHelper(xmlReceiver);
                    helper.startDocument();
                    helper.startElement(WARC_ELEMENT_ROOT_NAME);
                    // Copy the binary input into a file item so that it can be re-read as a stream
                    final FileItem fileItem = NetUtils.prepareFileItem(NetUtils.REQUEST_SCOPE);
                    readInputAsSAX(pipelineContext, INPUT_DATA, new BinaryTextXMLReceiver(null, fileItem.getOutputStream(), true, false, null, false, false, null, false));
                    // Parse the stored bytes as a WARC archive, one record at a time
                    archive = fileItem.getInputStream();
                    final WarcParser warcParser = new WarcParser(archive);
                    while (warcParser.hasNext()) {
                        writeRecord(helper, warcParser.next());
                    }
                    helper.endElement();
                    helper.endDocument();
                } catch (RuntimeException e) {
                    throw e;
                } catch (Exception e) {
                    // Do not swallow the failure: a silently truncated SAX stream is worse
                    // than a pipeline error carrying the original cause.
                    throw new RuntimeException("Unable to convert the WARC archive to XML", e);
                } finally {
                    if (archive != null) {
                        try {
                            archive.close();
                        } catch (IOException ignored) {
                            // Nothing useful can be done if closing the temporary stream fails
                        }
                    }
                }
            }
        };
        addOutput(outputName, output);
        return output;
    }

    /**
     * Serializes one WARC record as a <code>record</code> element.
     *
     * Declared with <code>throws Exception</code> because the parser's checked exception
     * type is not visible from this package.
     */
    private static void writeRecord(ContentHandlerHelper helper, WarcRecord record) throws Exception {
        helper.startElement(RECORD_ELEMENT_NAME);

        // Record level named fields
        helper.startElement(HEADERS_ELEMENT_NAME);
        writeFields(helper, record.getHeader());
        helper.endElement();

        helper.startElement(CONTENT_ELEMENT_NAME);
        final WarcRecordContent content = record.getContent();
        if (content.hasRequestLine()) {
            helper.startElement("request");
            final WarcRecordContent.HttpRequestLine request = content.getRequestLine();
            helper.element("method", request.getMethod());
            helper.element("uri", request.getUri());
            helper.element("version", request.getVersion());
            helper.endElement();
        } else if (content.hasStatusLine()) {
            helper.startElement("status");
            final WarcRecordContent.HttpStatusLine status = content.getStatusLine();
            helper.element("version", status.getVersion());
            helper.element("status", status.getStatus());
            helper.element("reason", status.getReason());
            helper.endElement();
        }
        if (content.hasFields()) {
            helper.startElement(HEADERS_ELEMENT_NAME);
            writeFields(helper, content);
            helper.endElement();
        }
        // Consume the rest of the record so that the parser can look for the next one
        record.skipToEnd();
        helper.endElement(); // content
        helper.endElement(); // record
    }

    /**
     * Serializes every remaining field of the iterator as a <code>header</code> element
     * whose <code>name</code> attribute is the field key and whose text is the field value.
     */
    private static void writeFields(ContentHandlerHelper helper, java.util.Iterator<WarcField> fields) {
        while (fields.hasNext()) {
            final WarcField field = fields.next();
            helper.startElement(HEADER_ELEMENT_NAME, new String[] {NAME_ATTRIBUTE_NAME, field.getKey()});
            helper.text(field.getValue());
            helper.endElement();
        }
    }
}

View File

@ -0,0 +1,49 @@
/**
* Copyright (C) 2012 Eric van der Vlist.
*
* This program is free software; you can redistribute it and/or modify it under the terms of the
* GNU Lesser General Public License as published by the Free Software Foundation; either version
* 2.1 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU Lesser General Public License for more details.
*
* The full text of the license is available at http://www.gnu.org/copyleft/lesser.html
*/
package org.owark.warc;
/**
 * A single <code>name: value</code> field, as found in WARC record headers,
 * <code>application/warc-fields</code> payloads and HTTP header blocks.
 *
 * <p>The key is the text before the first colon and the value the text after it, both
 * trimmed. A line that contains no colon is kept as the key with an empty value instead
 * of failing, so that one malformed line does not abort the parsing of a whole archive.</p>
 */
public class WarcField {

    private final String line;
    private final String key;
    private final String value;

    /**
     * Parses a field line.
     *
     * @param line the raw line, without its line terminator; must not be null
     */
    public WarcField(String line) {
        this.line = line;
        final int sep = line.indexOf(':');
        if (sep < 0) {
            // No separator: substring(0, -1) would throw, so degrade gracefully
            this.key = line.trim();
            this.value = "";
        } else {
            this.key = line.substring(0, sep).trim();
            // Only the first colon separates; later ones (URIs, dates) belong to the value
            this.value = line.substring(sep + 1).trim();
        }
    }

    /**
     * @return the trimmed field name
     */
    public String getKey() {
        return key;
    }

    /**
     * @return the raw line this field was built from
     */
    public String getLine() {
        return line;
    }

    /**
     * @return the trimmed field value, empty when the line has no separator
     */
    public String getValue() {
        return value;
    }
}

View File

@ -0,0 +1,123 @@
/**
* Copyright (C) 2012 Eric van der Vlist.
*
* This program is free software; you can redistribute it and/or modify it under the terms of the
* GNU Lesser General Public License as published by the Free Software Foundation; either version
* 2.1 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU Lesser General Public License for more details.
*
* The full text of the license is available at http://www.gnu.org/copyleft/lesser.html
*/
package org.owark.warc;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.util.Iterator;
/**
 * Streaming reader for WARC files.
 *
 * <p>The parser is an iterator over the records of an archive. {@link #hasNext()} scans
 * forward to the next line starting with {@link #MAGIC} and {@link #next()} hands out a
 * {@link WarcRecord} that reads its header and content from this parser. Records share
 * the underlying stream, so a record must be consumed before the next one is requested.</p>
 *
 * <p>While a record's content is being read, {@link #setLimit(int)} caps the number of
 * bytes {@link #read()} will deliver; a limit of <code>-1</code> means "no limit".</p>
 *
 * <p>Instances are not safe for use by several threads.</p>
 */
public class WarcParser implements Iterator<WarcRecord> {

    /** Initial size, in bytes, of the line buffer; the buffer grows when a line is longer. */
    public static int BUFFER_SIZE = 1024;
    public static final String CRLF = "\r\n";
    public static final String CRLFCRLF = CRLF + CRLF;
    /** Prefix of the version line that starts every record. */
    public static final String MAGIC = "WARC/";

    /** Longest line accepted, so that a corrupt archive cannot exhaust the heap. */
    private static final int MAX_LINE_LENGTH = 1 << 20;

    private final InputStream is;
    private byte[] buffer = new byte[BUFFER_SIZE];
    /** Number of bytes of the current line held in {@link #buffer}. */
    private int index = 0;
    /** Bytes still readable in the current content block, or -1 when unlimited. */
    private int limit = -1;
    private String magic;
    private int recordCount;
    /** True when hasNext() has found a record that next() has not handed out yet. */
    private boolean recordPending = false;

    /**
     * @param is stream positioned at the beginning of a WARC archive
     */
    public WarcParser(InputStream is) {
        // Bytes are read one at a time, so make sure the source is buffered
        this.is = (is instanceof BufferedInputStream) ? is : new BufferedInputStream(is);
        resetBuffer();
    }

    /**
     * @return the version line (for instance <code>WARC/1.0</code>) of the last record found
     */
    public String getMagic() throws IOException, WarcException {
        return this.magic;
    }

    private void resetBuffer() {
        index = 0;
    }

    /** Appends one byte to the line buffer, growing it up to MAX_LINE_LENGTH. */
    private void append(byte b) throws WarcException {
        if (index == buffer.length) {
            if (index >= MAX_LINE_LENGTH) {
                throw new BufferOverflowException();
            }
            final int grown = Math.min(MAX_LINE_LENGTH, Math.max(16, buffer.length * 2));
            buffer = java.util.Arrays.copyOf(buffer, grown);
        }
        buffer[index++] = b;
    }

    /** Tells whether the bytes buffered so far end with the given ASCII pattern. */
    private boolean bufferEndsWith(String stringPattern) {
        final int length = stringPattern.length();
        if (index < length) {
            return false;
        }
        for (int i = 0; i < length; i++) {
            if (buffer[index - length + i] != (byte) stringPattern.charAt(i)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Buffers bytes until the buffer ends with the pattern or the content limit is reached.
     *
     * Written as a loop (the buffer is checked after every byte) so that long lines cannot
     * overflow the call stack and overlapping input such as "\r\r\n" is still matched.
     *
     * @throws java.io.EOFException if the stream ends before the pattern is found
     * @throws BufferOverflowException if the line exceeds MAX_LINE_LENGTH
     */
    private void readUntil(String stringPattern) throws IOException, WarcException {
        if (stringPattern.length() == 0) {
            return;
        }
        while (limit != 0) {
            final int c = read();
            if (c < 0) {
                throw new java.io.EOFException("End of stream reached while reading a WARC line");
            }
            append((byte) c);
            if (bufferEndsWith(stringPattern)) {
                return;
            }
        }
    }

    /**
     * Reads one CRLF terminated line and returns it without its terminator.
     *
     * When the content limit is reached before a terminator is seen, the bytes read so far
     * (possibly none) are returned as they are.
     *
     * @return the line decoded as UTF-8
     * @throws IOException if the stream fails or ends before a full line could be read
     * @throws WarcException if the line is unreasonably long
     */
    protected String readLine() throws IOException, WarcException {
        // Start clean even if a previous call was interrupted by an exception
        resetBuffer();
        readUntil(CRLF);
        final int length = bufferEndsWith(CRLF) ? index - CRLF.length() : index;
        final String line = new String(buffer, 0, length, "UTF-8");
        resetBuffer();
        return line;
    }

    /**
     * Scans forward to the next record.
     *
     * Calling this method again before {@link #next()} does not skip a record. Any read
     * failure, including the end of the stream, is reported as "no more records".
     *
     * @return true if a line starting with {@link #MAGIC} was found
     */
    public boolean hasNext() {
        if (recordPending) {
            return true;
        }
        // Whatever limit the previous record installed no longer applies
        limit = -1;
        String candidate;
        do {
            try {
                candidate = readLine();
            } catch (IOException e) {
                return false;
            } catch (WarcException e) {
                return false;
            }
        } while (!candidate.startsWith(MAGIC));
        magic = candidate;
        recordPending = true;
        return true;
    }

    /**
     * @return the record whose version line was found by {@link #hasNext()}
     * @throws java.util.NoSuchElementException if the archive has no more records
     */
    public WarcRecord next() {
        if (!hasNext()) {
            throw new java.util.NoSuchElementException("No more WARC records");
        }
        recordPending = false;
        recordCount++;
        return new WarcRecord(this);
    }

    /**
     * Not supported: the archive is read only.
     *
     * @throws UnsupportedOperationException always
     */
    public void remove() {
        throw new UnsupportedOperationException("WARC records cannot be removed");
    }

    /**
     * @param limit number of bytes {@link #read()} may still deliver, or -1 for no limit
     */
    public void setLimit(int limit) {
        this.limit = limit;
    }

    /**
     * @return true when the current limit has been used up
     */
    public boolean isLimitReached() {
        return limit == 0;
    }

    /**
     * Reads one byte, honouring the current limit.
     *
     * @return the byte, or -1 when the limit is reached or the stream is exhausted
     */
    public int read() throws IOException {
        if (limit == 0) {
            return -1;
        }
        if (limit > 0) {
            limit--;
        }
        return is.read();
    }

    /**
     * @return the number of records handed out by {@link #next()} so far
     */
    public int getRecordCount() {
        return recordCount;
    }

    /** Base class of the checked exceptions raised by the parser. */
    class WarcException extends Exception {}

    /** Raised when a single line is longer than the parser accepts. */
    class BufferOverflowException extends WarcException {}

    /** Reserved for archives that do not start with a valid version line. */
    class BadMagicException extends WarcException {}
}

View File

@ -0,0 +1,84 @@
/**
* Copyright (C) 2012 Eric van der Vlist.
*
* This program is free software; you can redistribute it and/or modify it under the terms of the
* GNU Lesser General Public License as published by the Free Software Foundation; either version
* 2.1 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU Lesser General Public License for more details.
*
* The full text of the license is available at http://www.gnu.org/copyleft/lesser.html
*/
package org.owark.warc;
import java.io.IOException;
/**
 * One record of a WARC archive.
 *
 * <p>A record is a thin view over the {@link WarcParser} that created it: its header and
 * content are read lazily from the parser's stream, so the header must be consumed before
 * the content, and the record before the parser moves on to the next one.</p>
 */
public class WarcRecord {

    private final WarcParser warcParser;
    private WarcRecordHeader header;
    private WarcRecordContent content;

    /**
     * @param warcParser parser positioned just after this record's version line
     */
    public WarcRecord(WarcParser warcParser) {
        this.warcParser = warcParser;
    }

    /**
     * @return the version line of the record, as reported by the parser
     */
    public Object getMagic() throws IOException, WarcParser.WarcException {
        return warcParser.getMagic();
    }

    /**
     * @return the header of this record, created on first access
     */
    public WarcRecordHeader getHeader() {
        if (header == null) {
            header = new WarcRecordHeader(this);
        }
        return header;
    }

    /**
     * Reads the next line of the underlying archive.
     */
    public String readLine() throws IOException, WarcParser.WarcException {
        return warcParser.readLine();
    }

    /**
     * @return the record type read so far from the header, or null if not read yet
     */
    public String getType() {
        // Go through getHeader() so that asking early does not fail with a NullPointerException
        return getHeader().getType();
    }

    /**
     * @return the content type read so far from the header, or null if not read yet
     */
    public String getContentType() {
        return getHeader().getContentType();
    }

    /**
     * Gives access to the content block, reading any header line still pending first.
     *
     * @return the content of this record, created on first access
     */
    public WarcRecordContent getContent() {
        if (content == null) {
            // The content length is only known once the whole header has been read
            getHeader().skipToEnd();
            warcParser.setLimit(getContentLength());
            content = new WarcRecordContent(this);
        }
        return content;
    }

    /**
     * @return the declared length of the content block, in bytes
     */
    public int getContentLength() {
        return getHeader().getContentLength();
    }

    /**
     * @return true when the whole content block has been read
     */
    public boolean isLimitReached() {
        return warcParser.isLimitReached();
    }

    /**
     * Reads one byte of the underlying archive, honouring the parser's current limit.
     */
    public int read() throws IOException {
        return warcParser.read();
    }

    /**
     * Consumes whatever is left of the header and of the content block.
     */
    public void skipToEnd() throws IOException {
        getHeader().skipToEnd();
        // skip() stops by itself once the content limit is reached
        getContent().skip(getContentLength());
    }
}

View File

@ -0,0 +1,177 @@
/**
* Copyright (C) 2012 Eric van der Vlist.
*
* This program is free software; you can redistribute it and/or modify it under the terms of the
* GNU Lesser General Public License as published by the Free Software Foundation; either version
* 2.1 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU Lesser General Public License for more details.
*
* The full text of the license is available at http://www.gnu.org/copyleft/lesser.html
*/
package org.owark.warc;
import java.io.IOException;
import java.io.InputStream;
import java.util.Iterator;
/**
 * Content block of a WARC record.
 *
 * <p>The content can be consumed in two complementary ways: as an {@link InputStream} of
 * raw bytes, and, for records whose content starts with <code>name: value</code> lines
 * (WARC fields or an HTTP message), as an iterator over those fields. For HTTP messages
 * the request or status line must be read first, then the fields, then the body bytes.</p>
 *
 * <p>A line read ahead by {@link #hasNext()} is only returned by {@link #next()}; it is
 * not replayed by {@link #read()}.</p>
 */
public class WarcRecordContent extends InputStream implements Iterator<WarcField> {

    private final WarcRecord warcRecord;
    /** Last exception met while reading a field line, kept for debugging. */
    private Exception e;
    /** Field line read ahead by hasNext() and not yet returned by next(). */
    private String line;
    /** Set once the blank line, the end of the content or a read failure has been met. */
    private boolean endOfFields = false;

    /**
     * @param warcRecord record this content belongs to
     */
    public WarcRecordContent(WarcRecord warcRecord) {
        this.warcRecord = warcRecord;
    }

    /**
     * @return true if the content starts with fields (WARC fields or an HTTP message)
     */
    public boolean hasFields() {
        return "application/warc-fields".equals(warcRecord.getContentType()) || isHTTP();
    }

    /**
     * Tells whether another field is available, reading one line ahead if needed.
     *
     * The field block ends at the first blank line, at the end of the content or at the
     * first read failure. Calling this method repeatedly does not consume extra lines.
     */
    public boolean hasNext() {
        if (endOfFields) {
            return false;
        }
        if (line == null) {
            if (warcRecord.isLimitReached()) {
                endOfFields = true;
                return false;
            }
            try {
                line = warcRecord.readLine();
            } catch (Exception e) {
                this.e = e;
                endOfFields = true;
                return false;
            }
        }
        if (line.equals("")) {
            // Blank separator line (or nothing left): the fields are over
            endOfFields = true;
            line = null;
            return false;
        }
        return true;
    }

    /**
     * @return the next field, or null when there are no more fields
     */
    public WarcField next() {
        if (!hasNext()) {
            return null;
        }
        final WarcField field = new WarcField(line);
        line = null;
        return field;
    }

    /**
     * Not supported: the archive is read only.
     *
     * @throws UnsupportedOperationException always
     */
    public void remove() {
        throw new UnsupportedOperationException("WARC fields cannot be removed");
    }

    /**
     * Reads one byte of content.
     *
     * @return the byte, or -1 once the declared content length has been consumed
     */
    @Override
    public int read() throws IOException {
        return warcRecord.read();
    }

    /**
     * @return true if the record's content type starts with <code>application/http</code>
     */
    public boolean isHTTP() {
        final String contentType = warcRecord.getContentType();
        return contentType != null && contentType.startsWith("application/http");
    }

    /**
     * @return true if the record's type is <code>request</code>
     */
    public boolean isRequest() {
        return "request".equals(warcRecord.getType());
    }

    /**
     * Reads the next line of content and parses it as an HTTP status line.
     */
    public HttpStatusLine getStatusLine() throws IOException, WarcParser.WarcException {
        return new HttpStatusLine(warcRecord.readLine());
    }

    /**
     * @return true if the content is an HTTP message that is not a request
     */
    public boolean hasStatusLine() {
        return isHTTP() && ! isRequest();
    }

    /**
     * @return true if the content is an HTTP request
     */
    public boolean hasRequestLine() {
        return isHTTP() && isRequest();
    }

    /**
     * @return a Boolean telling whether the whole content has been read
     */
    public Object endOfContent() {
        return warcRecord.isLimitReached();
    }

    /**
     * Reads the next line of content and parses it as an HTTP request line.
     */
    public HttpRequestLine getRequestLine() throws IOException, WarcParser.WarcException {
        return new HttpRequestLine(warcRecord.readLine());
    }

    /**
     * @return the declared length of the content, in bytes
     */
    public long getContentLength() {
        return warcRecord.getContentLength();
    }

    /**
     * HTTP status line split into version, status code and reason phrase.
     * Parts missing from the line are reported as empty strings.
     */
    public class HttpStatusLine {

        private final String line;
        private final String version;
        private final String status;
        private final String reason;

        public String getLine() {
            return line;
        }

        public String getVersion() {
            return version;
        }

        public String getStatus() {
            return status;
        }

        public String getReason() {
            return reason;
        }

        protected HttpStatusLine(String line) {
            this.line = line;
            // At most three parts: the reason phrase may itself contain spaces
            final String[] tokens = line.split(" ", 3);
            this.version = tokens[0];
            this.status = tokens.length > 1 ? tokens[1] : "";
            this.reason = tokens.length > 2 ? tokens[2] : "";
        }
    }

    /**
     * HTTP request line split into method, URI and version.
     * Parts missing from the line are reported as empty strings.
     */
    public class HttpRequestLine {

        private final String line;
        private final String version;
        private final String method;
        private final String uri;

        public String getLine() {
            return line;
        }

        public String getVersion() {
            return version;
        }

        public String getMethod() {
            return method;
        }

        public String getUri() {
            return uri;
        }

        public HttpRequestLine(String line) {
            this.line = line;
            final String[] tokens = line.split(" ", 3);
            this.method = tokens[0];
            this.uri = tokens.length > 1 ? tokens[1] : "";
            this.version = tokens.length > 2 ? tokens[2] : "";
        }
    }
}

View File

@ -0,0 +1,103 @@
/**
* Copyright (C) 2012 Eric van der Vlist.
*
* This program is free software; you can redistribute it and/or modify it under the terms of the
* GNU Lesser General Public License as published by the Free Software Foundation; either version
* 2.1 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU Lesser General Public License for more details.
*
* The full text of the license is available at http://www.gnu.org/copyleft/lesser.html
*/
package org.owark.warc;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.Map;
/**
 * Header of a WARC record: the named fields that follow the version line, up to the
 * first blank line.
 *
 * <p>The header is read lazily while it is iterated. Every field returned by
 * {@link #next()} is also remembered, so {@link #getType()}, {@link #getContentType()}
 * and {@link #getContentLength()} only know about fields that have already been read.
 * Field names are matched without regard to case.</p>
 */
public class WarcRecordHeader implements Iterator<WarcField> {

    public static final String WARC_TYPE = "WARC-Type";
    public static final String CONTENT_TYPE = "Content-Type";
    public static final String CONTENT_LENGTH = "Content-Length";

    private final WarcRecord warcRecord;
    /** Header line read ahead by hasNext() and not yet returned by next(). */
    private String line;
    /** Last exception met while reading a header line, kept for debugging. */
    private Exception e;
    private final Map<String,String> headers;
    private boolean endOfHeader = false;

    /**
     * @param warcRecord record this header belongs to
     */
    public WarcRecordHeader(WarcRecord warcRecord) {
        this.warcRecord = warcRecord;
        // WARC field names are case-insensitive, so look them up accordingly
        headers = new java.util.TreeMap<String, String>(String.CASE_INSENSITIVE_ORDER);
    }

    /**
     * Tells whether another header field is available, reading one line ahead if needed.
     *
     * The header ends at the first blank line or at the first read failure.
     */
    public boolean hasNext() {
        if (endOfHeader) {
            return false;
        }
        if (line == null) {
            try {
                line = warcRecord.readLine();
            } catch (Exception e) {
                // A sequential stream cannot recover, so stop iterating for good
                this.e = e;
                endOfHeader = true;
                return false;
            }
        }
        if (line.equals("")) {
            endOfHeader = true;
            return false;
        }
        return true;
    }

    /**
     * @return the next header field, or null when the end of the header has been reached
     */
    public WarcField next() {
        // Going through hasNext() keeps the blank terminator line from being parsed as a field
        if (!hasNext()) {
            return null;
        }
        final WarcField item = new WarcField(line);
        line = null;
        headers.put(item.getKey(), item.getValue());
        return item;
    }

    /**
     * @return the value of the WARC-Type field, or null if it has not been read
     */
    public String getType() {
        return headers.get(WARC_TYPE);
    }

    /**
     * Not supported: the archive is read only.
     *
     * @throws UnsupportedOperationException always
     */
    public void remove() {
        throw new UnsupportedOperationException("WARC header fields cannot be removed");
    }

    /**
     * @return the value of the Content-Type field, or null if it has not been read
     */
    public String getContentType() {
        return headers.get(CONTENT_TYPE);
    }

    /**
     * @return the value of the Content-Length field
     * @throws NumberFormatException if the field has not been read or is not an integer
     */
    public int getContentLength() {
        final String value = headers.get(CONTENT_LENGTH);
        if (value == null) {
            throw new NumberFormatException("No " + CONTENT_LENGTH + " field has been read for this record");
        }
        return Integer.parseInt(value);
    }

    /**
     * Reads and remembers every header field that has not been read yet.
     */
    public void skipToEnd() {
        while (hasNext()) {
            next();
        }
    }
}

View File

@ -0,0 +1,297 @@
/**
* Copyright (C) 2012 Eric van der Vlist.
*
* This program is free software; you can redistribute it and/or modify it under the terms of the
* GNU Lesser General Public License as published by the Free Software Foundation; either version
* 2.1 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU Lesser General Public License for more details.
*
* The full text of the license is available at http://www.gnu.org/copyleft/lesser.html
*/
package org.owark.warc;
import org.junit.Assert;
import org.junit.Test;
import java.io.*;
/**
 * Test cases for WarcParser.
 *
 * Both tests read the dyomedea.warc fixture and walk through it in file order: the
 * assertions below depend on the exact records of that archive and on the order in
 * which header lines, field lines and content bytes are consumed.
 *
 * NOTE(review): the fixture is opened through an absolute path that only exists on the
 * original author's machine, and the streams opened on it are never closed.
 */
public class WarcParserTest {
// NOTE(review): never read in this class; both tests declare a local of the same name.
private static WarcParser warcParser;
/**
 * Walks through the first records of the archive (warcinfo, DNS response, HTTP response,
 * HTTP request), checking headers, fields and content, then skips to the last record.
 */
@Test
public void testDyomedea() throws IOException, WarcParser.WarcException {
// WARC
File file = new File("/home/vdv/projects/owark/archiver/java/test/org/owark/warc/dyomedea.warc");
WarcParser warcParser = new WarcParser(new FileInputStream(file));
Assert.assertEquals(true, warcParser.hasNext());
// RECORD
WarcRecord record = warcParser.next();
Assert.assertEquals("WARC/1.0", warcParser.getMagic());
Assert.assertNotNull(record);
Assert.assertEquals("WARC/1.0", record.getMagic());
// HEADER
WarcRecordHeader header = record.getHeader();
Assert.assertNotNull(header);
// Nothing has been read yet, so the type is still unknown
Assert.assertNull(header.getType());
Assert.assertEquals(true, header.hasNext());
WarcField headerItem = header.next();
Assert.assertNotNull(headerItem);
Assert.assertEquals(WarcRecordHeader.WARC_TYPE, headerItem.getKey());
Assert.assertEquals("warcinfo", headerItem.getValue());
// The type becomes available as soon as its header line has been read
Assert.assertEquals("warcinfo", header.getType());
Assert.assertEquals("warcinfo", record.getType());
Assert.assertEquals(true, header.hasNext());
headerItem = header.next();
Assert.assertNotNull(headerItem);
Assert.assertEquals("WARC-Date", headerItem.getKey());
Assert.assertEquals("2012-04-23T10:05:24Z", headerItem.getValue());
// Skip three header fields; the fourth one is the last of this header
headerItem = header.next();
headerItem = header.next();
headerItem = header.next();
headerItem = header.next();
Assert.assertNotNull(headerItem);
Assert.assertEquals("Content-Length", headerItem.getKey());
Assert.assertEquals("369", headerItem.getValue());
// End of header: hasNext() is false and next() returns null
Assert.assertEquals(false, header.hasNext());
headerItem = header.next();
Assert.assertNull(headerItem);
Assert.assertEquals("application/warc-fields", record.getContentType());
Assert.assertEquals(369, record.getContentLength());
// Content
WarcRecordContent content = record.getContent();
Assert.assertNotNull(content);
Assert.assertEquals(true, content.hasFields());
Assert.assertEquals(false, content.isHTTP());
Assert.assertEquals(false, content.hasStatusLine());
Assert.assertEquals(false, content.hasRequestLine());
Assert.assertEquals(true, content.hasNext());
WarcField field = content.next();
Assert.assertEquals(false, content.endOfContent());
Assert.assertNotNull(field);
Assert.assertEquals("software", field.getKey());
Assert.assertEquals("Heritrix/3.1.0 http://crawler.archive.org", field.getValue());
// Skip seven fields; the eighth one is the last of this content
field = content.next();
field = content.next();
field = content.next();
field = content.next();
field = content.next();
field = content.next();
field = content.next();
field = content.next();
Assert.assertNotNull(field);
Assert.assertEquals("http-header-user-agent", field.getKey());
Assert.assertEquals("Mozilla/5.0 (compatible; heritrix/3.1.0 +http://owark.org)", field.getValue());
Assert.assertEquals(false, content.hasNext());
Assert.assertEquals(true, content.endOfContent());
// Next record
Assert.assertEquals(true, warcParser.hasNext());
record = warcParser.next();
Assert.assertNotNull(record);
// Header
header = record.getHeader();
Assert.assertNotNull(header);
Assert.assertNull(header.getType());
Assert.assertEquals(true, header.hasNext());
headerItem = header.next();
Assert.assertNotNull(headerItem);
Assert.assertEquals(WarcRecordHeader.WARC_TYPE, headerItem.getKey());
Assert.assertEquals("response", headerItem.getValue());
header.skipToEnd();
// Content: neither WARC fields nor HTTP, read as plain lines through the stream API
content = record.getContent();
Assert.assertNotNull(content);
Assert.assertEquals(false, content.hasFields());
Assert.assertEquals(false, content.isHTTP());
Assert.assertEquals(false, content.hasStatusLine());
Assert.assertEquals(false, content.hasRequestLine());
Assert.assertEquals(false, content.endOfContent());
BufferedReader reader = new BufferedReader(new InputStreamReader(content, "UTF-8"));
String line = reader.readLine();
Assert.assertEquals("20120423100524", line);
line = reader.readLine();
Assert.assertEquals("dyomedea.com.\t\t1800\tIN\tA\t95.142.167.137", line);
line = reader.readLine();
Assert.assertEquals(true, content.endOfContent());
Assert.assertNull(line);
// Next record
Assert.assertEquals(true, warcParser.hasNext());
record = warcParser.next();
Assert.assertNotNull(record);
// Header
header = record.getHeader();
Assert.assertNotNull(header);
Assert.assertNull(header.getType());
Assert.assertEquals(true, header.hasNext());
headerItem = header.next();
Assert.assertNotNull(headerItem);
Assert.assertEquals(WarcRecordHeader.WARC_TYPE, headerItem.getKey());
Assert.assertEquals("response", headerItem.getValue());
header.skipToEnd();
// Content: an HTTP response, made of a status line, header fields and a body
content = record.getContent();
Assert.assertNotNull(content);
Assert.assertEquals(true, content.hasFields());
Assert.assertEquals(true, content.isHTTP());
Assert.assertEquals(false, content.isRequest());
Assert.assertEquals(true, content.hasStatusLine());
Assert.assertEquals(false, content.hasRequestLine());
WarcRecordContent.HttpStatusLine status = content.getStatusLine();
Assert.assertNotNull(status);
Assert.assertEquals("HTTP/1.1 404 Introuvable", status.getLine());
Assert.assertEquals("HTTP/1.1", status.getVersion());
Assert.assertEquals("404", status.getStatus());
Assert.assertEquals("Introuvable", status.getReason());
field = content.next();
Assert.assertNotNull(field);
Assert.assertEquals("Date", field.getKey());
Assert.assertEquals("Mon, 23 Apr 2012 10:05:27 GMT", field.getValue());
// Skip five HTTP header fields; the sixth one is the last
field = content.next();
field = content.next();
field = content.next();
field = content.next();
field = content.next();
field = content.next();
Assert.assertNotNull(field);
Assert.assertEquals("Connection", field.getKey());
Assert.assertEquals("close", field.getValue());
// The fields are over but the body remains to be read
Assert.assertEquals(false, content.hasNext());
Assert.assertEquals(false, content.endOfContent());
reader = new BufferedReader(new InputStreamReader(content, "UTF-8"));
line = reader.readLine();
Assert.assertEquals("<html><head><title>Apache Tomcat/6.0.24 - Rapport d'erreur</title>", line.substring(0, line.indexOf("<style>")));
line = reader.readLine();
Assert.assertNull(line);
Assert.assertEquals(true, content.endOfContent());
// Next record
Assert.assertEquals(true, warcParser.hasNext());
record = warcParser.next();
Assert.assertNotNull(record);
// Header
header = record.getHeader();
Assert.assertNotNull(header);
Assert.assertNull(header.getType());
Assert.assertEquals(true, header.hasNext());
headerItem = header.next();
Assert.assertNotNull(headerItem);
Assert.assertEquals(WarcRecordHeader.WARC_TYPE, headerItem.getKey());
Assert.assertEquals("request", headerItem.getValue());
header.skipToEnd();
// Content: an HTTP request, made of a request line and header fields only
content = record.getContent();
Assert.assertNotNull(content);
Assert.assertEquals(true, content.hasFields());
Assert.assertEquals(true, content.isHTTP());
Assert.assertEquals(true, content.isRequest());
Assert.assertEquals(false, content.hasStatusLine());
Assert.assertEquals(true, content.hasRequestLine());
WarcRecordContent.HttpRequestLine request = content.getRequestLine();
Assert.assertEquals("GET /robots.txt HTTP/1.0", request.getLine());
Assert.assertEquals("GET", request.getMethod());
Assert.assertEquals("/robots.txt", request.getUri());
Assert.assertEquals("HTTP/1.0", request.getVersion());
field = content.next();
Assert.assertNotNull(field);
Assert.assertEquals("User-Agent", field.getKey());
Assert.assertEquals("Mozilla/5.0 (compatible; heritrix/3.1.0 +http://owark.org)", field.getValue());
// Skip two HTTP header fields; the third one is the last
field = content.next();
field = content.next();
field = content.next();
Assert.assertNotNull(field);
Assert.assertEquals("Host", field.getKey());
Assert.assertEquals("dyomedea.com", field.getValue());
Assert.assertEquals(false, content.hasNext());
Assert.assertEquals(true, content.endOfContent());
// Skip record: a record that has not been touched at all can be skipped as a whole
Assert.assertEquals(true, warcParser.hasNext());
record = warcParser.next();
Assert.assertNotNull(record);
record.skipToEnd();
Assert.assertEquals(true, warcParser.hasNext());
record = warcParser.next();
// Header
header = record.getHeader();
Assert.assertNotNull(header);
Assert.assertNull(header.getType());
Assert.assertEquals(true, header.hasNext());
headerItem = header.next();
Assert.assertNotNull(headerItem);
Assert.assertEquals(WarcRecordHeader.WARC_TYPE, headerItem.getKey());
Assert.assertEquals("response", headerItem.getValue());
// A record whose header has only been partially read can be skipped too
record.skipToEnd();
// Go to last record
while (warcParser.hasNext()) {
record = warcParser.next();
Assert.assertNotNull(record);
record.skipToEnd();
}
// The fixture is expected to hold 69 records, the last one being of type "metadata"
Assert.assertEquals(69, warcParser.getRecordCount());
Assert.assertEquals("metadata", record.getType());
}
/**
 * Checks that skipToEnd() can be called on a record whose header and fields have
 * already been iterated with hasNext()/next().
 */
@Test
public void skipToEnd() throws IOException, WarcParser.WarcException {
File file = new File("/home/vdv/projects/owark/archiver/java/test/org/owark/warc/dyomedea.warc");
WarcParser warcParser = new WarcParser(new FileInputStream(file));
Assert.assertEquals(true, warcParser.hasNext());
WarcRecord record = warcParser.next();
WarcRecordHeader header = record.getHeader();
while (header.hasNext()) {
Assert.assertNotNull(header.next());
}
WarcRecordContent content = record.getContent();
while (content.hasNext()) {
Assert.assertNotNull(content.next());
}
record.skipToEnd();
}
} ;

File diff suppressed because it is too large Load Diff

View File

@ -7,7 +7,7 @@
<p:config xmlns:p="http://www.orbeon.com/oxf/pipeline" xmlns:oxf="http://www.orbeon.com/oxf/processors" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:xforms="http://www.w3.org/2002/xforms"
xmlns:xxforms="http://orbeon.org/oxf/xml/xforms" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:exist="http://exist.sourceforge.net/NS/exist" xmlns:saxon="http://saxon.sf.net/"
xmlns:pipeline="java:org.orbeon.oxf.processor.pipeline.PipelineFunctionLibrary">
xmlns:pipeline="java:org.orbeon.oxf.processor.pipeline.PipelineFunctionLibrary" xmlns:owk="http://owark.org/orbeon/processors">
<p:param name="data" type="input"/>
@ -33,6 +33,15 @@
</p:input>
<p:output name="data" id="warc"/>
</p:processor>
<!-- Convert the WARC archive produced by the previous step (output "warc") into its XML representation -->
<p:processor name="owk:from-warc-converter">
<p:input name="data" href="#warc"/>
<p:output name="data" id="warc-xml" debug="warc-xml"/>
</p:processor>
<!-- Send the XML representation to a null serializer.
     NOTE(review): presumably only here so that the conversion above is actually run, confirm -->
<p:processor name="oxf:null-serializer">
<p:input name="data" href="#warc-xml"/>
</p:processor>
<!-- Store it in a temp file -->
<p:processor name="oxf:file-serializer">

View File

@ -0,0 +1,6 @@
<!-- Custom Orbeon processors provided by Owark: maps each processor name to the Java class implementing it -->
<processors xmlns:owk="http://owark.org/orbeon/processors">
<!-- WARC archive to XML converter -->
<processor name="owk:from-warc-converter">
<class name="org.owark.orbeon.FromWarcConverter"/>
</processor>
</processors>