Deleting what doesn't belong to Wordpress

This commit is contained in:
Eric van der Vlist 2020-05-01 12:28:16 +02:00
parent 10c0d87b93
commit 21807536ca
42 changed files with 0 additions and 13839 deletions

View File

@ -1,45 +0,0 @@
<project name="owark" default="dist" basedir=".">
    <description>
        Owark build file
    </description>

    <!-- set global properties for this build -->
    <property name="src" location="java/src"/>
    <property name="build" location="build"/>
    <property name="dist" location="dist"/>
    <!-- Directory holding the Orbeon Forms jars needed at compile time.
         The default is the location this build file originally hard-coded;
         override it on other machines with -Dorbeon.lib=/path/to/WEB-INF/lib -->
    <property name="orbeon.lib" location="/home/vdv/projects/orbeon-forms/build/orbeon-war/WEB-INF/lib"/>

    <target name="init">
        <!-- Create the time stamp -->
        <tstamp/>
        <!-- Create the build directory structure used by compile -->
        <mkdir dir="${build}"/>
    </target>

    <target name="compile" depends="init"
            description="compile the source " >
        <!-- Compile the java code from ${src} into ${build} -->
        <javac srcdir="${src}" destdir="${build}">
            <classpath>
                <pathelement location="java/lib/heritrix-commons-3.1.0.jar"/>
                <pathelement location="java/lib/archive-overlay-commons-httpclient-3.1.jar"/>
                <pathelement location="${orbeon.lib}/commons-fileupload-1.2.2.jar"/>
                <pathelement location="${orbeon.lib}/orbeon.jar"/>
            </classpath>
        </javac>
    </target>

    <target name="dist" depends="compile"
            description="generate the distribution" >
        <!-- Create the distribution directory -->
        <mkdir dir="${dist}/lib"/>
        <!-- Put everything in ${build} into ${dist}/lib/owark.jar -->
        <jar jarfile="${dist}/lib/owark.jar" basedir="${build}"/>
    </target>

    <target name="clean"
            description="clean up" >
        <!-- Delete the ${build} and ${dist} directory trees -->
        <delete dir="${build}"/>
        <delete dir="${dist}"/>
    </target>
</project>

View File

@ -1,146 +0,0 @@
/**
* Copyright (C) 2012 Eric van der Vlist.
*
* This program is free software; you can redistribute it and/or modify it under the terms of the
* GNU Lesser General Public License as published by the Free Software Foundation; either version
* 2.1 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU Lesser General Public License for more details.
*
* The full text of the license is available at http://www.gnu.org/copyleft/lesser.html
*/
package org.owark.orbeon;
import org.apache.commons.fileupload.FileItem;
import org.orbeon.oxf.pipeline.api.PipelineContext;
import org.orbeon.oxf.pipeline.api.XMLReceiver;
import org.orbeon.oxf.processor.ProcessorImpl;
import org.orbeon.oxf.processor.ProcessorInputOutputInfo;
import org.orbeon.oxf.processor.ProcessorOutput;
import org.orbeon.oxf.processor.ProcessorUtils;
import org.orbeon.oxf.processor.serializer.BinaryTextXMLReceiver;
import org.orbeon.oxf.util.NetUtils;
import org.orbeon.oxf.xml.ContentHandlerHelper;
import org.orbeon.oxf.xml.XMLConstants;
import org.orbeon.oxf.xml.XMLUtils;
import org.owark.warc.*;
import org.xml.sax.Attributes;
import org.xml.sax.helpers.AttributesImpl;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
/**
 * Orbeon processor that converts a WARC archive, received on its "data" input as a
 * binary document, into an XML representation sent to its "data" output.
 *
 * <p>The generated document has a {@code warc} root holding one {@code record} per
 * WARC record. Each record holds a {@code headers} element (the WARC named fields)
 * followed by a {@code content} element with, when applicable, the HTTP request or
 * status line, the content fields, and the payload as text or base64.</p>
 */
public class FromWarcConverter extends ProcessorImpl {

    static public String WARC_ELEMENT_ROOT_NAME = "warc";
    static public String RECORD_ELEMENT_NAME = "record";
    static public String HEADERS_ELEMENT_NAME = "headers";
    static public String HEADER_ELEMENT_NAME = "header";
    static public String NAME_ATTRIBUTE_NAME = "name";
    static public String CONTENT_ELEMENT_NAME = "content";

    /** Declares the single "data" input and the single "data" output. */
    public FromWarcConverter() {
        addInputInfo(new ProcessorInputOutputInfo(INPUT_DATA));
        addOutputInfo(new ProcessorInputOutputInfo(OUTPUT_DATA));
    }

    /**
     * Creates and registers the output that performs the conversion when it is read.
     *
     * @param outputName name of the output being created
     * @return the registered output
     */
    @Override
    public ProcessorOutput createOutput(String outputName) {
        final ProcessorOutput output = new ProcessorOutputImpl(FromWarcConverter.this, outputName) {
            /**
             * Spools the binary input into a temporary file item, then parses it as a
             * WARC archive and emits the XML representation.
             *
             * @throws RuntimeException if the archive cannot be read or converted; the
             *         original failure is kept as the cause
             */
            @Override
            protected void readImpl(PipelineContext pipelineContext, XMLReceiver xmlReceiver) {
                try {
                    ContentHandlerHelper helper = new ContentHandlerHelper(xmlReceiver);
                    helper.startDocument();
                    helper.startElement(WARC_ELEMENT_ROOT_NAME);

                    // Copy the binary input into a request-scoped file item...
                    final FileItem fileItem = NetUtils.prepareFileItem(NetUtils.REQUEST_SCOPE);
                    readInputAsSAX(pipelineContext, INPUT_DATA, new BinaryTextXMLReceiver(null, fileItem.getOutputStream(), true, false, null, false, false, null, false));

                    // ...and read it back as an archive
                    final WarcParser warcParser = new WarcParser(fileItem.getInputStream());
                    while (warcParser.hasNext()) {
                        helper.startElement(RECORD_ELEMENT_NAME);
                        WarcRecord record = warcParser.next();
                        writeFields(helper, record.getHeader());
                        writeContent(helper, xmlReceiver, record.getContent());
                        // Position the parser on the next record whatever was consumed above
                        record.skipToEnd();
                        helper.endElement();
                    }

                    helper.endElement();
                    helper.endDocument();
                } catch (RuntimeException e) {
                    throw e;
                } catch (Exception e) {
                    // Swallowing the failure would hand a truncated SAX stream to the
                    // rest of the pipeline, so report it instead.
                    throw new RuntimeException("Unable to convert the WARC archive to XML", e);
                }
            }
        };
        addOutput(outputName, output);
        return output;
    }

    /** Emits a {@code headers} element holding one {@code header} per remaining field. */
    private static void writeFields(ContentHandlerHelper helper, java.util.Iterator<WarcField> fields) throws Exception {
        helper.startElement(HEADERS_ELEMENT_NAME);
        while (fields.hasNext()) {
            WarcField field = fields.next();
            helper.startElement(HEADER_ELEMENT_NAME, new String[] {NAME_ATTRIBUTE_NAME, field.getKey()});
            helper.text(field.getValue());
            helper.endElement();
        }
        helper.endElement();
    }

    /** Emits the {@code content} element: request/status line, content fields, then payload. */
    private static void writeContent(ContentHandlerHelper helper, XMLReceiver xmlReceiver, WarcRecordContent content) throws Exception {
        helper.startElement(CONTENT_ELEMENT_NAME);
        if (content.hasRequestLine()) {
            helper.startElement("request");
            WarcRecordContent.HttpRequestLine request = content.getRequestLine();
            helper.element("method", request.getMethod());
            helper.element("uri", request.getUri());
            helper.element("version", request.getVersion());
            helper.endElement();
        } else if (content.hasStatusLine()) {
            helper.startElement("status");
            WarcRecordContent.HttpStatusLine status = content.getStatusLine();
            helper.element("version", status.getVersion());
            helper.element("status", status.getStatus());
            helper.element("reason", status.getReason());
            helper.endElement();
        }
        if (content.hasFields()) {
            writeFields(helper, content);
        }
        if (! content.endOfContent()) {
            writePayload(helper, xmlReceiver, content);
        }
        helper.endElement();
    }

    /** Emits what is left of the content as a text document or as a base64 binary document. */
    private static void writePayload(ContentHandlerHelper helper, XMLReceiver xmlReceiver, WarcRecordContent content) throws Exception {
        helper.startPrefixMapping("xsi", "http://www.w3.org/2001/XMLSchema-instance");
        helper.startPrefixMapping("xs", "http://www.w3.org/2001/XMLSchema");
        // The content type is null for records that do not declare one for their
        // payload (e.g. a request with a body): such payloads are emitted as binary.
        String contentType = content.getPayloadContentType();
        AttributesImpl attributes = new AttributesImpl();
        if (contentType != null) {
            attributes.addAttribute("", "content-type", "content-type", "CDATA", contentType);
        }
        boolean isText = contentType != null
                && (contentType.startsWith("text/") || contentType.matches(".*application/[^;]*xml.*"));
        if (isText) {
            attributes.addAttribute(XMLConstants.XSI_URI, "type", "xsi:type", "CDATA", "xs:string");
            String encoding = resolveEncoding(content.getPayloadEncoding());
            helper.startElement(ProcessorUtils.DEFAULT_TEXT_DOCUMENT_ELEMENT, attributes);
            XMLUtils.readerToCharacters(new InputStreamReader(content, encoding), xmlReceiver);
            helper.endElement();
        } else {
            attributes.addAttribute(XMLConstants.XSI_URI, "type", "xsi:type", "CDATA", "xs:base64Binary");
            helper.startElement(ProcessorUtils.DEFAULT_BINARY_DOCUMENT_ELEMENT, attributes);
            XMLUtils.inputStreamToBase64Characters(new BufferedInputStream(content), xmlReceiver);
            helper.endElement();
        }
    }

    /**
     * Returns the declared charset when this JVM supports it and "utf-8" otherwise, so
     * that a missing or bogus charset in one archived page does not abort the conversion.
     */
    private static String resolveEncoding(String declared) {
        if (declared == null) {
            return "utf-8";
        }
        try {
            return java.nio.charset.Charset.isSupported(declared) ? declared : "utf-8";
        } catch (IllegalArgumentException e) {
            // Syntactically illegal charset name
            return "utf-8";
        }
    }
}

View File

@ -1,49 +0,0 @@
/**
* Copyright (C) 2012 Eric van der Vlist.
*
* This program is free software; you can redistribute it and/or modify it under the terms of the
* GNU Lesser General Public License as published by the Free Software Foundation; either version
* 2.1 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU Lesser General Public License for more details.
*
* The full text of the license is available at http://www.gnu.org/copyleft/lesser.html
*/
package org.owark.warc;
/**
 * A single "name: value" field, as found in WARC record headers, in
 * application/warc-fields blocks and in HTTP headers.
 */
public class WarcField {

    private final String line;
    private final String key;
    private final String value;

    /**
     * Splits a raw line on its first colon.
     *
     * <p>The key is the trimmed text before the colon and the value the trimmed text
     * after it. A line without any colon yields the whole trimmed line as key and an
     * empty value instead of failing.</p>
     *
     * @param line the raw line, without its line terminator; must not be null
     */
    public WarcField(String line) {
        this.line = line;
        int sep = line.indexOf(':');
        if (sep < 0) {
            // Malformed field: keep what we have rather than throwing from substring()
            this.key = line.trim();
            this.value = "";
        } else {
            this.key = line.substring(0, sep).trim();
            this.value = line.substring(sep + 1).trim();
        }
    }

    /** @return the field name, trimmed */
    public String getKey() {
        return key;
    }

    /** @return the raw line this field was built from */
    public String getLine() {
        return line;
    }

    /** @return the field value, trimmed (empty when the line had no colon) */
    public String getValue() {
        return value;
    }
}

View File

@ -1,123 +0,0 @@
/**
* Copyright (C) 2012 Eric van der Vlist.
*
* This program is free software; you can redistribute it and/or modify it under the terms of the
* GNU Lesser General Public License as published by the Free Software Foundation; either version
* 2.1 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU Lesser General Public License for more details.
*
* The full text of the license is available at http://www.gnu.org/copyleft/lesser.html
*/
package org.owark.warc;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.util.Iterator;
/**
 * Streaming reader for WARC files.
 *
 * <p>The parser iterates over the records of an archive. {@link #hasNext()} scans
 * forward to the next line starting with {@code WARC/}; {@link #next()} then returns
 * a {@link WarcRecord} that reads its header and content lazily from this parser.
 * Records must therefore be consumed (or skipped) in order.</p>
 *
 * <p>A byte limit can be set with {@link #setLimit(int)} so that reads stop at the
 * end of a record's content block; a limit of -1 means "no limit".</p>
 */
public class WarcParser implements Iterator<WarcRecord> {

    /** Initial size, in bytes, of the line buffer; the buffer grows when a line is longer. */
    public static int BUFFER_SIZE = 1024;
    public static String CRLF = "\r\n";
    public static String CRLFCRLF = CRLF + CRLF;
    public static String MAGIC = "WARC/";

    /** Hard cap on a single line so that a corrupt archive cannot exhaust the heap. */
    private static final int MAX_LINE_LENGTH = 16 * 1024 * 1024;

    private InputStream is;
    private byte[] buffer = new byte[BUFFER_SIZE];
    private int index = 0;
    private int limit = -1;
    private String magic;
    private int recordCount;

    /**
     * @param is the stream to read the archive from; it is not closed by the parser
     */
    public WarcParser(InputStream is) {
        this.is = is;
        resetBuffer();
    }

    /** @return the magic line ("WARC/...") of the record found by the last {@link #hasNext()} */
    public String getMagic() throws IOException, WarcException {
        return this.magic;
    }

    private void resetBuffer() {
        index = 0;
    }

    /** Appends one byte to the line buffer, growing it as needed up to MAX_LINE_LENGTH. */
    private void append(int b) throws BufferOverflowException {
        if (index == buffer.length) {
            if (buffer.length >= MAX_LINE_LENGTH) {
                throw new BufferOverflowException();
            }
            int newLength = Math.min(MAX_LINE_LENGTH, Math.max(buffer.length * 2, 16));
            buffer = java.util.Arrays.copyOf(buffer, newLength);
        }
        buffer[index++] = (byte) b;
    }

    /**
     * Buffers bytes until the buffer ends with {@code stringPattern} or the byte limit
     * is reached.
     *
     * <p>After a mismatch the current byte is re-tested against the first pattern
     * character, which is sufficient for the CR/LF based patterns used here (it is
     * not a general substring search).</p>
     *
     * @return true if the pattern was found, false if the limit stopped the read first
     * @throws java.io.EOFException if the stream ends before the pattern is found
     * @throws BufferOverflowException if more than MAX_LINE_LENGTH bytes are buffered
     */
    private boolean readUntil(String stringPattern) throws IOException, WarcException {
        int matched = 0;
        while (matched < stringPattern.length()) {
            if (limit == 0) {
                return false;
            }
            int c = read();
            if (c < 0) {
                throw new java.io.EOFException("Unexpected end of WARC stream");
            }
            append(c);
            if (c == stringPattern.charAt(matched)) {
                matched++;
            } else {
                matched = (c == stringPattern.charAt(0)) ? 1 : 0;
            }
        }
        return true;
    }

    /**
     * Reads the next CRLF-terminated line, decoded as UTF-8, without its terminator.
     * When the byte limit is reached before a CRLF, the bytes read so far are returned
     * as the line (an empty string if the limit was already reached).
     */
    protected String readLine() throws IOException, WarcException {
        try {
            boolean terminated = readUntil(CRLF);
            int length = terminated ? index - CRLF.length() : index;
            return new String(buffer, 0, length, "UTF-8");
        } finally {
            // Always start the next line from a clean buffer, even after a failure
            resetBuffer();
        }
    }

    /**
     * Scans forward to the next record.
     *
     * <p>This method consumes input: it removes any byte limit and reads lines until
     * one starts with {@link #MAGIC}. It must be called exactly once before each
     * {@link #next()}. Any read failure, including the end of the stream, is reported
     * as "no more records".</p>
     */
    public boolean hasNext() {
        limit = -1;
        do {
            try {
                magic = readLine();
            } catch (Exception e) {
                return false;
            }
        } while (! magic.startsWith(MAGIC));
        return true;
    }

    /** @return a record reading lazily from this parser, positioned after the magic line */
    public WarcRecord next() {
        recordCount ++;
        return new WarcRecord(this);
    }

    /** Does nothing: records cannot be removed from the underlying stream. */
    public void remove() {
    }

    /**
     * @param limit the number of bytes that may still be read, or -1 for no limit
     */
    public void setLimit(int limit) {
        this.limit = limit;
    }

    public boolean isLimitReached() {
        return limit == 0;
    }

    /**
     * Reads one byte, honouring the byte limit.
     *
     * @return the byte read, or -1 when the limit is reached or the stream is exhausted
     */
    public int read() throws IOException {
        if (limit == 0) {
            return -1;
        }
        if (limit > 0) {
            limit--;
        }
        int c = is.read();
        //System.out.print((char) c);
        return c;
    }

    /** @return the number of records returned by {@link #next()} so far */
    public int getRecordCount() {
        return recordCount;
    }

    class WarcException extends Exception {}

    class BufferOverflowException extends WarcException {}

    class BadMagicException extends WarcException {}
}

View File

@ -1,84 +0,0 @@
/**
* Copyright (C) 2012 Eric van der Vlist.
*
* This program is free software; you can redistribute it and/or modify it under the terms of the
* GNU Lesser General Public License as published by the Free Software Foundation; either version
* 2.1 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU Lesser General Public License for more details.
*
* The full text of the license is available at http://www.gnu.org/copyleft/lesser.html
*/
package org.owark.warc;
import java.io.IOException;
/**
 * One record of a WARC archive.
 *
 * <p>A record does not hold any data itself: its header and content are read lazily
 * from the {@link WarcParser} that created it, so the header must be consumed before
 * the content, and the record before the next one.</p>
 */
public class WarcRecord {

    private WarcParser warcParser;
    private WarcRecordHeader header;
    private WarcRecordContent content;

    /**
     * @param warcParser the parser to read from, positioned right after the magic line
     */
    public WarcRecord(WarcParser warcParser) {
        this.warcParser = warcParser;
    }

    /** @return the magic line of the record the parser last found */
    public Object getMagic() throws IOException, WarcParser.WarcException {
        return warcParser.getMagic();
    }

    /** @return the header of this record, created on first access */
    public WarcRecordHeader getHeader() {
        if (header == null) {
            header = new WarcRecordHeader(this);
        }
        return header;
    }

    /** Reads the next line from the underlying parser. */
    public String readLine() throws IOException, WarcParser.WarcException {
        return warcParser.readLine();
    }

    /** @return the WARC-Type read so far from the header, or null */
    public String getType() {
        // Go through getHeader() so that this works before the header was first accessed
        return getHeader().getType();
    }

    /** @return the Content-Type read so far from the header, or null */
    public String getContentType() {
        return getHeader().getContentType();
    }

    /**
     * Returns the content of this record, created on first access.
     *
     * <p>On first access the rest of the header is read, since the content length is
     * only known once the header is complete, and the parser is limited to that many
     * bytes.</p>
     */
    public WarcRecordContent getContent() {
        if (content == null) {
            getHeader().skipToEnd();
            warcParser.setLimit(getContentLength());
            content = new WarcRecordContent(this);
        }
        return content;
    }

    /** @return the Content-Length declared in the header */
    public int getContentLength() {
        return getHeader().getContentLength();
    }

    /** @return true when all the bytes of the content block have been read */
    public boolean isLimitReached() {
        return warcParser.isLimitReached();
    }

    /** Reads one byte from the underlying parser, -1 at the end of the content block. */
    public int read() throws IOException {
        return warcParser.read();
    }

    /**
     * Consumes whatever is left of the header and of the content so that the parser
     * is ready to look for the next record.
     */
    public void skipToEnd() throws IOException {
        getHeader().skipToEnd();
        // Reads stop at the content limit, so asking for the full length is safe
        // even when part of the content was already consumed.
        getContent().skip(getContentLength());
    }
}

View File

@ -1,215 +0,0 @@
/**
* Copyright (C) 2012 Eric van der Vlist.
*
* This program is free software; you can redistribute it and/or modify it under the terms of the
* GNU Lesser General Public License as published by the Free Software Foundation; either version
* 2.1 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU Lesser General Public License for more details.
*
* The full text of the license is available at http://www.gnu.org/copyleft/lesser.html
*/
package org.owark.warc;
import java.io.IOException;
import java.io.InputStream;
import java.util.Iterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Content block of a WARC record.
 *
 * <p>The block can be consumed in three complementary ways, in this order: the HTTP
 * request or status line (for application/http records), the fields (HTTP headers or
 * application/warc-fields) through the {@link Iterator} methods, and the remaining
 * payload through the {@link InputStream} methods.</p>
 */
public class WarcRecordContent extends InputStream implements Iterator<WarcField> {

    private static final String WARC_FIELDS_TYPE = "application/warc-fields";
    private static final String HTTP_TYPE_PREFIX = "application/http";
    private static final String HTTP_REQUEST_TYPE = "application/http;msgtype=request";
    private static final String HTTP_RESPONSE_TYPE = "application/http;msgtype=response";
    private static final Pattern CHARSET_PATTERN =
            Pattern.compile(".*;\\s*charset\\s*=\\s*([^;]+).*", Pattern.CASE_INSENSITIVE);

    private WarcRecord warcRecord;
    private Exception e;
    private String line;
    private String payloadContentType;
    private boolean endOfFields = false;

    /**
     * @param warcRecord the record this content belongs to
     */
    public WarcRecordContent(WarcRecord warcRecord) {
        this.warcRecord = warcRecord;
    }

    /** @return true when the content starts with "name: value" fields */
    public boolean hasFields() {
        return WARC_FIELDS_TYPE.equals(warcRecord.getContentType()) || isHTTP();
    }

    /**
     * Tells whether another field is available, reading it ahead if needed.
     *
     * <p>Returns false once the blank line closing the fields has been read, when the
     * end of the content block is reached, or when the line cannot be read. Calling
     * it repeatedly does not consume additional input.</p>
     */
    public boolean hasNext() {
        if (endOfFields) {
            return false;
        }
        if (line == null) {
            if (warcRecord.isLimitReached()) {
                return false;
            }
            try {
                line = warcRecord.readLine();
            } catch (Exception e) {
                this.e = e;
                return false;
            }
        }
        if (line.equals("")) {
            // Blank separator: what follows is payload, never read it as a field
            endOfFields = true;
            line = null;
            return false;
        }
        return true;
    }

    /**
     * @return the next field, or null when there is none left
     */
    public WarcField next() {
        if (! hasNext()) {
            return null;
        }
        WarcField field = new WarcField(line);
        line = null;
        // HTTP header names are case-insensitive
        if ("Content-Type".equalsIgnoreCase(field.getKey())) {
            this.payloadContentType = field.getValue();
        }
        return field;
    }

    /** Does nothing: fields cannot be removed from the underlying stream. */
    public void remove() {
    }

    /** Reads one payload byte, -1 at the end of the content block. */
    @Override
    public int read() throws IOException {
        return warcRecord.read();
    }

    /** @return true when the record content type starts with application/http */
    public boolean isHTTP() {
        String contentType = warcRecord.getContentType();
        return contentType != null && contentType.startsWith(HTTP_TYPE_PREFIX);
    }

    /** @return true when the WARC-Type of the record is "request" */
    public boolean isRequest() {
        return "request".equals(warcRecord.getType());
    }

    /** Reads the next line as an HTTP status line. */
    public HttpStatusLine getStatusLine() throws IOException, WarcParser.WarcException {
        return new HttpStatusLine(warcRecord.readLine());
    }

    public boolean hasStatusLine() {
        return isHTTP() && ! isRequest();
    }

    public boolean hasRequestLine() {
        return isHTTP() && isRequest();
    }

    /** @return true when all the bytes of the content block have been read */
    public boolean endOfContent() {
        return warcRecord.isLimitReached();
    }

    /** Reads the next line as an HTTP request line. */
    public HttpRequestLine getRequestLine() throws IOException, WarcParser.WarcException {
        return new HttpRequestLine(warcRecord.readLine());
    }

    public long getContentLength() {
        return warcRecord.getContentLength();
    }

    /**
     * @return the media type of the payload without its parameters, or null when unknown
     */
    public String getPayloadContentType() {
        String contentType = getPayloadContentHeader();
        if (contentType != null && contentType.contains(";")) {
            contentType = contentType.substring(0, contentType.indexOf(";"));
        }
        return contentType;
    }

    /**
     * Returns the full Content-Type value that applies to the payload.
     *
     * <p>For HTTP responses this is the Content-Type HTTP header seen so far while
     * iterating over the fields (null if none); for warc-fields and HTTP requests it
     * is null; for any other record it is the record's own content type. The msgtype
     * comparison ignores case and whitespace, so both "application/http;
     * msgtype=response" and "application/http;msgtype=response" are recognised.</p>
     */
    public String getPayloadContentHeader() {
        String contentType = warcRecord.getContentType();
        if (contentType == null) {
            return null;
        }
        String normalized = contentType.replaceAll("\\s+", "").toLowerCase(java.util.Locale.ROOT);
        if (normalized.equals(WARC_FIELDS_TYPE) || normalized.equals(HTTP_REQUEST_TYPE)) {
            return null;
        }
        if (normalized.equals(HTTP_RESPONSE_TYPE)) {
            return this.payloadContentType;
        }
        return contentType;
    }

    /**
     * @return the lower-cased charset parameter of the payload content type, without
     *         surrounding whitespace or quotes, or null when there is none
     */
    public String getPayloadEncoding() {
        String contentType = getPayloadContentHeader();
        if (contentType == null) {
            return null;
        }
        Matcher matcher = CHARSET_PATTERN.matcher(contentType);
        if (! matcher.matches()) {
            return null;
        }
        String charset = matcher.group(1).trim();
        if (charset.length() >= 2 && charset.startsWith("\"") && charset.endsWith("\"")) {
            charset = charset.substring(1, charset.length() - 1).trim();
        }
        if (charset.length() == 0) {
            return null;
        }
        // Locale.ROOT: charset names must not be subject to locale-specific case rules
        return charset.toLowerCase(java.util.Locale.ROOT);
    }

    /** An HTTP status line split into version, status code and reason phrase. */
    public class HttpStatusLine {

        private String line;
        private String version;
        private String status;
        private String reason;

        public String getLine() {
            return line;
        }

        public String getVersion() {
            return version;
        }

        public String getStatus() {
            return status;
        }

        public String getReason() {
            return reason;
        }

        /**
         * @param line the raw status line; missing parts (e.g. no reason phrase) are
         *             reported as empty strings
         */
        protected HttpStatusLine(String line) {
            this.line = line;
            String[] tokens = line.split(" ", 3);
            this.version = tokens[0];
            this.status = tokens.length > 1 ? tokens[1] : "";
            this.reason = tokens.length > 2 ? tokens[2] : "";
        }
    }

    /** An HTTP request line split into method, URI and version. */
    public class HttpRequestLine {

        private String line;
        private String version;
        private String method;
        private String uri;

        public String getLine() {
            return line;
        }

        public String getVersion() {
            return version;
        }

        public String getMethod() {
            return method;
        }

        public String getUri() {
            return uri;
        }

        /**
         * @param line the raw request line; missing parts (e.g. no version) are
         *             reported as empty strings
         */
        public HttpRequestLine(String line) {
            this.line = line;
            String[] tokens = line.split(" ", 3);
            this.method = tokens[0];
            this.uri = tokens.length > 1 ? tokens[1] : "";
            this.version = tokens.length > 2 ? tokens[2] : "";
        }
    }
}

View File

@ -1,103 +0,0 @@
/**
* Copyright (C) 2012 Eric van der Vlist.
*
* This program is free software; you can redistribute it and/or modify it under the terms of the
* GNU Lesser General Public License as published by the Free Software Foundation; either version
* 2.1 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU Lesser General Public License for more details.
*
* The full text of the license is available at http://www.gnu.org/copyleft/lesser.html
*/
package org.owark.warc;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.Map;
/**
 * Header of a WARC record: the named fields that follow the magic line, up to the
 * first blank line.
 *
 * <p>Fields are read lazily from the record while iterating; the typed accessors
 * ({@link #getType()}, {@link #getContentType()}, {@link #getContentLength()}) only
 * know about the fields that have already been iterated over.</p>
 */
public class WarcRecordHeader implements Iterator<WarcField> {

    public static String WARC_TYPE = "WARC-Type";
    public static String CONTENT_TYPE = "Content-Type";
    public static String CONTENT_LENGTH = "Content-Length";

    private WarcRecord warcRecord;
    private String line;
    private Exception e;
    private Map<String,String> headers;
    private boolean endOfHeader = false;

    /**
     * @param warcRecord the record to read the header lines from
     */
    public WarcRecordHeader(WarcRecord warcRecord) {
        this.warcRecord = warcRecord;
        headers = new Hashtable<String, String>();
    }

    /**
     * Tells whether another header field is available, reading it ahead if needed.
     * Returns false once the blank line closing the header has been read, or when a
     * line cannot be read.
     */
    public boolean hasNext() {
        if (endOfHeader) {
            return false;
        }
        if (line == null) {
            try {
                line = warcRecord.readLine();
            } catch (Exception e) {
                this.e = e;
                return false;
            }
        }
        if (line.equals("")) {
            endOfHeader = true;
            return false;
        }
        return true;
    }

    /**
     * @return the next header field, or null when the header is exhausted
     */
    public WarcField next() {
        // Delegating to hasNext() makes sure the blank line that closes the header
        // is never parsed as a field when next() is called without a prior hasNext().
        if (! hasNext()) {
            return null;
        }
        WarcField item = new WarcField(line);
        line = null;
        headers.put(item.getKey(), item.getValue());
        return item;
    }

    /** @return the WARC-Type field if it has already been read, null otherwise */
    public String getType() {
        return headers.get(WARC_TYPE);
    }

    /** Does nothing: fields cannot be removed from the underlying stream. */
    public void remove() {
    }

    /** @return the Content-Type field if it has already been read, null otherwise */
    public String getContentType() {
        return headers.get(CONTENT_TYPE);
    }

    /**
     * @return the value of the Content-Length field
     * @throws NumberFormatException if the field has not been read (yet), is missing
     *         or is not an integer
     */
    public int getContentLength() {
        String length = headers.get(CONTENT_LENGTH);
        if (length == null) {
            throw new NumberFormatException("No " + CONTENT_LENGTH + " field has been read from the WARC record header");
        }
        return Integer.parseInt(length);
    }

    /** Reads, and records, all the header fields that are left. */
    public void skipToEnd() {
        while (hasNext()) {
            next();
        }
    }
}

View File

@ -1,306 +0,0 @@
/**
* Copyright (C) 2012 Eric van der Vlist.
*
* This program is free software; you can redistribute it and/or modify it under the terms of the
* GNU Lesser General Public License as published by the Free Software Foundation; either version
* 2.1 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU Lesser General Public License for more details.
*
* The full text of the license is available at http://www.gnu.org/copyleft/lesser.html
*/
package org.owark.warc;
import org.junit.Assert;
import org.junit.Test;
import java.io.*;
/**
 * Test cases for WarcParser.
 *
 * <p>The tests read the dyomedea.warc sample archive. Its location defaults to the
 * path used on the original development machine and can be overridden with the
 * {@code owark.test.warc} system property.</p>
 */
public class WarcParserTest {

    private static final String DEFAULT_WARC_PATH = "/home/vdv/projects/owark/archiver/java/test/org/owark/warc/dyomedea.warc";

    /** Opens the sample archive; the caller is responsible for closing the stream. */
    private static InputStream openSampleArchive() throws IOException {
        File file = new File(System.getProperty("owark.test.warc", DEFAULT_WARC_PATH));
        return new FileInputStream(file);
    }

    /** Walks through the first records of the sample archive, then skips to the last one. */
    @Test
    public void testDyomedea() throws IOException, WarcParser.WarcException {
        InputStream in = openSampleArchive();
        try {
            // WARC
            WarcParser warcParser = new WarcParser(in);
            Assert.assertEquals(true, warcParser.hasNext());

            // RECORD (warcinfo)
            WarcRecord record = warcParser.next();
            Assert.assertEquals("WARC/1.0", warcParser.getMagic());
            Assert.assertNotNull(record);
            Assert.assertEquals("WARC/1.0", record.getMagic());

            // HEADER
            WarcRecordHeader header = record.getHeader();
            Assert.assertNotNull(header);
            Assert.assertNull(header.getType());
            Assert.assertEquals(true, header.hasNext());
            WarcField headerItem = header.next();
            Assert.assertNotNull(headerItem);
            Assert.assertEquals(WarcRecordHeader.WARC_TYPE, headerItem.getKey());
            Assert.assertEquals("warcinfo", headerItem.getValue());
            Assert.assertEquals("warcinfo", header.getType());
            Assert.assertEquals("warcinfo", record.getType());
            Assert.assertEquals(true, header.hasNext());
            headerItem = header.next();
            Assert.assertNotNull(headerItem);
            Assert.assertEquals("WARC-Date", headerItem.getKey());
            Assert.assertEquals("2012-04-23T10:05:24Z", headerItem.getValue());
            headerItem = header.next();
            headerItem = header.next();
            headerItem = header.next();
            headerItem = header.next();
            Assert.assertNotNull(headerItem);
            Assert.assertEquals("Content-Length", headerItem.getKey());
            Assert.assertEquals("369", headerItem.getValue());
            Assert.assertEquals(false, header.hasNext());
            headerItem = header.next();
            Assert.assertNull(headerItem);
            Assert.assertEquals("application/warc-fields", record.getContentType());
            Assert.assertEquals(369, record.getContentLength());

            // Content
            WarcRecordContent content = record.getContent();
            Assert.assertNotNull(content);
            Assert.assertEquals(true, content.hasFields());
            Assert.assertEquals(false, content.isHTTP());
            Assert.assertEquals(false, content.hasStatusLine());
            Assert.assertEquals(false, content.hasRequestLine());
            Assert.assertEquals(true, content.hasNext());
            WarcField field = content.next();
            Assert.assertEquals(false, content.endOfContent());
            Assert.assertNotNull(field);
            Assert.assertEquals("software", field.getKey());
            Assert.assertEquals("Heritrix/3.1.0 http://crawler.archive.org", field.getValue());
            field = content.next();
            field = content.next();
            field = content.next();
            field = content.next();
            field = content.next();
            field = content.next();
            field = content.next();
            field = content.next();
            Assert.assertNotNull(field);
            Assert.assertEquals("http-header-user-agent", field.getKey());
            Assert.assertEquals("Mozilla/5.0 (compatible; heritrix/3.1.0 +http://owark.org)", field.getValue());
            Assert.assertEquals(false, content.hasNext());
            Assert.assertNull(content.getPayloadContentType());
            Assert.assertNull(content.getPayloadContentHeader());
            Assert.assertNull(content.getPayloadEncoding());
            Assert.assertEquals(true, content.endOfContent());

            // Next record (DNS response)
            Assert.assertEquals(true, warcParser.hasNext());
            record = warcParser.next();
            Assert.assertNotNull(record);

            // Header
            header = record.getHeader();
            Assert.assertNotNull(header);
            Assert.assertNull(header.getType());
            Assert.assertEquals(true, header.hasNext());
            headerItem = header.next();
            Assert.assertNotNull(headerItem);
            Assert.assertEquals(WarcRecordHeader.WARC_TYPE, headerItem.getKey());
            Assert.assertEquals("response", headerItem.getValue());
            header.skipToEnd();

            // Content
            content = record.getContent();
            Assert.assertNotNull(content);
            Assert.assertEquals(false, content.hasFields());
            Assert.assertEquals(false, content.isHTTP());
            Assert.assertEquals(false, content.hasStatusLine());
            Assert.assertEquals(false, content.hasRequestLine());
            Assert.assertEquals(false, content.endOfContent());
            BufferedReader reader = new BufferedReader(new InputStreamReader(content, "UTF-8"));
            String line = reader.readLine();
            Assert.assertEquals("20120423100524", line);
            line = reader.readLine();
            Assert.assertEquals("dyomedea.com.\t\t1800\tIN\tA\t95.142.167.137", line);
            line = reader.readLine();
            Assert.assertEquals(true, content.endOfContent());
            Assert.assertEquals("text/dns", content.getPayloadContentType());
            Assert.assertEquals("text/dns", content.getPayloadContentHeader());
            Assert.assertNull(content.getPayloadEncoding());
            Assert.assertNull(line);

            // Next record (HTTP response)
            Assert.assertEquals(true, warcParser.hasNext());
            record = warcParser.next();
            Assert.assertNotNull(record);

            // Header
            header = record.getHeader();
            Assert.assertNotNull(header);
            Assert.assertNull(header.getType());
            Assert.assertEquals(true, header.hasNext());
            headerItem = header.next();
            Assert.assertNotNull(headerItem);
            Assert.assertEquals(WarcRecordHeader.WARC_TYPE, headerItem.getKey());
            Assert.assertEquals("response", headerItem.getValue());
            header.skipToEnd();

            // Content
            content = record.getContent();
            Assert.assertNotNull(content);
            Assert.assertEquals(true, content.hasFields());
            Assert.assertEquals(true, content.isHTTP());
            Assert.assertEquals(false, content.isRequest());
            Assert.assertEquals(true, content.hasStatusLine());
            Assert.assertEquals(false, content.hasRequestLine());
            WarcRecordContent.HttpStatusLine status = content.getStatusLine();
            Assert.assertNotNull(status);
            Assert.assertEquals("HTTP/1.1 404 Introuvable", status.getLine());
            Assert.assertEquals("HTTP/1.1", status.getVersion());
            Assert.assertEquals("404", status.getStatus());
            Assert.assertEquals("Introuvable", status.getReason());
            field = content.next();
            Assert.assertNotNull(field);
            Assert.assertEquals("Date", field.getKey());
            Assert.assertEquals("Mon, 23 Apr 2012 10:05:27 GMT", field.getValue());
            field = content.next();
            field = content.next();
            field = content.next();
            field = content.next();
            field = content.next();
            field = content.next();
            Assert.assertNotNull(field);
            Assert.assertEquals("Connection", field.getKey());
            Assert.assertEquals("close", field.getValue());
            Assert.assertEquals(false, content.hasNext());
            Assert.assertEquals(false, content.endOfContent());
            reader = new BufferedReader(new InputStreamReader(content, "UTF-8"));
            line = reader.readLine();
            Assert.assertEquals("<html><head><title>Apache Tomcat/6.0.24 - Rapport d'erreur</title>", line.substring(0, line.indexOf("<style>")));
            line = reader.readLine();
            Assert.assertNull(line);
            Assert.assertEquals("text/html", content.getPayloadContentType());
            Assert.assertEquals("text/html;charset=utf-8", content.getPayloadContentHeader());
            Assert.assertEquals("utf-8", content.getPayloadEncoding());
            Assert.assertEquals(true, content.endOfContent());

            // Next record
            Assert.assertEquals(true, warcParser.hasNext());
            record = warcParser.next();
            Assert.assertNotNull(record);

            // Header
            header = record.getHeader();
            Assert.assertNotNull(header);
            Assert.assertNull(header.getType());
            Assert.assertEquals(true, header.hasNext());
            headerItem = header.next();
            Assert.assertNotNull(headerItem);
            Assert.assertEquals(WarcRecordHeader.WARC_TYPE, headerItem.getKey());
            Assert.assertEquals("request", headerItem.getValue());
            header.skipToEnd();

            // Content
            content = record.getContent();
            Assert.assertNotNull(content);
            Assert.assertEquals(true, content.hasFields());
            Assert.assertEquals(true, content.isHTTP());
            Assert.assertEquals(true, content.isRequest());
            Assert.assertEquals(false, content.hasStatusLine());
            Assert.assertEquals(true, content.hasRequestLine());
            WarcRecordContent.HttpRequestLine request = content.getRequestLine();
            Assert.assertEquals("GET /robots.txt HTTP/1.0", request.getLine());
            Assert.assertEquals("GET", request.getMethod());
            Assert.assertEquals("/robots.txt", request.getUri());
            Assert.assertEquals("HTTP/1.0", request.getVersion());
            field = content.next();
            Assert.assertNotNull(field);
            Assert.assertEquals("User-Agent", field.getKey());
            Assert.assertEquals("Mozilla/5.0 (compatible; heritrix/3.1.0 +http://owark.org)", field.getValue());
            field = content.next();
            field = content.next();
            field = content.next();
            Assert.assertNotNull(field);
            Assert.assertEquals("Host", field.getKey());
            Assert.assertEquals("dyomedea.com", field.getValue());
            Assert.assertEquals(false, content.hasNext());
            Assert.assertEquals(true, content.endOfContent());

            // Skip record
            Assert.assertEquals(true, warcParser.hasNext());
            record = warcParser.next();
            Assert.assertNotNull(record);
            record.skipToEnd();
            Assert.assertEquals(true, warcParser.hasNext());
            record = warcParser.next();

            // Header
            header = record.getHeader();
            Assert.assertNotNull(header);
            Assert.assertNull(header.getType());
            Assert.assertEquals(true, header.hasNext());
            headerItem = header.next();
            Assert.assertNotNull(headerItem);
            Assert.assertEquals(WarcRecordHeader.WARC_TYPE, headerItem.getKey());
            Assert.assertEquals("response", headerItem.getValue());
            record.skipToEnd();

            // Go to last record
            while (warcParser.hasNext()) {
                record = warcParser.next();
                Assert.assertNotNull(record);
                record.skipToEnd();
            }
            Assert.assertEquals(69, warcParser.getRecordCount());
            Assert.assertEquals("metadata", record.getType());
        } finally {
            in.close();
        }
    }

    /** Iterates over the header and the content fields of the first record, then skips it. */
    @Test
    public void skipToEnd() throws IOException, WarcParser.WarcException {
        InputStream in = openSampleArchive();
        try {
            WarcParser warcParser = new WarcParser(in);
            Assert.assertEquals(true, warcParser.hasNext());
            WarcRecord record = warcParser.next();
            WarcRecordHeader header = record.getHeader();
            while (header.hasNext()) {
                Assert.assertNotNull(header.next());
            }
            WarcRecordContent content = record.getContent();
            while (content.hasNext()) {
                Assert.assertNotNull(content.next());
            }
            record.skipToEnd();
        } finally {
            in.close();
        }
    }
}

File diff suppressed because it is too large Load Diff

View File

@ -1,11 +0,0 @@
Pipelines in this directory are called by the scheduler.
Their name is the name of the corresponding action.
Inputs:
* data: the action
Outputs: None
These pipelines must take care of removing the action from the queue once they are done.

View File

@ -1,330 +0,0 @@
<p:config xmlns:p="http://www.orbeon.com/oxf/pipeline" xmlns:oxf="http://www.orbeon.com/oxf/processors" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:saxon="http://saxon.sf.net/">
<p:param name="data" type="input"/>
<!-- Look if the resource has already been archived for that set -->
<!-- The query is run through /data-access.xpl against the index.xml located
under /action/@directory. It evaluates to a boolean telling whether that index
already holds an archive element whose @url equals /action/@url. The result is
exposed as #duplicate and tested by the p:choose that follows.
NOTE(review): $(url) is presumably replaced by data-access.xpl with the value of
the "url" parameter declared in the config; confirm against data-access.xpl. -->
<p:processor name="oxf:pipeline">
<p:input name="config" href="/data-access.xpl"/>
<p:input name="data" transform="oxf:xslt" href="#data">
<config xsl:version="2.0">
<relpath>
<xsl:value-of select="/action/@directory"/>
<xsl:text>index.xml</xsl:text>
</relpath>
<operation>read</operation>
<type>xquery</type>
<parameter name="url" type="string">
<xsl:value-of select="/action/@url"/>
</parameter>
</config>
</p:input>
<p:input name="param">
<xquery><![CDATA[
boolean(//archive[@url = $(url)])
]]></xquery>
</p:input>
<p:output name="data" id="duplicate" debug="duplicate"/>
</p:processor>
<p:choose href="#duplicate">
<p:when test="/*/* = 'true'">
<!-- Already archived, nothing to do -->
<!-- Update the queue -->
<p:processor name="oxf:pipeline">
<p:input name="config" href="/data-access.xpl"/>
<p:input name="data" transform="oxf:xslt" href="#data">
<config xsl:version="2.0">
<relpath>queue.xml</relpath>
<operation>write</operation>
<type>xquery</type>
<parameter name="uuid" type="string">
<xsl:value-of select="/action/@uuid"/>
</parameter>
</config>
</p:input>
<p:input name="param">
<xquery><![CDATA[
for $a in /queue/action where $a/@uuid = $(uuid) return
update
delete $a
]]></xquery>
</p:input>
<p:output name="data" id="response4" debug="response"/>
</p:processor>
<p:processor name="oxf:null-serializer">
<p:input name="data" href="#response4"/>
</p:processor>
</p:when>
<p:otherwise>
<!-- Otherwise, archive the resource... -->
<!-- Fetch the resource -->
<!-- The URL is taken from /action/@url and the request carries the User-Agent
header configured in oxf:/config.xml (config/user-agent). The fetched result is
exposed as #archive; later steps read /archive/response/document/@content-type
from it.
NOTE(review): the "archive" mode is presumably a project-specific extension of
oxf:url-generator rather than a stock mode; confirm where it is implemented. -->
<p:processor name="oxf:url-generator">
<p:input name="config" transform="oxf:xslt" href="#data">
<config xsl:version="2.0">
<url>
<xsl:value-of select="/action/@url"/>
</url>
<header>
<name>User-Agent</name>
<value>
<xsl:value-of select="doc('oxf:/config.xml')/config/user-agent"/>
</value>
</header>
<mode>archive</mode>
</config>
</p:input>
<p:output name="data" id="archive" debug="archive"/>
</p:processor>
<!-- Store the archive in the database -->
<p:processor name="oxf:pipeline">
<p:input name="config" href="/data-access.xpl"/>
<p:input name="data" transform="oxf:xslt" href="#data">
<config xsl:version="2.0">
<relpath>
<xsl:value-of select="/action/@directory"/>
<xsl:value-of select="/action/@filename"/>
</relpath>
<operation>write</operation>
<type>document</type>
</config>
</p:input>
<p:input name="param" href="#archive"/>
<p:output name="data" id="response2"/>
</p:processor>
<p:processor name="oxf:null-serializer">
<p:input name="data" href="#response2"/>
</p:processor>
<!-- Test the type of document to see if it needs to be rewritten -->
<p:choose href="#archive">
<!-- HTML or CSS document: need to update the links... -->
<p:when test="/archive/response/document/@content-type=('text/html', 'text/css')">
<!-- Call the corresponding pipeline to extract the links and rewrite them -->
<p:processor name="oxf:url-generator">
<p:input name="config" transform="oxf:xslt" href="#archive">
<config xsl:version="2.0">
<url>
<xsl:text>oxf:/actions/mediatypes/</xsl:text>
<xsl:value-of select="substring-after(/archive/response/document/@content-type, '/')"/>
<xsl:text>.xpl</xsl:text>
</url>
</config>
</p:input>
<p:output name="data" id="pipeline"/>
</p:processor>
<p:processor name="oxf:pipeline">
<p:input name="config" href="#pipeline"/>
<p:input name="archive" href="#archive"/>
<p:output name="rewritten" id="rewritten"/>
<p:output name="links" id="links"/>
</p:processor>
<!-- It's a hack so that the document is not submitted as text through the xforms:submit processor... -->
<!-- The stylesheet copies the whole #rewritten document inside a new <document>
root element; the wrapped result is exposed as #rewritten-embedded and is what
gets stored by the next step. -->
<p:processor name="oxf:xslt">
<p:input name="config">
<document xsl:version="2.0">
<xsl:copy-of select="/"/>
</document>
</p:input>
<p:input name="data" href="#rewritten"/>
<p:output name="data" id="rewritten-embedded"/>
</p:processor>
<!-- Store the rewritten document in the database -->
<p:processor name="oxf:pipeline">
<p:input name="config" href="/data-access.xpl"/>
<p:input name="data" transform="oxf:xslt" href="#data">
<config xsl:version="2.0">
<relpath>
<xsl:value-of select="/action/@directory"/>
<xsl:text>rewritten-</xsl:text>
<xsl:value-of select="/action/@filename"/>
</relpath>
<operation>write</operation>
<type>document</type>
</config>
</p:input>
<p:input name="param" href="#rewritten-embedded"/>
<p:output name="data" id="response3"/>
</p:processor>
<p:processor name="oxf:null-serializer">
<p:input name="data" href="#response3"/>
</p:processor>
<!-- Update the archive index -->
<p:processor name="oxf:pipeline">
<p:input name="config" href="/data-access.xpl"/>
<p:input name="data" transform="oxf:xslt" href="#data">
<config xsl:version="2.0">
<relpath>
<xsl:value-of select="/action/@directory"/>
<xsl:text>index.xml</xsl:text>
</relpath>
<operation>write</operation>
<type>xquery</type>
<parameter name="url" type="string">
<xsl:value-of select="/action/@url"/>
</parameter>
<parameter name="filename" type="string">
<xsl:value-of select="/action/@filename"/>
</parameter>
<parameter name="filename-rewritten" type="string">
<xsl:text>rewritten-</xsl:text>
<xsl:value-of select="/action/@filename"/>
</parameter>
</config>
</p:input>
<p:input name="param">
<xquery><![CDATA[
for $as in /archive-set
return
update
insert <archive url=$(url) href=$(filename) href-rewritten=$(filename-rewritten) dateTime="{current-dateTime()}"/>
into $as
]]></xquery>
</p:input>
<p:output name="data" id="response1"/>
</p:processor>
<p:processor name="oxf:null-serializer">
<p:input name="data" href="#response1"/>
</p:processor>
<!-- Update the queue -->
<!-- Input: #data (the current action) and #links (the links extracted by the
media type pipeline), aggregated under a single <root> element.
The query performs two updates on queue.xml:
1. If #links holds at least one link with an @abs-href, insert one new
"archive-resource" action per distinct @abs-href. Each new action gets a fresh
uuid (util:uuid()), the priority and directory of the current action, and the
@filename of the first link carrying that @abs-href.
2. Delete the current action, matched by its uuid, whether or not links were found. -->
<p:processor name="oxf:pipeline">
<p:input name="config" href="/data-access.xpl"/>
<p:input name="data" transform="oxf:xslt" href="aggregate('root', #data, #links)">
<config xsl:version="2.0">
<relpath>queue.xml</relpath>
<operation>write</operation>
<type>xquery</type>
<parameter name="directory" type="string">
<xsl:value-of select="/root/action/@directory"/>
</parameter>
<parameter name="uuid" type="string">
<xsl:value-of select="/root/action/@uuid"/>
</parameter>
<parameter name="priority" type="string">
<xsl:value-of select="/root/action/@priority"/>
</parameter>
<parameter name="links" type="node-set">
<xsl:copy-of select="/root/links"/>
</parameter>
</config>
</p:input>
<p:input name="param">
<xquery><![CDATA[
declare namespace util = "http://exist-db.org/xquery/util";
declare variable $links := $(links);
for $q in /queue[$links/link/@abs-href]
return
update
insert
for $href in distinct-values($links/link/@abs-href)
let $link := $links/link[@abs-href = $href][1]
return <action priority=$(priority) uuid="{util:uuid()}" type="archive-resource" url="{$link/@abs-href}" directory=$(directory) filename="{$link/@filename}"/>
into $q,
for $a in /queue/action where $a/@uuid = $(uuid) return
update
delete $a
]]></xquery>
</p:input>
<p:output name="data" id="response4" debug="response"/>
</p:processor>
<p:processor name="oxf:null-serializer">
<p:input name="data" href="#response4"/>
</p:processor>
</p:when>
<!-- Otherwise: no need to rewrite -->
<p:otherwise>
<!-- Update the archive index -->
<p:processor name="oxf:pipeline">
<p:input name="config" href="/data-access.xpl"/>
<p:input name="data" transform="oxf:xslt" href="#data">
<config xsl:version="2.0">
<relpath>
<xsl:value-of select="/action/@directory"/>
<xsl:text>index.xml</xsl:text>
</relpath>
<operation>write</operation>
<type>xquery</type>
<parameter name="url" type="string">
<xsl:value-of select="/action/@url"/>
</parameter>
<parameter name="filename" type="string">
<xsl:value-of select="/action/@filename"/>
</parameter>
</config>
</p:input>
<p:input name="param">
<xquery><![CDATA[
for $as in /archive-set
return
update
insert <archive url=$(url) href=$(filename) dateTime="{current-dateTime()}"/>
into $as
]]></xquery>
</p:input>
<p:output name="data" id="response1"/>
</p:processor>
<p:processor name="oxf:null-serializer">
<p:input name="data" href="#response1"/>
</p:processor>
<!-- Update the queue -->
<p:processor name="oxf:pipeline">
<p:input name="config" href="/data-access.xpl"/>
<p:input name="data" transform="oxf:xslt" href="#data">
<config xsl:version="2.0">
<relpath>queue.xml</relpath>
<operation>write</operation>
<type>xquery</type>
<parameter name="uuid" type="string">
<xsl:value-of select="/action/@uuid"/>
</parameter>
</config>
</p:input>
<p:input name="param">
<xquery><![CDATA[
for $a in /queue/action where $a/@uuid = $(uuid) return
update
delete $a
]]></xquery>
</p:input>
<p:output name="data" id="response4" debug="response"/>
</p:processor>
<p:processor name="oxf:null-serializer">
<p:input name="data" href="#response4"/>
</p:processor>
</p:otherwise>
</p:choose>
</p:otherwise>
</p:choose>
</p:config>

View File

@ -1,96 +0,0 @@
<!--
Create a new archive
-->
<p:config xmlns:p="http://www.orbeon.com/oxf/pipeline" xmlns:oxf="http://www.orbeon.com/oxf/processors" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:xforms="http://www.w3.org/2002/xforms"
xmlns:xxforms="http://orbeon.org/oxf/xml/xforms" xmlns:exist="http://exist.sourceforge.net/NS/exist" xmlns:saxon="http://saxon.sf.net/" xmlns:pipeline="java:org.orbeon.oxf.processor.pipeline.PipelineFunctionLibrary">
<p:param name="data" type="input"/>
<!-- Build the data-access configuration (#data-access-data) used by the steps below.
Parameters derived from the incoming action:
* directory: the action uuid with every '-' replaced by '/', followed by '/'
* filename: the hexBinary form of the UTF-8 encoded URL, followed by '.xml'
* uuid, url: copied from the action
* priority-resource: the action priority + 2 (used for the archive-resource action)
* priority-package: the action priority + 1 (used for the package-archive action)
NOTE(review): oxf:unsafe-xslt is presumably required here because of the
saxon:string-to-hexBinary extension function; confirm. -->
<p:processor name="oxf:unsafe-xslt">
<p:input name="data" href="#data"/>
<p:input name="config">
<config xsl:version="2.0">
<relpath>queue.xml</relpath>
<operation>write</operation>
<type>xquery</type>
<parameter name="directory" type="string">
<xsl:value-of select="translate(/action/@uuid, '-', '/')"/>
<xsl:text>/</xsl:text>
</parameter>
<parameter name="filename" type="string">
<xsl:value-of select="saxon:string-to-hexBinary(/action/@url, 'utf-8')"/>
<xsl:text>.xml</xsl:text>
</parameter>
<parameter name="uuid" type="string">
<xsl:value-of select="/action/@uuid"/>
</parameter>
<parameter name="url" type="string">
<xsl:value-of select="/action/@url"/>
</parameter>
<parameter name="priority-resource" type="string">
<xsl:value-of select="/action/@priority + 2"/>
</parameter>
<parameter name="priority-package" type="string">
<xsl:value-of select="/action/@priority + 1"/>
</parameter>
</config>
</p:input>
<p:output name="data" id="data-access-data"/>
</p:processor>
<!-- Update queue.xml: insert an "archive-resource" action for the URL and a
"package-archive" action for the directory, each with a fresh uuid
(util:uuid()), then delete the current action, matched by its uuid. -->
<p:processor name="oxf:pipeline">
<p:input name="config" href="/data-access.xpl"/>
<p:input name="data" href="#data-access-data"/>
<p:input name="param">
<xquery><![CDATA[
declare namespace util = "http://exist-db.org/xquery/util";
for $q in /queue return
update
insert (<action priority=$(priority-resource) uuid="{util:uuid()}" type="archive-resource" url=$(url) directory=$(directory) filename=$(filename)/>,
<action priority=$(priority-package) uuid="{util:uuid()}" type="package-archive" directory=$(directory)/>)
into $q,
for $a in /queue/action where $a/@uuid = $(uuid) return
update
delete $a
]]></xquery>
</p:input>
<p:output name="data" id="response" debug="response"/>
</p:processor>
<!-- The response is not used; it is only fed to the null serializer
(presumably so that the step above is actually executed; confirm). -->
<p:processor name="oxf:null-serializer">
<p:input name="data" href="#response"/>
</p:processor>
<!-- Create the index of the new archive set: write an index.xml document under
the directory computed above. Its root is an <archive-set> element carrying the
@url and @uuid attributes of the incoming action. -->
<p:processor name="oxf:pipeline">
<p:input name="config" href="/data-access.xpl"/>
<p:input name="data" transform="oxf:xslt" href="#data-access-data">
<config xsl:version="2.0">
<relpath>
<xsl:value-of select="/config/parameter[@name='directory']"/>
<xsl:text>index.xml</xsl:text>
</relpath>
<operation>write</operation>
<type>document</type>
</config>
</p:input>
<p:input name="param" transform="oxf:xslt" href="#data">
<archive-set xsl:version="2.0">
<xsl:copy-of select="/action/@url|/action/@uuid"/>
</archive-set>
</p:input>
<p:output name="data" id="response2" debug="response2"/>
</p:processor>
<!-- The response is not used; it is only fed to the null serializer. -->
<p:processor name="oxf:null-serializer">
<p:input name="data" href="#response2"/>
</p:processor>
</p:config>

View File

@ -1,695 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
HERITRIX 3 CRAWL JOB CONFIGURATION FILE
This is a relatively minimal configuration suitable for many crawls.
Commented-out beans and properties are provided as an example; values
shown in comments reflect the actual defaults which are in effect
if not otherwise specified. (To change from the default
behavior, uncomment AND alter the shown values.)
-->
<beans xmlns="http://www.springframework.org/schema/beans"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:context="http://www.springframework.org/schema/context"
xmlns:aop="http://www.springframework.org/schema/aop"
xmlns:tx="http://www.springframework.org/schema/tx"
xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans-3.0.xsd
http://www.springframework.org/schema/aop http://www.springframework.org/schema/aop/spring-aop-3.0.xsd
http://www.springframework.org/schema/tx http://www.springframework.org/schema/tx/spring-tx-3.0.xsd
http://www.springframework.org/schema/context http://www.springframework.org/schema/context/spring-context-3.0.xsd">
<context:annotation-config/>
<!--
OVERRIDES
Values elsewhere in the configuration may be replaced ('overridden')
by a Properties map declared in a PropertiesOverrideConfigurer,
using a dotted-bean-path to address individual bean properties.
This allows us to collect a few of the most-often changed values
in an easy-to-edit format here at the beginning of the model
configuration.
-->
<!-- overrides from a text property list -->
<bean id="simpleOverrides" class="org.springframework.beans.factory.config.PropertyOverrideConfigurer">
<property name="properties">
<value>
# This Properties map is specified in the Java 'property list' text format
# http://java.sun.com/javase/6/docs/api/java/util/Properties.html#load%28java.io.Reader%29
metadata.operatorContactUrl=http://owark.org
metadata.jobName=basic
metadata.description=Basic crawl starting with useful defaults
##..more?..##
</value>
</property>
</bean>
<!-- overrides from declared <prop> elements, more easily allowing
multiline values or even declared beans -->
<bean id="longerOverrides" class="org.springframework.beans.factory.config.PropertyOverrideConfigurer">
<property name="properties">
<props>
<prop key="seeds.textSource.value">
# URLS HERE
<url xmlns=""/>
</prop>
</props>
</property>
</bean>
<!-- CRAWL METADATA: including identification of crawler/operator -->
<bean id="metadata" class="org.archive.modules.CrawlMetadata" autowire="byName">
<property name="operatorContactUrl" value="[see override above]"/>
<property name="jobName" value="[see override above]"/>
<property name="description" value="[see override above]"/>
<!-- <property name="robotsPolicyName" value="obey"/> -->
<!-- <property name="operator" value=""/> -->
<!-- <property name="operatorFrom" value=""/> -->
<!-- <property name="organization" value=""/> -->
<!-- <property name="audience" value=""/> -->
<!-- <property name="userAgentTemplate"
value="Mozilla/5.0 (compatible; heritrix/@VERSION@ +@OPERATOR_CONTACT_URL@)"/> -->
</bean>
<!-- SEEDS: crawl starting points
ConfigString allows simple, inline specification of a moderate
number of seeds; see below comment for example of using an
arbitrarily-large external file. -->
<bean id="seeds" class="org.archive.modules.seeds.TextSeedModule">
<property name="textSource">
<bean class="org.archive.spring.ConfigString">
<property name="value">
<value>
# [see override above]
</value>
</property>
</bean>
</property>
<!-- <property name='sourceTagSeeds' value='false'/> -->
<!-- <property name='blockAwaitingSeedLines' value='-1'/> -->
</bean>
<!-- SEEDS ALTERNATE APPROACH: specifying external seeds.txt file in
the job directory, similar to the H1 approach.
Use either the above, or this, but not both. -->
<!--
<bean id="seeds" class="org.archive.modules.seeds.TextSeedModule">
<property name="textSource">
<bean class="org.archive.spring.ConfigFile">
<property name="path" value="seeds.txt" />
</bean>
</property>
<property name='sourceTagSeeds' value='false'/>
<property name='blockAwaitingSeedLines' value='-1'/>
</bean>
-->
<!-- SCOPE: rules for which discovered URIs to crawl; order is very
important because last decision returned other than 'NONE' wins. -->
<bean id="scope" class="org.archive.modules.deciderules.DecideRuleSequence">
<!-- <property name="logToFile" value="false" /> -->
<property name="rules">
<list>
<!-- Begin by REJECTing all... -->
<bean class="org.archive.modules.deciderules.RejectDecideRule">
</bean>
<!-- ...then ACCEPT those within configured/seed-implied SURT prefixes... -->
<bean class="org.archive.modules.deciderules.surt.SurtPrefixedDecideRule">
<!-- <property name="seedsAsSurtPrefixes" value="true" /> -->
<!-- <property name="alsoCheckVia" value="false" /> -->
<!-- <property name="surtsSourceFile" value="" /> -->
<!-- <property name="surtsDumpFile" value="${launchId}/surts.dump" /> -->
<!-- <property name="surtsSource">
<bean class="org.archive.spring.ConfigString">
<property name="value">
<value>
# example.com
# http://www.example.edu/path1/
# +http://(org,example,
</value>
</property>
</bean>
</property> -->
</bean>
<!-- ...but REJECT those more than a configured link-hop-count from start... -->
<bean class="org.archive.modules.deciderules.TooManyHopsDecideRule">
<property name="maxHops" value="0" />
</bean>
<!-- ...but ACCEPT those transcluded (embeds, redirects) within a configured number of non-navigational hops... -->
<bean class="org.archive.modules.deciderules.TransclusionDecideRule">
<!-- <property name="maxTransHops" value="2" /> -->
<!-- <property name="maxSpeculativeHops" value="1" /> -->
</bean>
<!-- ...but REJECT those from a configurable (initially empty) set of REJECT SURTs... -->
<bean class="org.archive.modules.deciderules.surt.SurtPrefixedDecideRule">
<property name="decision" value="REJECT"/>
<property name="seedsAsSurtPrefixes" value="false"/>
<property name="surtsDumpFile" value="${launchId}/negative-surts.dump" />
<!-- <property name="surtsSource">
<bean class="org.archive.spring.ConfigFile">
<property name="path" value="negative-surts.txt" />
</bean>
</property> -->
</bean>
<!-- ...and REJECT those from a configurable (initially empty) set of URI regexes... -->
<bean class="org.archive.modules.deciderules.MatchesListRegexDecideRule">
<property name="decision" value="REJECT"/>
<!-- <property name="listLogicalOr" value="true" /> -->
<!-- <property name="regexList">
<list>
</list>
</property> -->
</bean>
<!-- ...and REJECT those with suspicious repeating path-segments... -->
<bean class="org.archive.modules.deciderules.PathologicalPathDecideRule">
<!-- <property name="maxRepetitions" value="2" /> -->
</bean>
<!-- ...and REJECT those with more than threshold number of path-segments... -->
<bean class="org.archive.modules.deciderules.TooManyPathSegmentsDecideRule">
<!-- <property name="maxPathDepth" value="20" /> -->
</bean>
<!-- ...but always ACCEPT those marked as prerequisite for another URI... -->
<bean class="org.archive.modules.deciderules.PrerequisiteAcceptDecideRule">
</bean>
<!-- ...but always REJECT those with unsupported URI schemes -->
<bean class="org.archive.modules.deciderules.SchemeNotInSetDecideRule">
</bean>
</list>
</property>
</bean>
<!--
PROCESSING CHAINS
Much of the crawler's work is specified by the sequential
application of swappable Processor modules. These Processors
are collected into three 'chains'. The CandidateChain is applied
to URIs being considered for inclusion, before a URI is enqueued
for collection. The FetchChain is applied to URIs when their
turn for collection comes up. The DispositionChain is applied
after a URI is fetched and analyzed/link-extracted.
-->
<!-- CANDIDATE CHAIN -->
<!-- first, processors are declared as top-level named beans -->
<bean id="candidateScoper" class="org.archive.crawler.prefetch.CandidateScoper">
</bean>
<bean id="preparer" class="org.archive.crawler.prefetch.FrontierPreparer">
<!-- <property name="preferenceDepthHops" value="-1" /> -->
<!-- <property name="preferenceEmbedHops" value="1" /> -->
<!-- <property name="canonicalizationPolicy">
<ref bean="canonicalizationPolicy" />
</property> -->
<!-- <property name="queueAssignmentPolicy">
<ref bean="queueAssignmentPolicy" />
</property> -->
<!-- <property name="uriPrecedencePolicy">
<ref bean="uriPrecedencePolicy" />
</property> -->
<!-- <property name="costAssignmentPolicy">
<ref bean="costAssignmentPolicy" />
</property> -->
</bean>
<!-- now, processors are assembled into ordered CandidateChain bean -->
<bean id="candidateProcessors" class="org.archive.modules.CandidateChain">
<property name="processors">
<list>
<!-- apply scoping rules to each individual candidate URI... -->
<ref bean="candidateScoper"/>
<!-- ...then prepare those ACCEPTed to be enqueued to frontier. -->
<ref bean="preparer"/>
</list>
</property>
</bean>
<!-- FETCH CHAIN -->
<!-- first, processors are declared as top-level named beans -->
<bean id="preselector" class="org.archive.crawler.prefetch.Preselector">
<!-- <property name="recheckScope" value="false" /> -->
<!-- <property name="blockAll" value="false" /> -->
<!-- <property name="blockByRegex" value="" /> -->
<!-- <property name="allowByRegex" value="" /> -->
</bean>
<bean id="preconditions" class="org.archive.crawler.prefetch.PreconditionEnforcer">
<!-- <property name="ipValidityDurationSeconds" value="21600" /> -->
<!-- <property name="robotsValidityDurationSeconds" value="86400" /> -->
<!-- <property name="calculateRobotsOnly" value="false" /> -->
</bean>
<bean id="fetchDns" class="org.archive.modules.fetcher.FetchDNS">
<!-- <property name="acceptNonDnsResolves" value="false" /> -->
<!-- <property name="digestContent" value="true" /> -->
<!-- <property name="digestAlgorithm" value="sha1" /> -->
</bean>
<bean id="fetchWhois" class="org.archive.modules.fetcher.FetchWhois">
<property name="specialQueryTemplates">
<map>
<entry key="whois.verisign-grs.com" value="domain %s" />
<entry key="whois.arin.net" value="z + %s" />
<entry key="whois.denic.de" value="-T dn %s" />
</map>
</property>
</bean>
<bean id="fetchHttp" class="org.archive.modules.fetcher.FetchHTTP">
<!-- <property name="useHTTP11" value="false" /> -->
<!-- <property name="maxLengthBytes" value="0" /> -->
<!-- <property name="timeoutSeconds" value="1200" /> -->
<!-- <property name="maxFetchKBSec" value="0" /> -->
<!-- <property name="defaultEncoding" value="ISO-8859-1" /> -->
<!-- <property name="shouldFetchBodyRule">
<bean class="org.archive.modules.deciderules.AcceptDecideRule"/>
</property> -->
<!-- <property name="soTimeoutMs" value="20000" /> -->
<!-- <property name="sendIfModifiedSince" value="true" /> -->
<!-- <property name="sendIfNoneMatch" value="true" /> -->
<!-- <property name="sendConnectionClose" value="true" /> -->
<!-- <property name="sendReferer" value="true" /> -->
<!-- <property name="sendRange" value="false" /> -->
<!-- <property name="ignoreCookies" value="false" /> -->
<!-- <property name="sslTrustLevel" value="OPEN" /> -->
<!-- <property name="acceptHeaders">
<list>
<value>Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8</value>
</list>
</property>
-->
<!-- <property name="httpBindAddress" value="" /> -->
<!-- <property name="httpProxyHost" value="" /> -->
<!-- <property name="httpProxyPort" value="0" /> -->
<!-- <property name="httpProxyUser" value="" /> -->
<!-- <property name="httpProxyPassword" value="" /> -->
<!-- <property name="digestContent" value="true" /> -->
<!-- <property name="digestAlgorithm" value="sha1" /> -->
</bean>
<bean id="extractorHttp" class="org.archive.modules.extractor.ExtractorHTTP">
</bean>
<bean id="extractorHtml" class="org.archive.modules.extractor.ExtractorHTML">
<!-- <property name="extractJavascript" value="true" /> -->
<!-- <property name="extractValueAttributes" value="true" /> -->
<!-- <property name="ignoreFormActionUrls" value="false" /> -->
<!-- <property name="extractOnlyFormGets" value="true" /> -->
<!-- <property name="treatFramesAsEmbedLinks" value="true" /> -->
<!-- <property name="ignoreUnexpectedHtml" value="true" /> -->
<!-- <property name="maxElementLength" value="1024" /> -->
<!-- <property name="maxAttributeNameLength" value="1024" /> -->
<!-- <property name="maxAttributeValueLength" value="16384" /> -->
</bean>
<bean id="extractorCss" class="org.archive.modules.extractor.ExtractorCSS">
</bean>
<bean id="extractorJs" class="org.archive.modules.extractor.ExtractorJS">
</bean>
<bean id="extractorSwf" class="org.archive.modules.extractor.ExtractorSWF">
</bean>
<!-- now, processors are assembled into ordered FetchChain bean -->
<bean id="fetchProcessors" class="org.archive.modules.FetchChain">
<property name="processors">
<list>
<!-- re-check scope, if so enabled... -->
<ref bean="preselector"/>
<!-- ...then verify or trigger prerequisite URIs fetched, allow crawling... -->
<ref bean="preconditions"/>
<!-- ...fetch if DNS URI... -->
<ref bean="fetchDns"/>
<ref bean="fetchWhois"/>
<!-- ...fetch if HTTP URI... -->
<ref bean="fetchHttp"/>
<!-- ...extract outlinks from HTTP headers... -->
<ref bean="extractorHttp"/>
<!-- ...extract outlinks from HTML content... -->
<ref bean="extractorHtml"/>
<!-- ...extract outlinks from CSS content... -->
<ref bean="extractorCss"/>
<!-- ...extract outlinks from Javascript content... -->
<ref bean="extractorJs"/>
<!-- ...extract outlinks from Flash content... -->
<ref bean="extractorSwf"/>
</list>
</property>
</bean>
<!-- DISPOSITION CHAIN -->
<!-- first, processors are declared as top-level named beans -->
<bean id="warcWriter" class="org.archive.modules.writer.WARCWriterProcessor">
<property name="compress" value="false" />
<!-- <property name="prefix" value="IAH" /> -->
<!-- <property name="suffix" value="${HOSTNAME}" /> -->
<!-- <property name="maxFileSizeBytes" value="1000000000" /> -->
<!-- <property name="poolMaxActive" value="1" /> -->
<!-- <property name="MaxWaitForIdleMs" value="500" /> -->
<!-- <property name="skipIdenticalDigests" value="false" /> -->
<!-- <property name="maxTotalBytesToWrite" value="0" /> -->
<!-- <property name="directory" value="${launchId}" /> -->
<!-- <property name="storePaths">
<list>
<value>warcs</value>
</list>
</property> -->
<!-- <property name="writeRequests" value="true" /> -->
<!-- <property name="writeMetadata" value="true" /> -->
<!-- <property name="writeRevisitForIdenticalDigests" value="true" /> -->
<!-- <property name="writeRevisitForNotModified" value="true" /> -->
</bean>
<bean id="candidates" class="org.archive.crawler.postprocessor.CandidatesProcessor">
<!-- <property name="seedsRedirectNewSeeds" value="true" /> -->
</bean>
<bean id="disposition" class="org.archive.crawler.postprocessor.DispositionProcessor">
<!-- <property name="delayFactor" value="5.0" /> -->
<!-- <property name="minDelayMs" value="3000" /> -->
<!-- <property name="respectCrawlDelayUpToSeconds" value="300" /> -->
<!-- <property name="maxDelayMs" value="30000" /> -->
<!-- <property name="maxPerHostBandwidthUsageKbSec" value="0" /> -->
</bean>
<!-- <bean id="rescheduler" class="org.archive.crawler.postprocessor.ReschedulingProcessor">
<property name="rescheduleDelaySeconds" value="-1" />
</bean> -->
<!-- now, processors are assembled into ordered DispositionChain bean -->
<bean id="dispositionProcessors" class="org.archive.modules.DispositionChain">
<property name="processors">
<list>
<!-- write to aggregate archival files... -->
<ref bean="warcWriter"/>
<!-- ...send each outlink candidate URI to CandidateChain,
and enqueue those ACCEPTed to the frontier... -->
<ref bean="candidates"/>
<!-- ...then update stats, shared-structures, frontier decisions -->
<ref bean="disposition"/>
<!-- <ref bean="rescheduler" /> -->
</list>
</property>
</bean>
<!-- CRAWLCONTROLLER: Control interface, unifying context -->
<bean id="crawlController"
class="org.archive.crawler.framework.CrawlController">
<!-- <property name="maxToeThreads" value="25" /> -->
<!-- <property name="pauseAtStart" value="true" /> -->
<!-- <property name="runWhileEmpty" value="false" /> -->
<!-- <property name="recorderInBufferBytes" value="524288" /> -->
<!-- <property name="recorderOutBufferBytes" value="16384" /> -->
<!-- <property name="scratchDir" value="scratch" /> -->
</bean>
<!-- FRONTIER: Record of all URIs discovered and queued-for-collection -->
<bean id="frontier"
class="org.archive.crawler.frontier.BdbFrontier">
<!-- <property name="queueTotalBudget" value="-1" /> -->
<!-- <property name="balanceReplenishAmount" value="3000" /> -->
<!-- <property name="errorPenaltyAmount" value="100" /> -->
<!-- <property name="precedenceFloor" value="255" /> -->
<!-- <property name="queuePrecedencePolicy">
<bean class="org.archive.crawler.frontier.precedence.BaseQueuePrecedencePolicy" />
</property> -->
<!-- <property name="snoozeLongMs" value="300000" /> -->
<!-- <property name="retryDelaySeconds" value="900" /> -->
<!-- <property name="maxRetries" value="30" /> -->
<!-- <property name="recoveryLogEnabled" value="true" /> -->
<!-- <property name="maxOutlinks" value="6000" /> -->
<!-- <property name="extractIndependently" value="false" /> -->
<!-- <property name="outbound">
<bean class="java.util.concurrent.ArrayBlockingQueue">
<constructor-arg value="200"/>
<constructor-arg value="true"/>
</bean>
</property> -->
<!-- <property name="inbound">
<bean class="java.util.concurrent.ArrayBlockingQueue">
<constructor-arg value="40000"/>
<constructor-arg value="true"/>
</bean>
</property> -->
<!-- <property name="dumpPendingAtClose" value="false" /> -->
</bean>
<!-- URI UNIQ FILTER: Used by frontier to remember already-included URIs -->
<bean id="uriUniqFilter"
class="org.archive.crawler.util.BdbUriUniqFilter">
</bean>
<!--
EXAMPLE SETTINGS OVERLAY SHEETS
Sheets allow some settings to vary by context - usually by URI context,
so that different sites or sections of sites can be treated differently.
Here are some example Sheets for common purposes. The SheetOverlaysManager
(below) automatically collects all Sheet instances declared among the
original beans, but others can be added during the crawl via the scripting
interface.
-->
<!-- forceRetire: any URI to which this sheet's settings are applied
will force its containing queue to 'retired' status. -->
<bean id='forceRetire' class='org.archive.spring.Sheet'>
<property name='map'>
<map>
<entry key='disposition.forceRetire' value='true'/>
</map>
</property>
</bean>
<!-- smallBudget: any URI to which this sheet's settings are applied
will give its containing queue small values for balanceReplenishAmount
(causing it to have shorter 'active' periods while other queues are
waiting) and queueTotalBudget (causing the queue to enter 'retired'
status once that expenditure is reached by URI attempts and errors) -->
<bean id='smallBudget' class='org.archive.spring.Sheet'>
<property name='map'>
<map>
<entry key='frontier.balanceReplenishAmount' value='20'/>
<entry key='frontier.queueTotalBudget' value='100'/>
</map>
</property>
</bean>
<!-- veryPolite: any URI to which this sheet's settings are applied
will cause its queue to take extra-long politeness snoozes -->
<bean id='veryPolite' class='org.archive.spring.Sheet'>
<property name='map'>
<map>
<entry key='disposition.delayFactor' value='10'/>
<entry key='disposition.minDelayMs' value='10000'/>
<entry key='disposition.maxDelayMs' value='1000000'/>
<entry key='disposition.respectCrawlDelayUpToSeconds' value='3600'/>
</map>
</property>
</bean>
<!-- highPrecedence: any URI to which this sheet's settings are applied
will give its containing queue a slightly-higher than default
queue precedence value. That queue will then be preferred over
other queues for active crawling, never waiting behind lower-
precedence queues.
NOTE(review): the entries below are identical to those of the smallBudget
sheet (frontier.balanceReplenishAmount and frontier.queueTotalBudget) and
none of them sets a precedence-related property, so this sheet does not
appear to do what is described above. It looks like a copy/paste of
smallBudget. TODO: confirm the intended precedence key before associating
this sheet with any URI. -->
<bean id='highPrecedence' class='org.archive.spring.Sheet'>
<property name='map'>
<map>
<entry key='frontier.balanceReplenishAmount' value='20'/>
<entry key='frontier.queueTotalBudget' value='100'/>
</map>
</property>
</bean>
<!--
EXAMPLE SETTINGS OVERLAY SHEET-ASSOCIATION
A SheetAssociation says certain URIs should have certain overlay Sheets
applied. This example applies two sheets to URIs matching two SURT-prefixes.
New associations may also be added mid-crawl using the scripting facility.
-->
<!--
<bean class='org.archive.crawler.spring.SurtPrefixesSheetAssociation'>
<property name='surtPrefixes'>
<list>
<value>http://(org,example,</value>
<value>http://(com,example,www,)/</value>
</list>
</property>
<property name='targetSheetNames'>
<list>
<value>veryPolite</value>
<value>smallBudget</value>
</list>
</property>
</bean>
-->
<!--
OPTIONAL BUT RECOMMENDED BEANS
-->
<!-- ACTIONDIRECTORY: disk directory for mid-crawl operations
Running job will watch directory for new files with URIs,
scripts, and other data to be processed during a crawl.
Every property is left commented out, so the bean runs with the class's
built-in settings; the commented values are presumably those defaults
(TODO confirm). -->
<bean id="actionDirectory" class="org.archive.crawler.framework.ActionDirectory">
<!-- <property name="actionDir" value="action" /> -->
<!-- <property name="doneDir" value="${launchId}/actions-done" /> -->
<!-- <property name="initialDelaySeconds" value="10" /> -->
<!-- <property name="delaySeconds" value="30" /> -->
</bean>
<!-- CRAWLLIMITENFORCER: stops crawl when it reaches configured limits.
No limit is configured here: all three properties are commented out.
A value of 0 presumably means "unlimited" (TODO confirm). -->
<bean id="crawlLimiter" class="org.archive.crawler.framework.CrawlLimitEnforcer">
<!-- <property name="maxBytesDownload" value="0" /> -->
<!-- <property name="maxDocumentsDownload" value="0" /> -->
<!-- <property name="maxTimeSeconds" value="0" /> -->
</bean>
<!-- CHECKPOINTSERVICE: checkpointing assistance.
Both properties are commented out, so the class's own settings apply;
an interval of -1 presumably disables periodic checkpoints (TODO confirm). -->
<bean id="checkpointService"
class="org.archive.crawler.framework.CheckpointService">
<!-- <property name="checkpointIntervalMinutes" value="-1"/> -->
<!-- <property name="checkpointsDir" value="checkpoints"/> -->
</bean>
<!--
OPTIONAL BEANS
Uncomment and expand as needed, or if non-default alternate
implementations are preferred.
-->
<!-- CANONICALIZATION POLICY -->
<!--
<bean id="canonicalizationPolicy"
class="org.archive.modules.canonicalize.RulesCanonicalizationPolicy">
<property name="rules">
<list>
<bean class="org.archive.modules.canonicalize.LowercaseRule" />
<bean class="org.archive.modules.canonicalize.StripUserinfoRule" />
<bean class="org.archive.modules.canonicalize.StripWWWNRule" />
<bean class="org.archive.modules.canonicalize.StripSessionIDs" />
<bean class="org.archive.modules.canonicalize.StripSessionCFIDs" />
<bean class="org.archive.modules.canonicalize.FixupQueryString" />
</list>
</property>
</bean>
-->
<!-- QUEUE ASSIGNMENT POLICY -->
<!--
<bean id="queueAssignmentPolicy"
class="org.archive.crawler.frontier.SurtAuthorityQueueAssignmentPolicy">
<property name="forceQueueAssignment" value="" />
<property name="deferToPrevious" value="true" />
<property name="parallelQueues" value="1" />
</bean>
-->
<!-- URI PRECEDENCE POLICY -->
<!--
<bean id="uriPrecedencePolicy"
class="org.archive.crawler.frontier.precedence.CostUriPrecedencePolicy">
</bean>
-->
<!-- COST ASSIGNMENT POLICY -->
<!--
<bean id="costAssignmentPolicy"
class="org.archive.crawler.frontier.UnitCostAssignmentPolicy">
</bean>
-->
<!-- CREDENTIAL STORE: HTTP authentication or FORM POST credentials -->
<!--
<bean id="credentialStore"
class="org.archive.modules.credential.CredentialStore">
</bean>
-->
<!-- DISK SPACE MONITOR:
Pauses the crawl if disk space at monitored paths falls below minimum threshold -->
<!--
<bean id="diskSpaceMonitor" class="org.archive.crawler.monitor.DiskSpaceMonitor">
<property name="pauseThresholdMiB" value="500" />
<property name="monitorConfigPaths" value="true" />
<property name="monitorPaths">
<list>
<value>PATH</value>
</list>
</property>
</bean>
-->
<!--
REQUIRED STANDARD BEANS
It will be very rare to replace or reconfigure the following beans.
-->
<!-- STATISTICSTRACKER: standard stats/reporting collector.
Collaborators are injected by bean name (autowire="byName"); the report
list and all other properties are left commented out. -->
<bean id="statisticsTracker"
class="org.archive.crawler.reporting.StatisticsTracker" autowire="byName">
<!-- <property name="reports">
<list>
<bean id="crawlSummaryReport" class="org.archive.crawler.reporting.CrawlSummaryReport" />
<bean id="seedsReport" class="org.archive.crawler.reporting.SeedsReport" />
<bean id="hostsReport" class="org.archive.crawler.reporting.HostsReport" />
<bean id="sourceTagsReport" class="org.archive.crawler.reporting.SourceTagsReport" />
<bean id="mimetypesReport" class="org.archive.crawler.reporting.MimetypesReport" />
<bean id="responseCodeReport" class="org.archive.crawler.reporting.ResponseCodeReport" />
<bean id="processorsReport" class="org.archive.crawler.reporting.ProcessorsReport" />
<bean id="frontierSummaryReport" class="org.archive.crawler.reporting.FrontierSummaryReport" />
<bean id="frontierNonemptyReport" class="org.archive.crawler.reporting.FrontierNonemptyReport" />
<bean id="toeThreadsReport" class="org.archive.crawler.reporting.ToeThreadsReport" />
</list>
</property> -->
<!-- <property name="reportsDir" value="${launchId}/reports" /> -->
<!-- <property name="liveHostReportSize" value="20" /> -->
<!-- <property name="intervalSeconds" value="20" /> -->
<!-- <property name="keepSnapshotsCount" value="5" /> -->
<!-- (a second, identical commented-out liveHostReportSize line used to follow here) -->
</bean>
<!-- CRAWLERLOGGERMODULE: shared logging facility.
Keep the bean id 'loggerModule': the get-heritrix-warc pipeline locates the
crawl log through the job's config-file entry keyed
'loggerModule.crawlLogPath'. All properties are commented out, so the
class's own paths apply. -->
<bean id="loggerModule"
class="org.archive.crawler.reporting.CrawlerLoggerModule">
<!-- <property name="path" value="${launchId}/logs" /> -->
<!-- <property name="crawlLogPath" value="crawl.log" /> -->
<!-- <property name="alertsLogPath" value="alerts.log" /> -->
<!-- <property name="progressLogPath" value="progress-statistics.log" /> -->
<!-- <property name="uriErrorsLogPath" value="uri-errors.log" /> -->
<!-- <property name="runtimeErrorsLogPath" value="runtime-errors.log" /> -->
<!-- <property name="nonfatalErrorsLogPath" value="nonfatal-errors.log" /> -->
<!-- <property name="logExtraInfo" value="false" /> -->
</bean>
<!-- SHEETOVERLAYMANAGER: manager of sheets of contextual overlays.
Autowired by type, so its dependencies are discovered among the beans
declared in this file rather than listed here. That is presumably how
sheet-association beans such as the SurtPrefixesSheetAssociation in the
commented example are picked up (TODO confirm). The names
'SheetForSurtPrefix' / 'SheetForDecideRuled' used by an earlier version
of this comment do not match any class referenced in this file. -->
<bean id="sheetOverlaysManager" autowire="byType"
class="org.archive.crawler.spring.SheetOverlaysManager">
</bean>
<!-- BDBMODULE: shared BDB-JE disk persistence manager.
Referenced by id ('bdb') from the commented-out properties of the
cookieStorage and serverCache beans below. All of its own properties are
commented out, so the class's own settings apply. -->
<bean id="bdb"
class="org.archive.bdb.BdbModule">
<!-- <property name="dir" value="state" /> -->
<!-- <property name="cachePercent" value="60" /> -->
<!-- <property name="useSharedCache" value="true" /> -->
<!-- <property name="expectedConcurrency" value="25" /> -->
</bean>
<!-- BDBCOOKIESTORAGE: disk-based cookie storage for FetchHTTP.
No property is set explicitly: the load/save files and the explicit
reference to the 'bdb' bean are all commented out.
NOTE(review): with the 'bdb' reference commented out, the BdbModule must
reach this bean some other way (e.g. annotation-driven wiring); confirm. -->
<bean id="cookieStorage"
class="org.archive.modules.fetcher.BdbCookieStorage">
<!-- <property name="cookiesLoadFile"><null/></property> -->
<!-- <property name="cookiesSaveFile"><null/></property> -->
<!-- <property name="bdb">
<ref bean="bdb"/>
</property> -->
</bean>
<!-- SERVERCACHE: shared cache of server/host info.
The explicit reference to the 'bdb' bean is commented out, so no property
is set here.
NOTE(review): confirm how the BdbModule is supplied to this bean. -->
<bean id="serverCache"
class="org.archive.modules.net.BdbServerCache">
<!-- <property name="bdb">
<ref bean="bdb"/>
</property> -->
</bean>
<!-- CONFIG PATH CONFIGURER: required helper making crawl paths relative
to crawler-beans.cxml file, and tracking crawl files for web UI.
Declared without any property; do not remove it. -->
<bean id="configPathConfigurer"
class="org.archive.spring.ConfigPathConfigurer">
</bean>
</beans>

View File

@ -1,20 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="2.0">
<!-- Produces a crawl job configuration: copies crawler-beans-template.cxml
verbatim, replacing every <url> placeholder element with the @url
attribute of the input <action> document. -->
<!-- Root <action> element of the input document. -->
<xsl:variable name="action" select="/action"/>
<!-- Entry point: the input tree itself is not copied; the template document
(resolved relative to this stylesheet) is processed instead. -->
<xsl:template match="/">
<xsl:apply-templates select="doc('crawler-beans-template.cxml')/*"/>
</xsl:template>
<!-- Identity rule: copy attributes and nodes (including comments) unchanged. -->
<xsl:template match="@* | node()">
<xsl:copy>
<xsl:apply-templates select="@* | node()"/>
</xsl:copy>
</xsl:template>
<!-- Placeholder rule: an element named 'url' (no namespace) in the template
is replaced by the text of action/@url; the element itself is dropped. -->
<xsl:template match="url">
<xsl:value-of select="$action/@url"/>
</xsl:template>
</xsl:stylesheet>

View File

@ -1,223 +0,0 @@
<!--
Check if a job is terminated and get its WARC archive
-->
<p:config xmlns:p="http://www.orbeon.com/oxf/pipeline" xmlns:oxf="http://www.orbeon.com/oxf/processors" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:xforms="http://www.w3.org/2002/xforms"
xmlns:xxforms="http://orbeon.org/oxf/xml/xforms" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:exist="http://exist.sourceforge.net/NS/exist" xmlns:saxon="http://saxon.sf.net/"
xmlns:pipeline="java:org.orbeon.oxf.processor.pipeline.PipelineFunctionLibrary">
<p:param name="data" type="input"/>
<!-- Get the job: HTTP GET on the URL held in the input action's
@heritrix-job-url attribute, authenticating with the Heritrix
username/password read from oxf:/config.xml and asking for an XML
response (Accept: application/xml). The response is exposed as
#heritrix-job and tested by the p:choose below. -->
<p:processor name="oxf:xforms-submission">
<p:input name="submission" transform="oxf:xslt" href="#data">
<xforms:submission xsl:version="2.0" method="get" action="{/action/@heritrix-job-url}" xxforms:username="{doc('oxf:/config.xml')/config/heritrix/username}"
xxforms:password="{doc('oxf:/config.xml')/config/heritrix/password}" xxforms:preemptive-authentication="false">
<xforms:header combine="replace">
<xforms:name>Accept</xforms:name>
<xforms:value>application/xml</xforms:value>
</xforms:header>
</xforms:submission>
</p:input>
<p:input name="request">
<instance/>
</p:input>
<p:output name="response" id="heritrix-job" debug="heritrix-job"/>
</p:processor>
<p:choose href="#heritrix-job">
<p:when test="/job/crawlControllerState='FINISHED'">
<!-- The job is finished, we can get its archive... -->
<!-- Scan the directory to find the name of the WARC file: fetch the URL
that the job report lists under configFiles for the key
'warcWriter.storePaths[0]', using the Heritrix credentials from
oxf:/config.xml. The next step reads the result (#warc-dir-list) as
/root/html/body/a, i.e. it expects an HTML directory listing. -->
<p:processor name="oxf:url-generator">
<p:input name="config" transform="oxf:xslt" href="#heritrix-job">
<config xsl:version="2.0">
<url>
<xsl:value-of select="/job/configFiles/value[key='warcWriter.storePaths[0]']/url"/>
</url>
<authentication>
<username>
<xsl:value-of select="doc('oxf:/config.xml')/config/heritrix/username"/>
</username>
<password>
<xsl:value-of select="doc('oxf:/config.xml')/config/heritrix/password"/>
</password>
<preemptive>false</preemptive>
</authentication>
</config>
</p:input>
<p:output name="data" id="warc-dir-list" debug="warc-dir-list"/>
</p:processor>
<!-- Next action: package.
Runs an XQuery update against queue.xml through /data-access.xpl: it
inserts a new 'package-heritrix-warc' action (fresh uuid, same priority,
url, directory and job URL, plus the WARC and crawl-log URLs) and deletes
the action currently being processed. The $(name) tokens in the query
correspond to the <parameter> elements built below; the substitution itself
is presumably done by data-access.xpl (not visible here). -->
<p:processor name="oxf:pipeline">
<p:input name="config" href="/data-access.xpl"/>
<p:input name="data" transform="oxf:xslt" href="aggregate('root', #data, #warc-dir-list, #heritrix-job)">
<config xsl:version="2.0">
<relpath>queue.xml</relpath>
<operation>write</operation>
<type>xquery</type>
<parameter name="uuid" type="string">
<xsl:value-of select="/root/action/@uuid"/>
</parameter>
<parameter name="url" type="string">
<xsl:value-of select="/root/action/@url"/>
</parameter>
<parameter name="directory" type="string">
<xsl:value-of select="/root/action/@directory"/>
</parameter>
<parameter name="heritrix-job-url" type="string">
<xsl:value-of select="/root/action/@heritrix-job-url"/>
</parameter>
<parameter name="priority" type="string">
<xsl:value-of select="/root/action/@priority"/>
</parameter>
<!-- First anchor of the directory listing whose text ends with '.warc'.
NOTE(review): a compressed archive named '*.warc.gz' would not match, and
the parameter is empty when no anchor matches; confirm the writer settings. -->
<parameter name="warc-url" type="string">
<xsl:value-of select="/root/html/body/a[ends-with(., '.warc')][1]/@href"/>
</parameter>
<parameter name="log-url" type="string">
<xsl:value-of select="/root/job/configFiles/value[key='loggerModule.crawlLogPath'][1]/url"/>
</parameter>
</config>
</p:input>
<p:input name="param">
<xquery><![CDATA[
declare namespace util = "http://exist-db.org/xquery/util";
for $q in /queue return
update
insert <action priority=$(priority) uuid="{util:uuid()}" type="package-heritrix-warc" url=$(url) directory=$(directory) heritrix-job-url=$(heritrix-job-url) warc-url=$(warc-url) log-url=$(log-url)/>
into $q,
for $a in /queue/action where $a/@uuid = $(uuid) return
update
delete $a
]]></xquery>
</p:input>
<p:output name="data" id="response" debug="response"/>
</p:processor>
<!-- The response is only consumed so that the pipeline above actually runs. -->
<p:processor name="oxf:null-serializer">
<p:input name="data" href="#response"/>
</p:processor>
</p:when>
<p:otherwise>
<!-- The job is not finished yet, we'll check later on...
The current action stays in queue.xml; only its @after attribute is moved
one minute into the future (current-dateTime() + PT1M). -->
<p:processor name="oxf:pipeline">
<p:input name="config" href="/data-access.xpl"/>
<p:input name="data" transform="oxf:xslt" href="#data">
<config xsl:version="2.0">
<relpath>queue.xml</relpath>
<operation>write</operation>
<type>xquery</type>
<parameter name="uuid" type="string">
<xsl:value-of select="/action/@uuid"/>
</parameter>
<parameter name="next-time" type="string">
<xsl:value-of select="current-dateTime() + xs:dayTimeDuration('PT1M')"/>
</parameter>
</config>
</p:input>
<p:input name="param">
<xquery><![CDATA[
for $a in /queue/action where $a/@uuid = $(uuid) return
update value $a/@after with $(next-time)
]]></xquery>
</p:input>
<p:output name="data" id="response" debug="response"/>
</p:processor>
<!-- The response is only consumed so that the pipeline above actually runs. -->
<p:processor name="oxf:null-serializer">
<p:input name="data" href="#response"/>
</p:processor>
</p:otherwise>
</p:choose>
<!-- <p:processor name="oxf:unsafe-xslt">
<p:input name="data" href="aggregate('root', #data, #heritrix-engine, #heritrix-unpaused)"/>
<p:input name="config">
<config xsl:version="2.0">
<relpath>queue.xml</relpath>
<operation>write</operation>
<type>xquery</type>
<parameter name="directory" type="string">
<xsl:value-of select="translate(/root/action/@uuid, '-', '/')"/>
<xsl:text>/</xsl:text>
</parameter>
<parameter name="uuid" type="string">
<xsl:value-of select="/root/action/@uuid"/>
</parameter>
<parameter name="url" type="string">
<xsl:value-of select="/root/action/@url"/>
</parameter>
<parameter name="priority-warc" type="string">
<xsl:value-of select="/root/action/@priority + 1"/>
</parameter>
<parameter name="next-time" type="string">
<xsl:value-of select="current-dateTime() + xs:dayTimeDuration('PT1M')"/>
</parameter>
<parameter name="heritrix-job-url" type="string">
<xsl:value-of select="/root/engine/jobs/value[shortName=/root/action/@uuid]/url"/>
</parameter>
</config>
</p:input>
<p:output name="data" id="data-access-data"/>
</p:processor>
<p:processor name="oxf:pipeline">
<p:input name="config" href="/data-access.xpl"/>
<p:input name="data" transform="oxf:xslt" href="#data-access-data">
<config xsl:version="2.0">
<relpath>
<xsl:value-of select="/config/parameter[@name='directory']"/>
<xsl:text>index.xml</xsl:text>
</relpath>
<operation>write</operation>
<type>document</type>
</config>
</p:input>
<p:input name="param" transform="oxf:xslt" href="#data-access-data">
<archive-set xsl:version="2.0" url="{/config/parameter[@name='url']}" uuid="{/config/parameter[@name='uuid']}">
<heritrix-job url="{/config/parameter[@name='heritrix-job-url']}"/>
</archive-set>
</p:input>
<p:output name="data" id="response2" debug="response2"/>
</p:processor>
<p:processor name="oxf:null-serializer">
<p:input name="data" href="#response2"/>
</p:processor>
<p:processor name="oxf:pipeline">
<p:input name="config" href="/data-access.xpl"/>
<p:input name="data" href="#data-access-data"/>
<p:input name="param">
<xquery><![CDATA[
declare namespace util = "http://exist-db.org/xquery/util";
for $q in /queue return
update
insert <action priority=$(priority-warc) uuid="{util:uuid()}" type="get-heritrix-warc" url=$(url) directory=$(directory) heritrix-job-url=$(heritrix-job-url) after=$(next-time)/>
into $q,
for $a in /queue/action where $a/@uuid = $(uuid) return
update
delete $a
]]></xquery>
</p:input>
<p:output name="data" id="response" debug="response"/>
</p:processor>
<p:processor name="oxf:null-serializer">
<p:input name="data" href="#response"/>
</p:processor>-->
</p:config>

View File

@ -1,196 +0,0 @@
<!--
Create a new archive through Heritrix
-->
<p:config xmlns:p="http://www.orbeon.com/oxf/pipeline" xmlns:oxf="http://www.orbeon.com/oxf/processors" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:xforms="http://www.w3.org/2002/xforms"
xmlns:xxforms="http://orbeon.org/oxf/xml/xforms" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:exist="http://exist.sourceforge.net/NS/exist" xmlns:saxon="http://saxon.sf.net/"
xmlns:pipeline="java:org.orbeon.oxf.processor.pipeline.PipelineFunctionLibrary">
<p:param name="data" type="input"/>
<!-- Create a new Heritrix job: URL-encoded POST of action=create and
createpath=<action uuid> to the REST endpoint configured in
oxf:/config.xml (config/heritrix/rest-api), with the Heritrix credentials
from the same file. The XML response (#heritrix-engine) is used by the
following steps to look up the URLs of the job named after the uuid. -->
<p:processor name="oxf:xforms-submission">
<p:input name="submission" transform="oxf:xslt" href="oxf:/config.xml">
<xforms:submission xsl:version="2.0" method="urlencoded-post" action="{/config/heritrix/rest-api}" xxforms:username="{/config/heritrix/username}"
xxforms:password="{/config/heritrix/password}" xxforms:preemptive-authentication="false">
<xforms:header combine="replace">
<xforms:name>Accept</xforms:name>
<xforms:value>application/xml</xforms:value>
</xforms:header>
</xforms:submission>
</p:input>
<p:input name="request" transform="oxf:xslt" href="#data">
<instance xsl:version="2.0">
<action>create</action>
<createpath>
<xsl:value-of select="/action/@uuid"/>
</createpath>
</instance>
</p:input>
<p:output name="response" id="heritrix-engine" debug="heritrix-engine"/>
</p:processor>
<!-- Create a job configuration: run cxml.xslt (resolved relative to this
pipeline) on the input action; the result (#cxml) is the request body
uploaded in the next step. -->
<p:processor name="oxf:xslt">
<p:input name="data" href="#data"/>
<p:input name="config" href="cxml.xslt"/>
<p:output name="data" id="cxml"/>
</p:processor>
<!-- Upload the job configuration: HTTP PUT of #cxml to the primaryConfigUrl
that the engine response lists for the job whose shortName equals the
action uuid. Credentials come from oxf:/config.xml; the password is read
with the same absolute path (/config/heritrix/password) as every other
submission in this pipeline, rather than a descendant search (//config)
that could select more than one node. -->
<p:processor name="oxf:xforms-submission">
<p:input name="submission" transform="oxf:xslt" href="aggregate('root', #data, #heritrix-engine)">
<xforms:submission xsl:version="2.0" method="put" action="{/root/engine/jobs/value[shortName=/root/action/@uuid]/primaryConfigUrl}"
xxforms:username="{doc('oxf:/config.xml')/config/heritrix/username}" xxforms:password="{doc('oxf:/config.xml')/config/heritrix/password}" xxforms:preemptive-authentication="false"/>
</p:input>
<p:input name="request" href="#cxml"/>
<p:output name="response" id="cxml-response" debug="cxml-response"/>
</p:processor>
<!-- Build the job: URL-encoded POST of action=build to the job URL taken
from the engine response. #cxml-response is aggregated into the submission
input although no XPath reads it, presumably only to force this step to
run after the upload (TODO confirm). -->
<p:processor name="oxf:xforms-submission">
<p:input name="submission" transform="oxf:xslt" href="aggregate('root', #data, #heritrix-engine, #cxml-response)">
<xforms:submission xsl:version="2.0" method="urlencoded-post" action="{/root/engine/jobs/value[shortName=/root/action/@uuid]/url}"
xxforms:username="{doc('oxf:/config.xml')/config/heritrix/username}" xxforms:password="{doc('oxf:/config.xml')/config/heritrix/password}" xxforms:preemptive-authentication="false">
<xforms:header combine="replace">
<xforms:name>Accept</xforms:name>
<xforms:value>application/xml</xforms:value>
</xforms:header>
</xforms:submission>
</p:input>
<p:input name="request" transform="oxf:xslt" href="#data">
<instance xsl:version="2.0">
<action>build</action>
</instance>
</p:input>
<p:output name="response" id="heritrix-built" debug="heritrix-built"/>
</p:processor>
<!-- Launch the job: URL-encoded POST of action=launch to the same job URL.
#heritrix-built is aggregated into the submission input although no XPath
reads it, presumably only to sequence this step after the build
(TODO confirm). -->
<p:processor name="oxf:xforms-submission">
<p:input name="submission" transform="oxf:xslt" href="aggregate('root', #data, #heritrix-engine, #heritrix-built)">
<xforms:submission xsl:version="2.0" method="urlencoded-post" action="{/root/engine/jobs/value[shortName=/root/action/@uuid]/url}"
xxforms:username="{doc('oxf:/config.xml')/config/heritrix/username}" xxforms:password="{doc('oxf:/config.xml')/config/heritrix/password}" xxforms:preemptive-authentication="false">
<xforms:header combine="replace">
<xforms:name>Accept</xforms:name>
<xforms:value>application/xml</xforms:value>
</xforms:header>
</xforms:submission>
</p:input>
<p:input name="request" transform="oxf:xslt" href="#data">
<instance xsl:version="2.0">
<action>launch</action>
</instance>
</p:input>
<p:output name="response" id="heritrix-launched" debug="heritrix-launched"/>
</p:processor>
<!-- Unpause the job: URL-encoded POST of action=unpause to the same job URL.
#heritrix-launched is aggregated into the submission input although no
XPath reads it, presumably only to sequence this step after the launch
(TODO confirm). -->
<p:processor name="oxf:xforms-submission">
<p:input name="submission" transform="oxf:xslt" href="aggregate('root', #data, #heritrix-engine, #heritrix-launched)">
<xforms:submission xsl:version="2.0" method="urlencoded-post" action="{/root/engine/jobs/value[shortName=/root/action/@uuid]/url}"
xxforms:username="{doc('oxf:/config.xml')/config/heritrix/username}" xxforms:password="{doc('oxf:/config.xml')/config/heritrix/password}" xxforms:preemptive-authentication="false">
<xforms:header combine="replace">
<xforms:name>Accept</xforms:name>
<xforms:value>application/xml</xforms:value>
</xforms:header>
</xforms:submission>
</p:input>
<p:input name="request" transform="oxf:xslt" href="#data">
<instance xsl:version="2.0">
<action>unpause</action>
</instance>
</p:input>
<p:output name="response" id="heritrix-unpaused" debug="heritrix-unpaused"/>
</p:processor>
<!-- Compute, once, the parameter set (#data-access-data) shared by the two
data-access calls that follow: writing the archive index document and
swapping the queue entry. -->
<p:processor name="oxf:unsafe-xslt">
<p:input name="data" href="aggregate('root', #data, #heritrix-engine, #heritrix-unpaused)"/>
<p:input name="config">
<config xsl:version="2.0">
<relpath>queue.xml</relpath>
<operation>write</operation>
<type>xquery</type>
<!-- Storage directory derived from the uuid: every '-' becomes '/', and a
trailing '/' is appended. -->
<parameter name="directory" type="string">
<xsl:value-of select="translate(/root/action/@uuid, '-', '/')"/>
<xsl:text>/</xsl:text>
</parameter>
<parameter name="uuid" type="string">
<xsl:value-of select="/root/action/@uuid"/>
</parameter>
<parameter name="url" type="string">
<xsl:value-of select="/root/action/@url"/>
</parameter>
<!-- Priority of the follow-up action: the current @priority plus one. If
@priority is absent the sum is the empty sequence and the parameter is
emitted empty. -->
<parameter name="priority-warc" type="string">
<xsl:value-of select="/root/action/@priority + 1"/>
</parameter>
<!-- Earliest time for the first completion check: one minute from now. -->
<parameter name="next-time" type="string">
<xsl:value-of select="current-dateTime() + xs:dayTimeDuration('PT1M')"/>
</parameter>
<parameter name="heritrix-job-url" type="string">
<xsl:value-of select="/root/engine/jobs/value[shortName=/root/action/@uuid]/url"/>
</parameter>
</config>
</p:input>
<p:output name="data" id="data-access-data"/>
</p:processor>
<!-- Write <directory>index.xml through /data-access.xpl: an <archive-set>
document carrying the archived URL, the action uuid and the Heritrix job
URL, all taken from #data-access-data. -->
<p:processor name="oxf:pipeline">
<p:input name="config" href="/data-access.xpl"/>
<p:input name="data" transform="oxf:xslt" href="#data-access-data">
<config xsl:version="2.0">
<relpath>
<xsl:value-of select="/config/parameter[@name='directory']"/>
<xsl:text>index.xml</xsl:text>
</relpath>
<operation>write</operation>
<type>document</type>
</config>
</p:input>
<p:input name="param" transform="oxf:xslt" href="#data-access-data">
<archive-set xsl:version="2.0" url="{/config/parameter[@name='url']}" uuid="{/config/parameter[@name='uuid']}">
<heritrix-job url="{/config/parameter[@name='heritrix-job-url']}"/>
</archive-set>
</p:input>
<p:output name="data" id="response2" debug="response2"/>
</p:processor>
<!-- The response is only consumed so that the write above actually runs. -->
<p:processor name="oxf:null-serializer">
<p:input name="data" href="#response2"/>
</p:processor>
<!-- Swap the queue entry: insert a 'get-heritrix-warc' action (fresh uuid,
priority-warc, not to be run before next-time) into queue.xml and delete
the action that has just been processed. #data-access-data is passed as-is,
so the operation is the queue.xml XQuery write it declares. -->
<p:processor name="oxf:pipeline">
<p:input name="config" href="/data-access.xpl"/>
<p:input name="data" href="#data-access-data"/>
<p:input name="param">
<xquery><![CDATA[
declare namespace util = "http://exist-db.org/xquery/util";
for $q in /queue return
update
insert <action priority=$(priority-warc) uuid="{util:uuid()}" type="get-heritrix-warc" url=$(url) directory=$(directory) heritrix-job-url=$(heritrix-job-url) after=$(next-time)/>
into $q,
for $a in /queue/action where $a/@uuid = $(uuid) return
update
delete $a
]]></xquery>
</p:input>
<p:output name="data" id="response" debug="response"/>
</p:processor>
<!-- The response is only consumed so that the update above actually runs. -->
<p:processor name="oxf:null-serializer">
<p:input name="data" href="#response"/>
</p:processor>
</p:config>

View File

@ -1,13 +0,0 @@
Pipelines in this directory are called by the archive-resource pipeline.
Each pipeline is named after the media subtype it handles.
Inputs:
* archive: the archive
Outputs:
* rewritten: the rewritten version of the document
* links: the list of rewritten links

View File

@ -1,32 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:xd="http://www.oxygenxml.com/ns/doc/xsl" xmlns:owk="http://owark.org/xslt/"
exclude-result-prefixes="xs xd owk" version="2.0">
<xd:doc scope="stylesheet">
<xd:desc>
<xd:p><xd:b>Created on:</xd:b> May 4, 2012</xd:p>
<xd:p><xd:b>Author:</xd:b> vdv</xd:p>
<xd:p>Common functions and template for URL rewriting</xd:p>
</xd:desc>
</xd:doc>
<!-- Returns true unless $url starts with an 'http:' or 'https:' scheme.
The scheme is compared case-insensitively because URI schemes are
case-insensitive ('HTTP://…' is an absolute URL too). Note that any other
scheme (mailto:, data:, …) and URLs without a colon are reported as
relative. -->
<xsl:function name="owk:is-relative" as="xs:boolean">
<xsl:param name="url" as="xs:string"/>
<xsl:sequence select="not(lower-case(substring-before($url, ':')) = ('http', 'https'))"/>
</xsl:function>
<!-- Resolves $relative against $hbase, but only when owk:is-relative() says
it is relative; otherwise $relative is returned untouched, without calling
resolve-uri() at all. -->
<xsl:function name="owk:safer-resolve-uri" as="xs:string">
<xsl:param name="relative" as="xs:string"/>
<xsl:param name="hbase" as="xs:string"/>
<xsl:choose>
<xsl:when test="owk:is-relative($relative)">
<xsl:sequence select="resolve-uri($relative, $hbase)"/>
</xsl:when>
<xsl:otherwise>
<xsl:sequence select="$relative"/>
</xsl:otherwise>
</xsl:choose>
</xsl:function>
<!-- Maps a URL found in an archived document to the name of its local copy.
The fragment is stripped and the URL is made absolute against the global
$base; the global $index is then searched for the first <resource> whose
<uri> or <same-as> equals that absolute URL. When one is found its
<local-name> is returned, prefixed with '../' if the current $resource has
uri/@seed = 'false'. Otherwise the original URL (fragment included) is
returned, resolved against $base. -->
<xsl:function name="owk:url-rewrite" as="xs:string">
<xsl:param name="url" as="xs:string"/>
<!-- Appending '#' guarantees substring-before() finds a delimiter even when
the URL has no fragment. -->
<xsl:variable name="no-fragment" select="substring-before(concat($url, '#'), '#')"/>
<xsl:variable name="abs" select="owk:safer-resolve-uri($no-fragment, $base) cast as xs:string"/>
<!-- Values are cast to xs:string so that the general comparison with $abs is
a string comparison. -->
<xsl:variable name="local-name" select="$index/resource[(for $u in (uri, same-as) return $u cast as xs:string) = $abs][1]/local-name"/>
<!-- Diagnostic trace, emitted on every call. -->
<xsl:message>local-name: <xsl:value-of select="$local-name"/></xsl:message>
<xsl:sequence select="if ($local-name) then concat(if ($resource/uri/@seed = 'false') then '../' else '', $local-name) else owk:safer-resolve-uri($url, $base)"/>
</xsl:function>
<!-- Globals read by owk:url-rewrite(). They are loaded from the named
inputs 'index' and 'index-entry', which the processor running this
stylesheet must therefore provide. -->
<!-- Root element of the index document; its <resource> children are searched. -->
<xsl:variable name="index" select="doc('input:index')/*"/>
<!-- Index entry of the resource currently being rewritten. -->
<xsl:variable name="resource" select="doc('input:index-entry')/resource"/>
<!-- Base URI for resolving relative links: the current resource's <uri>.
NOTE(review): assumes exactly one <uri> child; several would break the
xs:string parameter conversion in owk:safer-resolve-uri(). Confirm. -->
<xsl:variable name="base" select="$resource/uri"/>
</xsl:stylesheet>

View File

@ -1,106 +0,0 @@
<p:config xmlns:p="http://www.orbeon.com/oxf/pipeline" xmlns:oxf="http://www.orbeon.com/oxf/processors" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:saxon="http://saxon.sf.net/">
<p:param name="archive" type="input"/>
<p:param name="rewritten" type="output"/>
<p:param name="links" type="output"/>
<!-- Store the document: serialize /archive/response/document to a
session-scoped file; the processor's output (#url-written) carries the
URL of that file, which the next step reads back. -->
<p:processor name="oxf:file-serializer">
<p:input name="config">
<config>
<scope>session</scope>
</config>
</p:input>
<p:input name="data" href="#archive#xpointer(/archive/response/document)"/>
<p:output name="data" id="url-written"/>
</p:processor>
<!-- And read it as CSS: reload the stored file in text mode with content
type text/css. The later stylesheets read the result (#css) as a
<document> element containing the style sheet text. -->
<p:processor name="oxf:url-generator">
<p:input name="config" transform="oxf:xslt" href="#url-written">
<config xsl:version="2.0">
<url>
<xsl:value-of select="/*"/>
</url>
<content-type>text/css</content-type>
<mode>text</mode>
</config>
</p:input>
<p:output name="data" id="css" debug="css"/>
</p:processor>
<!-- Get a list of links to update -->
<!-- TODO: support links in inline CSS -->
<!-- TODO: support iframes and objects -->
<!-- NOTE(review): the two TODOs above read as if carried over from the HTML
pipeline. For CSS, the regex below only recognises url(...) references, so
an '@import' given as a bare string is not collected. -->
<p:processor name="oxf:unsafe-xslt">
<p:input name="data" href="#css"/>
<p:input name="request" href="#archive#xpointer(/archive/request)"/>
<p:input name="config">
<xsl:stylesheet version="2.0">
<!-- URL the style sheet was requested from; relative links resolve against it. -->
<xsl:variable name="base" select="doc('input:request')/request/location"/>
<xsl:template match="/">
<links>
<!-- One <link href="..."/> per url(...) occurrence; group 1 of the regex is
the URL without its optional surrounding quotes. -->
<xsl:variable name="links" as="node()*">
<xsl:analyze-string select="document" regex="url\([&quot;']?([^)'&quot;]+)[&quot;']?\)" flags="">
<xsl:matching-substring>
<link href="{regex-group(1)}"/>
</xsl:matching-substring>
</xsl:analyze-string>
</xsl:variable>
<!-- De-duplicate on the literal href, then derive for each distinct link:
abs-href = href resolved against $base;
new-href = hex encoding of abs-href minus its extension, then '.extension';
filename = hex encoding of the whole abs-href, then '.xml'.
The "extension" is whatever follows the last '.' of the last path segment;
when that segment contains no '.', it is the whole segment. -->
<xsl:for-each-group select="$links" group-by="@href">
<xsl:variable name="abs-href" select="resolve-uri(@href, $base)"/>
<xsl:variable name="tokens" select="tokenize($abs-href, '/')"/>
<xsl:variable name="last-token" select="$tokens[last()]"/>
<xsl:variable name="tokens2" select="tokenize($last-token, '\.')"/>
<xsl:variable name="extension" select="$tokens2[last()]"/>
<link abs-href="{$abs-href}" new-href="{saxon:string-to-hexBinary(substring($abs-href, 1, string-length($abs-href) - string-length($extension) - 1), 'utf-8')}.{$extension}"
filename="{saxon:string-to-hexBinary($abs-href, 'utf-8')}.xml">
<xsl:copy-of select="@*"/>
</link>
</xsl:for-each-group>
</links>
</xsl:template>
</xsl:stylesheet>
</p:input>
<p:output name="data" id="links-local" debug="links"/>
</p:processor>
<!-- Expose the link list both locally (#links-local, reused by the rewrite
step) and as the pipeline's 'links' output. -->
<p:processor name="oxf:identity">
<p:input name="data" href="#links-local"/>
<p:output name="data" ref="links"/>
</p:processor>
<!-- Update the links: copy the <document> element, replacing every
url(...) occurrence by url(<new-href>) looked up in the link list by its
original href. Surrounding quotes are not re-emitted, and a reference with
no entry in the list would come out as an empty url(). -->
<p:processor name="oxf:unsafe-xslt">
<p:input name="data" href="#css"/>
<p:input name="request" href="#archive#xpointer(/archive/request)"/>
<p:input name="links" href="#links-local"/>
<p:input name="config">
<xsl:stylesheet version="2.0">
<xsl:variable name="links" select="doc('input:links')/links"/>
<!-- Declared but not referenced in this stylesheet. -->
<xsl:variable name="base" select="doc('input:request')/request/location"/>
<!-- Index of <link> elements by their original (unresolved) href. -->
<xsl:key name="link" match="link" use="@href"/>
<xsl:template match="/document">
<xsl:copy>
<xsl:copy-of select="@*"/>
<!-- Same regex as the extraction step, so both steps agree on what a link is. -->
<xsl:analyze-string select="." regex="url\([&quot;']?([^)'&quot;]+)[&quot;']?\)" flags="">
<xsl:matching-substring>
<xsl:text>url(</xsl:text>
<!-- '$links/' switches the context to the link document so that key() searches it. -->
<xsl:value-of select="$links/key('link', regex-group(1))/@new-href"/>
<xsl:text>)</xsl:text>
</xsl:matching-substring>
<xsl:non-matching-substring>
<xsl:copy-of select="."/>
</xsl:non-matching-substring>
</xsl:analyze-string>
</xsl:copy>
</xsl:template>
</xsl:stylesheet>
</p:input>
<p:output name="data" ref="rewritten" debug="rewritten"/>
</p:processor>
</p:config>

View File

@ -1,123 +0,0 @@
<p:config xmlns:p="http://www.orbeon.com/oxf/pipeline" xmlns:oxf="http://www.orbeon.com/oxf/processors" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:saxon="http://saxon.sf.net/">
<p:param name="archive" type="input"/>
<p:param name="rewritten" type="output"/>
<p:param name="links" type="output"/>
<!-- Store the document: serialize /archive/response/document to a
session-scoped file; the processor's output (#url-written) carries the
URL of that file, which the next step reads back. -->
<p:processor name="oxf:file-serializer">
<p:input name="config">
<config>
<scope>session</scope>
</config>
</p:input>
<p:input name="data" href="#archive#xpointer(/archive/response/document)"/>
<p:output name="data" id="url-written"/>
</p:processor>
<!-- And read it as HTML: reload the stored file in html mode, so that the
following stylesheets can address it as an element tree (#html). -->
<p:processor name="oxf:url-generator">
<p:input name="config" transform="oxf:xslt" href="#url-written">
<config xsl:version="2.0">
<url>
<xsl:value-of select="/*"/>
</url>
<mode>html</mode>
</config>
</p:input>
<p:output name="data" id="html" debug="html"/>
</p:processor>
<!-- Get a list of links to update -->
<!-- TODO: support links in inline CSS -->
<!-- TODO: support iframes and objects -->
<!-- Collects the resources the page depends on (style sheets, images,
external scripts) as <link> elements, de-duplicated on their literal href
and annotated with the absolute URL and the local names to use. -->
<p:processor name="oxf:unsafe-xslt">
<p:input name="data" href="#html"/>
<p:input name="request" href="#archive#xpointer(/archive/request)"/>
<p:input name="config">
<xsl:stylesheet version="2.0">
<!-- URL the page was requested from; relative links resolve against it. -->
<xsl:variable name="base" select="doc('input:request')/request/location"/>
<xsl:template match="/">
<links>
<!-- Raw <link> elements produced by the element templates below. -->
<xsl:variable name="links" as="node()*">
<xsl:apply-templates/>
</xsl:variable>
<!-- For each distinct href:
abs-href = href resolved against $base;
new-href = hex encoding of abs-href minus its extension, then '.extension';
filename = hex encoding of the whole abs-href, then '.xml'.
The "extension" is whatever follows the last '.' of the last path segment;
when that segment contains no '.', it is the whole segment. -->
<xsl:for-each-group select="$links" group-by="@href">
<xsl:variable name="abs-href" select="resolve-uri(@href, $base)"/>
<xsl:variable name="tokens" select="tokenize($abs-href, '/')"/>
<xsl:variable name="last-token" select="$tokens[last()]"/>
<xsl:variable name="tokens2" select="tokenize($last-token, '\.')"/>
<xsl:variable name="extension" select="$tokens2[last()]"/>
<link abs-href="{$abs-href}" new-href="{saxon:string-to-hexBinary(substring($abs-href, 1, string-length($abs-href) - string-length($extension) - 1), 'utf-8')}.{$extension}"
filename="{saxon:string-to-hexBinary($abs-href, 'utf-8')}.xml">
<xsl:copy-of select="@*"/>
</link>
</xsl:for-each-group>
</links>
</xsl:template>
<!-- Text never contributes to the link list. -->
<xsl:template match="text()"/>
<xsl:template match="link[@rel='stylesheet']">
<link>
<xsl:copy-of select="@*"/>
</link>
</xsl:template>
<!-- Only images that actually carry a src attribute: an <img> without one
used to yield href="", which resolves to the page's own URL and added a
spurious link to the list. Guarded the same way as script[@src]. -->
<xsl:template match="img[@src]">
<link href="{@src}" type="image/*"/>
</xsl:template>
<xsl:template match="script[@src]">
<link href="{@src}" type="{@type}"/>
</xsl:template>
</xsl:stylesheet>
</p:input>
<p:output name="data" id="links-local" debug="links"/>
</p:processor>
<!-- Expose the link list both locally (#links-local, reused by the rewrite
step) and as the pipeline's 'links' output. -->
<p:processor name="oxf:identity">
<p:input name="data" href="#links-local"/>
<p:output name="data" ref="links"/>
</p:processor>
<!-- Update the links: identity copy of the page in which
(a) style sheet hrefs and image/script srcs are replaced by the new-href
recorded for them in the link list, and
(b) hrefs of anchors and of non-stylesheet <link> elements are made
absolute against the request location. -->
<p:processor name="oxf:unsafe-xslt">
<p:input name="data" href="#html"/>
<p:input name="request" href="#archive#xpointer(/archive/request)"/>
<p:input name="links" href="#links-local"/>
<p:input name="config">
<xsl:stylesheet version="2.0">
<xsl:variable name="links" select="doc('input:links')/links"/>
<xsl:variable name="base" select="doc('input:request')/request/location"/>
<!-- Index of <link> elements by their original (unresolved) href. -->
<xsl:key name="link" match="link" use="@href"/>
<xsl:template match="@*|node()">
<xsl:copy>
<xsl:apply-templates select="@*|node()"/>
</xsl:copy>
</xsl:template>
<!-- Archived resources: look the original attribute value up in the link
list ('$links/' makes key() search that document). A value with no entry
in the list yields an empty attribute. -->
<xsl:template match="link[@rel='stylesheet']/@href|img/@src|script/@src">
<xsl:attribute name="{name(.)}">
<xsl:value-of select="$links/key('link', current())/@new-href"/>
</xsl:attribute>
</xsl:template>
<!-- Non-archived targets keep pointing at the live site. A <link> with no
rel attribute matches neither rule and is copied unchanged. -->
<xsl:template match="link[@rel!='stylesheet']/@href|a/@href">
<xsl:attribute name="{name(.)}">
<xsl:value-of select="resolve-uri(., $base)"/>
</xsl:attribute>
</xsl:template>
</xsl:stylesheet>
</p:input>
<p:output name="data" id="html-rewritten" debug="rewritten"/>
</p:processor>
<!-- Serialize the rewritten page as XML 1.0 / UTF-8 (content type
application/xml) and return it as the pipeline's 'rewritten' output. -->
<p:processor name="oxf:xml-converter">
<p:input name="config">
<config>
<content-type>application/xml</content-type>
<encoding>utf-8</encoding>
<version>1.0</version>
</config>
</p:input>
<p:input name="data" href="#html-rewritten"/>
<p:output name="data" ref="rewritten"/>
</p:processor>
</p:config>

View File

@ -1,69 +0,0 @@
<p:config xmlns:p="http://www.orbeon.com/oxf/pipeline" xmlns:oxf="http://www.orbeon.com/oxf/processors" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:saxon="http://saxon.sf.net/">
    <!--
        Rewrite the url(...) references of a CSS resource.
        Inputs:
          record      : a record whose /record/content/document holds the resource
          index-entry : the <resource> index entry of this resource (its <uri> is the base URI)
          index       : the resource index (<resource> elements with uri, same-as, local-name)
        Output:
          rewritten   : the CSS text document with its url(...) references updated
    -->
    <p:param name="record" type="input"/>
    <p:param name="index-entry" type="input"/>
    <p:param name="index" type="input"/>
    <p:param name="rewritten" type="output"/>
    <!-- Store the document in a session-scoped file; the output gives the URL of that file -->
    <p:processor name="oxf:file-serializer">
        <p:input name="config">
            <config>
                <scope>session</scope>
            </config>
        </p:input>
        <p:input name="data" href="#record#xpointer(/record/content/document)"/>
        <p:output name="data" id="url-written"/>
    </p:processor>
    <!-- And read it back as CSS text -->
    <p:processor name="oxf:url-generator">
        <p:input name="config" transform="oxf:xslt" href="#url-written">
            <config xsl:version="2.0">
                <url>
                    <xsl:value-of select="/*"/>
                </url>
                <content-type>text/css</content-type>
                <mode>text</mode>
            </config>
        </p:input>
        <p:output name="data" id="css" debug="css"/>
    </p:processor>
    <!-- Update the links -->
    <p:processor name="oxf:unsafe-xslt">
        <p:input name="data" href="#css"/>
        <p:input name="index-entry" href="#index-entry"/>
        <p:input name="index" href="#index"/>
        <p:input name="config">
            <xsl:stylesheet version="2.0">
                <xsl:variable name="index" select="doc('input:index')/*"/>
                <xsl:variable name="resource" select="doc('input:index-entry')/resource"/>
                <xsl:variable name="base" select="$resource/uri"/>
                <xsl:template match="/document">
                    <xsl:copy>
                        <xsl:copy-of select="@*"/>
                        <!-- CSS allows white space between the parentheses and the (optionally
                             quoted) URL, so it is skipped on both sides; the reluctant +? keeps
                             trailing white space out of the captured URL. -->
                        <xsl:analyze-string select="." regex="url\(\s*[&quot;']?([^)'&quot;]+?)\s*[&quot;']?\s*\)" flags="">
                            <xsl:matching-substring>
                                <!-- Resolve once and reuse for both the index lookup and the fallback. -->
                                <xsl:variable name="resolved" select="resolve-uri(regex-group(1), $base)"/>
                                <!-- The index lookup ignores the fragment identifier. -->
                                <xsl:variable name="abs" select="substring-before(concat($resolved, '#'), '#')"/>
                                <xsl:variable name="local-name" select="$index/resource[(uri, same-as) = $abs][1]/local-name"/>
                                <xsl:text>url(</xsl:text>
                                <!-- Archived resource: relative path one level up; otherwise the absolute URI. -->
                                <xsl:value-of select="if ($local-name) then concat('../', $local-name) else $resolved"/>
                                <xsl:text>)</xsl:text>
                            </xsl:matching-substring>
                            <xsl:non-matching-substring>
                                <xsl:copy-of select="."/>
                            </xsl:non-matching-substring>
                        </xsl:analyze-string>
                    </xsl:copy>
                </xsl:template>
            </xsl:stylesheet>
        </p:input>
        <p:output name="data" ref="rewritten" debug="rewritten"/>
    </p:processor>
</p:config>

View File

@ -1,109 +0,0 @@
<p:config xmlns:p="http://www.orbeon.com/oxf/pipeline" xmlns:oxf="http://www.orbeon.com/oxf/processors" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:saxon="http://saxon.sf.net/">
    <!--
        Rewrite the links of an HTML resource.
        Inputs:
          record      : a record with /record/content/headers and /record/content/document
          index-entry : the <resource> index entry of this resource (its <uri> is the base URI)
          index       : the resource index (<resource> elements with uri, same-as, local-name)
        Output:
          rewritten   : the HTML document, links updated, serialized as utf-8 text/html
    -->
    <p:param name="record" type="input"/>
    <p:param name="index-entry" type="input"/>
    <p:param name="index" type="input"/>
    <p:param name="rewritten" type="output"/>
    <!-- Try to guess the encoding... -->
    <p:processor name="oxf:xslt">
        <p:input name="data" href="#record"/>
        <p:input name="config">
            <encoding xsl:version="2.0">
                <!-- Header names are matched case-insensitively and only the first
                     Content-Type header is considered. -->
                <xsl:variable name="content-type" select="string(/record/content/headers/header[lower-case(@name) = 'content-type'][1])"/>
                <!-- Value of the charset parameter: everything after "charset=" up to the next ";",
                     with quotes and surrounding white space removed; '' when there is none. -->
                <xsl:variable name="charset" select="if (matches($content-type, 'charset\s*=', 'i')) then normalize-space(translate(substring-before(concat(replace($content-type, '^.*?charset\s*=', '', 'i'), ';'), ';'), '&quot;''', '')) else ''"/>
                <xsl:choose>
                    <xsl:when test="$charset != ''">
                        <xsl:value-of select="$charset"/>
                        <xsl:message>ENCODING : <xsl:value-of select="$charset"/></xsl:message>
                    </xsl:when>
                    <!-- No usable charset parameter: default to utf-8. -->
                    <xsl:otherwise>utf-8</xsl:otherwise>
                </xsl:choose>
            </encoding>
        </p:input>
        <p:output name="data" id="encoding" debug="encoding"/>
    </p:processor>
    <!-- Store the document in a session-scoped file, forcing the guessed encoding -->
    <p:processor name="oxf:file-serializer">
        <p:input name="config" transform="oxf:xslt" href="#encoding">
            <config xsl:version="2.0">
                <scope>session</scope>
                <encoding>
                    <xsl:value-of select="/encoding"/>
                </encoding>
                <force-encoding>true</force-encoding>
            </config>
        </p:input>
        <p:input name="data" href="#record#xpointer(/record/content/document)"/>
        <p:output name="data" id="url-written" debug="url-written"/>
    </p:processor>
    <!-- And read it as HTML -->
    <p:processor name="oxf:url-generator">
        <p:input name="config" transform="oxf:xslt" href="aggregate('root', #url-written, #encoding)">
            <config xsl:version="2.0">
                <url>
                    <xsl:value-of select="/root/url"/>
                </url>
                <encoding>
                    <xsl:value-of select="/root/encoding"/>
                </encoding>
                <force-encoding>true</force-encoding>
                <content-type>text/html</content-type>
                <force-content-type>true</force-content-type>
                <mode>html</mode>
            </config>
        </p:input>
        <p:output name="data" id="html" debug="html"/>
    </p:processor>
    <!-- Update the links -->
    <!-- TODO: support links in inline CSS -->
    <!-- TODO: support iframes and objects -->
    <p:processor name="oxf:unsafe-xslt">
        <p:input name="data" href="#html"/>
        <p:input name="index-entry" href="#index-entry"/>
        <p:input name="index" href="#index"/>
        <p:input name="config">
            <xsl:stylesheet version="2.0">
                <xsl:variable name="index" select="doc('input:index')/*"/>
                <xsl:variable name="resource" select="doc('input:index-entry')/resource"/>
                <xsl:variable name="base" select="$resource/uri"/>
                <!-- Identity transform: everything not matched below is copied as is. -->
                <xsl:template match="@*|node()">
                    <xsl:copy>
                        <xsl:apply-templates select="@*|node()"/>
                    </xsl:copy>
                </xsl:template>
                <!-- Embedded resources: use the local name from the index when there is one. -->
                <xsl:template match="link[@rel='stylesheet']/@href|img/@src|script/@src|embed/@src|@background">
                    <!-- Resolve once and reuse for both the index lookup and the fallback. -->
                    <xsl:variable name="resolved" select="resolve-uri(., $base)"/>
                    <!-- The index lookup ignores the fragment identifier. -->
                    <xsl:variable name="abs" select="substring-before(concat($resolved, '#'), '#')"/>
                    <xsl:variable name="local-name" select="$index/resource[(uri, same-as) = $abs][1]/local-name"/>
                    <xsl:attribute name="{name(.)}">
                        <!-- The local name is prefixed with '../' when the current resource is
                             flagged seed="false" in the index. -->
                        <xsl:value-of select="if ($local-name) then concat(if ($resource/uri/@seed = 'false') then '../' else '', $local-name) else $resolved"/>
                    </xsl:attribute>
                </xsl:template>
                <!-- Other links are made absolute. not(@rel='stylesheet') also covers <link>
                     elements that have no rel attribute, which @rel!='stylesheet' never matches. -->
                <xsl:template match="link[not(@rel='stylesheet')]/@href|a/@href">
                    <xsl:attribute name="{name(.)}">
                        <xsl:value-of select="resolve-uri(., $base)"/>
                    </xsl:attribute>
                </xsl:template>
            </xsl:stylesheet>
        </p:input>
        <p:output name="data" id="html-rewritten" debug="rewritten"/>
    </p:processor>
    <!-- Serialize the rewritten document as utf-8 text/html -->
    <p:processor name="oxf:html-converter">
        <p:input name="config">
            <config>
                <content-type>text/html</content-type>
                <encoding>utf-8</encoding>
            </config>
        </p:input>
        <p:input name="data" href="#html-rewritten"/>
        <p:output name="data" ref="rewritten"/>
    </p:processor>
</p:config>

View File

@ -1,366 +0,0 @@
<p:config xmlns:p="http://www.orbeon.com/oxf/pipeline" xmlns:oxf="http://www.orbeon.com/oxf/processors" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:saxon="http://saxon.sf.net/">
<p:param name="data" type="input"/>
<!-- Read the archive index: index.xml inside the directory given by /action/@directory -->
<p:processor name="oxf:pipeline">
    <p:input name="config" href="/data-access.xpl"/>
    <p:input name="data" transform="oxf:xslt" href="#data">
        <config xsl:version="2.0">
            <relpath>
                <xsl:value-of select="concat(/action/@directory, 'index.xml')"/>
            </relpath>
            <operation>read</operation>
            <type>document</type>
        </config>
    </p:input>
    <p:input name="param">
        <empty/>
    </p:input>
    <p:output name="data" id="index" debug="index"/>
</p:processor>
<!-- Create a WARC file: a request-scoped file initialized with a single "warcinfo" record.
The serializer output (#warc) is later read as a <url> element holding the file URL. -->
<p:processor name="oxf:file-serializer">
<p:input name="config">
<config>
<scope>request</scope>
</config>
</p:input>
<p:input name="data" transform="oxf:xslt" href="aggregate('root', #index, #data)">
<xsl:stylesheet version="2.0">
<!-- warc-lib.xsl provides the mode="warc" templates that turn record/header/block/field elements into text. -->
<xsl:import href="warc-lib.xsl"/>
<xsl:template match="/">
<!-- XML description of the warcinfo record, serialized below through mode="warc". -->
<xsl:variable name="content" as="node()*">
<record>
<header>
<field>
<name>WARC-Type</name>
<value>warcinfo</value>
</field>
<field>
<name>WARC-Date</name>
<value>
<xsl:value-of select="current-dateTime()"/>
</value>
</field>
<field>
<name>WARC-Record-ID</name>
<value>
<!-- The id is the action directory without its last character, with '/' replaced by '-'. -->
<xsl:text>&lt;urn:uuid:</xsl:text>
<xsl:value-of select="translate(substring(/root/action/@directory, 1, string-length(/root/action/@directory) - 1), '/', '-')"/>
<xsl:text>></xsl:text>
</value>
</field>
<field>
<name>Content-Type</name>
<value>application/warc-fields</value>
</field>
</header>
<block>
<field>
<name>software</name>
<value>Owark 0.3 http://owark.org</value>
</field>
<field>
<name>format</name>
<value>WARC file version 0.18</value>
</field>
</block>
</record>
<!--
software: Heritrix 1.12.0 http://crawler.archive.org
hostname: crawling017.archive.org
ip: 207.241.227.234
isPartOf: testcrawl-20050708
description: testcrawl with WARC output
operator: IA_Admin
http-header-user-agent:
Mozilla/5.0 (compatible; heritrix/1.4.0 +http://crawler.archive.org)
format: WARC file version 0.18
conformsTo:
http://www.archive.org/documents/WarcFileFormat-0.18.html-->
</xsl:variable>
<!-- The result is a text/plain xs:string document whose content is the serialized record. -->
<document xsl:version="2.0" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="xs:string" content-type="text/plain">
<xsl:apply-templates select="$content" mode="warc"/>
</document>
</xsl:template>
</xsl:stylesheet>
</p:input>
<p:output name="data" id="warc" debug="warc"/>
</p:processor>
<!-- Loop over the index to retrieve the documents -->
<p:for-each href="#index" select="/archive-set/archive" id="files" root="files">
<!-- Read the document: its path is the action directory followed by the @href of the current index entry -->
<p:processor name="oxf:pipeline">
    <p:input name="config" href="/data-access.xpl"/>
    <p:input name="data" transform="oxf:xslt" href="aggregate('root', #data, current())">
        <config xsl:version="2.0">
            <relpath>
                <xsl:value-of select="concat(/root/action/@directory, /root/archive/@href)"/>
            </relpath>
            <operation>read</operation>
            <type>document</type>
        </config>
    </p:input>
    <p:input name="param">
        <empty/>
    </p:input>
    <p:output name="data" id="document" debug="document"/>
</p:processor>
<!--
    Add the request and start of response records.
    The records are appended to the WARC file whose URL is held by #warc. The response record
    written here stops after its HTTP block; the response document itself is appended by the
    next processor, which is why its length is passed as the tunnel parameter "document-length".
-->
<p:processor name="oxf:file-serializer">
    <p:input name="config" transform="oxf:xslt" href="#warc">
        <config xsl:version="2.0">
            <file>
                <xsl:value-of select="substring-after(/url, 'file:')"/>
            </file>
            <make-directories>false</make-directories>
            <append>true</append>
        </config>
    </p:input>
    <p:input name="data" transform="oxf:xslt" href="#document">
        <xsl:stylesheet version="2.0">
            <xsl:import href="warc-lib.xsl"/>
            <xsl:template match="/">
                <!--
                    NOTE(review): the input of this stylesheet is #document, whose root element is
                    <archive> according to the other paths used here, so /root/action/@directory
                    selects nothing and the WARC-Record-ID values below come out as <urn:uuid:>.
                    The request and response records also share the same expression. Fixing this
                    needs #data to be aggregated into the input, which changes the document root
                    seen by the mode="warc-http" templates of warc-lib.xsl; confirm before changing.
                -->
                <xsl:variable name="request" as="node()*">
                    <!-- Request -->
                    <record>
                        <header>
                            <field>
                                <name>WARC-Type</name>
                                <value>request</value>
                            </field>
                            <field>
                                <name>WARC-Target-URI</name>
                                <value>
                                    <xsl:value-of select="/archive/request/location"/>
                                </value>
                            </field>
                            <field>
                                <name>WARC-Date</name>
                                <value>
                                    <!-- TODO: replace that by the archive date -->
                                    <xsl:value-of select="current-dateTime()"/>
                                </value>
                            </field>
                            <field>
                                <name>WARC-Record-ID</name>
                                <value>
                                    <xsl:text>&lt;urn:uuid:</xsl:text>
                                    <xsl:value-of select="translate(substring(/root/action/@directory, 1, string-length(/root/action/@directory) - 1), '/', '-')"/>
                                    <xsl:text>></xsl:text>
                                </value>
                            </field>
                            <field>
                                <name>Content-Type</name>
                                <value>application/http;msgtype=request</value>
                            </field>
                        </header>
                        <block>
                            <xsl:apply-templates select="/archive/request" mode="warc-http"/>
                        </block>
                    </record>
                </xsl:variable>
                <!-- Response -->
                <xsl:variable name="response" as="node()*">
                    <record>
                        <header>
                            <field>
                                <name>WARC-Type</name>
                                <value>response</value>
                            </field>
                            <field>
                                <name>WARC-Target-URI</name>
                                <value>
                                    <xsl:value-of select="/archive/request/location"/>
                                </value>
                            </field>
                            <field>
                                <name>WARC-Date</name>
                                <value>
                                    <!-- TODO: replace that by the archive date -->
                                    <xsl:value-of select="current-dateTime()"/>
                                </value>
                            </field>
                            <field>
                                <name>WARC-Record-ID</name>
                                <value>
                                    <xsl:text>&lt;urn:uuid:</xsl:text>
                                    <xsl:value-of select="translate(substring(/root/action/@directory, 1, string-length(/root/action/@directory) - 1), '/', '-')"/>
                                    <xsl:text>></xsl:text>
                                </value>
                            </field>
                            <field>
                                <name>Content-Type</name>
                                <value>application/http;msgtype=response</value>
                            </field>
                        </header>
                        <block>
                            <xsl:apply-templates select="/archive/response" mode="warc-http"/>
                        </block>
                    </record>
                </xsl:variable>
                <!--
                    Length of the response document once decoded. The original "* 3 div 4"
                    implies the content is base64 (assumption to confirm against the archive
                    format). Integer arithmetic (idiv) is used because the tunnel parameter is
                    declared xs:integer and "div" on integers yields an xs:decimal; the trailing
                    '=' padding characters do not encode any byte and are subtracted.
                -->
                <xsl:variable name="base64" select="translate(/archive/response/document, ' &#xa;&#xd;&#x9;', '')"/>
                <xsl:variable name="padding" select="string-length($base64) - string-length(translate($base64, '=', ''))"/>
                <document xsl:version="2.0" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="xs:string" content-type="text/plain">
                    <xsl:apply-templates select="$request" mode="warc"/>
                    <xsl:apply-templates select="$response" mode="warc">
                        <xsl:with-param name="document-length" as="xs:integer" select="(string-length($base64) idiv 4) * 3 - $padding" tunnel="yes"/>
                    </xsl:apply-templates>
                </document>
            </xsl:template>
        </xsl:stylesheet>
    </p:input>
</p:processor>
<!-- Add the response document to finalize the response record: /archive/response/document is
appended (append=true) to the same WARC file, whose path is the #warc URL without its "file:" prefix. -->
<p:processor name="oxf:file-serializer">
<p:input name="config" transform="oxf:xslt" href="#warc">
<config xsl:version="2.0">
<file>
<xsl:value-of select="substring-after(/url, 'file:')"/>
</file>
<make-directories>false</make-directories>
<append>true</append>
</config>
</p:input>
<p:input name="data" href="#document#xpointer(/archive/response/document)"/>
</p:processor>
<!-- Store the file that goes into the "rewritten" part of the package: the rewritten version when
the current index entry has an @href-rewritten attribute, a copy of the original response otherwise.
Both branches expose the request-scoped file location as #file. -->
<p:choose href="current()">
<p:when test="/archive/@href-rewritten">
<!-- Read the rewritten document -->
<p:processor name="oxf:pipeline">
<p:input name="config" href="/data-access.xpl"/>
<p:input name="data" transform="oxf:xslt" href="aggregate('root', #data, current())">
<config xsl:version="2.0">
<relpath>
<xsl:value-of select="/root/action/@directory"/>
<xsl:value-of select="/root/archive/@href-rewritten"/>
</relpath>
<operation>read</operation>
<type>document</type>
</config>
</p:input>
<p:input name="param">
<empty/>
</p:input>
<p:output name="data" id="rewritten" debug="rewritten"/>
</p:processor>
<!-- Store this document -->
<p:processor name="oxf:file-serializer">
<p:input name="config">
<config>
<scope>request</scope>
</config>
</p:input>
<p:input name="data" href="#rewritten#xpointer(/document/document)"/>
<p:output name="data" id="file" debug="file"/>
</p:processor>
</p:when>
<p:otherwise>
<!-- Store a copy of the original version -->
<p:processor name="oxf:file-serializer">
<p:input name="config">
<config>
<scope>request</scope>
</config>
</p:input>
<p:input name="data" href="#document#xpointer(/archive/response/document)"/>
<p:output name="data" id="file" debug="file"/>
</p:processor>
</p:otherwise>
</p:choose>
<!-- Result of one iteration: a <file> element wrapping the current index entry and the stored file location (#file). -->
<p:processor name="oxf:identity">
<p:input name="data" href="aggregate('file', current(), #file)"/>
<p:output name="data" ref="files"/>
</p:processor>
</p:for-each>
<!-- Consume #files with a null serializer (the debug attribute logs it).
NOTE(review): presumably here to force the loop to run before the zip step; confirm. -->
<p:processor name="oxf:null-serializer">
<p:input name="data" href="#files" debug="files"/>
</p:processor>
<p:processor name="oxf:zip">
    <p:input name="data" transform="oxf:unsafe-xslt" href="aggregate('root', #warc, #files)">
        <files xsl:version="2.0" file-name="archive.zip">
            <!-- The WARC file itself -->
            <file name="archive.warc">
                <xsl:value-of select="/root/url"/>
            </file>
            <!-- One entry per stored file that has a location -->
            <xsl:for-each select="/root/files/file[url]">
                <xsl:choose>
                    <xsl:when test="position() != 1">
                        <!-- Entry name: hex encoding of the URL without its extension, then the extension.
                             The extension is whatever follows the last '.' of the last path segment. -->
                        <xsl:variable name="source-url" select="archive/@url"/>
                        <xsl:variable name="extension" select="tokenize(tokenize($source-url, '/')[last()], '\.')[last()]"/>
                        <xsl:variable name="stem" select="substring($source-url, 1, string-length($source-url) - string-length($extension) - 1)"/>
                        <file name="rewritten/{saxon:string-to-hexBinary($stem, 'utf-8')}.{$extension}">
                            <xsl:value-of select="url"/>
                        </file>
                    </xsl:when>
                    <xsl:otherwise>
                        <!-- The first file is stored as the entry page -->
                        <!-- TODO: support non HTML documents... -->
                        <file name="rewritten/index.html">
                            <xsl:value-of select="url"/>
                        </file>
                    </xsl:otherwise>
                </xsl:choose>
            </xsl:for-each>
        </files>
    </p:input>
    <p:output name="data" id="zip"/>
</p:processor>
<!-- Write the zip to the fixed path /tmp/archive.zip.
NOTE(review): the path is hard-coded, so every run writes to the same file; confirm this is intended. -->
<p:processor name="oxf:file-serializer">
<p:input name="config">
<config>
<file>/tmp/archive.zip</file>
</config>
</p:input>
<p:input name="data" href="#zip"/>
</p:processor>
<!-- Update the queue: run an XQuery update on queue.xml that deletes the action whose @uuid
equals the uuid of the action being processed (/action/@uuid, passed as the "uuid" parameter). -->
<p:processor name="oxf:pipeline">
<p:input name="config" href="/data-access.xpl"/>
<p:input name="data" transform="oxf:xslt" href="#data">
<config xsl:version="2.0">
<relpath>queue.xml</relpath>
<operation>write</operation>
<type>xquery</type>
<parameter name="uuid" type="string">
<xsl:value-of select="/action/@uuid"/>
</parameter>
</config>
</p:input>
<p:input name="param">
<xquery><![CDATA[
for $a in /queue/action where $a/@uuid = $(uuid) return
update
delete $a
]]></xquery>
</p:input>
<p:output name="data" id="response4" debug="response"/>
</p:processor>
<!-- The response of the update is discarded. -->
<p:processor name="oxf:null-serializer">
<p:input name="data" href="#response4"/>
</p:processor>
</p:config>

View File

@ -1,387 +0,0 @@
<!--
Package an Heritrix WARC
-->
<p:config xmlns:p="http://www.orbeon.com/oxf/pipeline" xmlns:oxf="http://www.orbeon.com/oxf/processors" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:xforms="http://www.w3.org/2002/xforms"
xmlns:xxforms="http://orbeon.org/oxf/xml/xforms" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:exist="http://exist.sourceforge.net/NS/exist" xmlns:saxon="http://saxon.sf.net/"
xmlns:pipeline="java:org.orbeon.oxf.processor.pipeline.PipelineFunctionLibrary" xmlns:owk="http://owark.org/orbeon/processors">
<p:param name="data" type="input"/>
<!-- Download the WARC (binary) from /action/@warc-url with the Heritrix credentials of config.xml -->
<p:processor name="oxf:url-generator">
    <p:input name="config" transform="oxf:xslt" href="#data">
        <config xsl:version="2.0">
            <!-- Heritrix settings, read once -->
            <xsl:variable name="heritrix" select="doc('oxf:/config.xml')/config/heritrix"/>
            <url>
                <xsl:value-of select="/action/@warc-url"/>
            </url>
            <mode>binary</mode>
            <authentication>
                <username>
                    <xsl:value-of select="$heritrix/username"/>
                </username>
                <password>
                    <xsl:value-of select="$heritrix/password"/>
                </password>
                <preemptive>false</preemptive>
            </authentication>
        </config>
    </p:input>
    <p:output name="data" id="warc"/>
</p:processor>
<!-- Convert the downloaded WARC (#warc) with the owk:from-warc-converter processor; the result
(#warc-xml) is read further down as /warc/record elements. -->
<p:processor name="owk:from-warc-converter">
<p:input name="data" href="#warc"/>
<p:output name="data" id="warc-xml" debug="warc-xml"/>
</p:processor>
<!-- Consume #warc-xml with a null serializer.
NOTE(review): presumably to force the conversion (and its debug output) at this point; confirm. -->
<p:processor name="oxf:null-serializer">
<p:input name="data" href="#warc-xml"/>
</p:processor>
<!-- Download the log (text) from /action/@log-url with the Heritrix credentials of config.xml -->
<p:processor name="oxf:url-generator">
    <p:input name="config" transform="oxf:xslt" href="#data">
        <config xsl:version="2.0">
            <!-- Heritrix settings, read once -->
            <xsl:variable name="heritrix" select="doc('oxf:/config.xml')/config/heritrix"/>
            <url>
                <xsl:value-of select="/action/@log-url"/>
            </url>
            <mode>text</mode>
            <authentication>
                <username>
                    <xsl:value-of select="$heritrix/username"/>
                </username>
                <password>
                    <xsl:value-of select="$heritrix/password"/>
                </password>
                <preemptive>false</preemptive>
            </authentication>
        </config>
    </p:input>
    <p:output name="data" id="log" debug="log"/>
</p:processor>
<!-- Store the log in a request-scoped temp file; #log-location is used later as the zip entry source -->
<p:processor name="oxf:file-serializer">
<p:input name="config">
<config>
<scope>request</scope>
</config>
</p:input>
<p:input name="data" href="#log"/>
<p:output name="data" id="log-location" debug="log-location"/>
</p:processor>
<!-- Transform the text log (#log) into XML (#log-xml) with parse-log.xslt -->
<p:processor name="oxf:xslt">
<p:input name="data" href="#log"/>
<p:input name="config" href="parse-log.xslt"/>
<p:output name="data" id="log-xml" debug="log-xml"/>
</p:processor>
<!-- Create a resource index with links and local names -->
<p:processor name="oxf:xslt">
<p:input name="data" href="#log-xml"/>
<p:input name="config" href="resource-index.xslt"/>
<p:output name="data" id="index" debug="index"/>
</p:processor>
<!-- Loop over the WARC file to store and transform documents.
Only records whose Content-Type header is 'application/http; msgtype=response' and whose status is 200
are processed. Each iteration contributes a <doc> element (index entry + stored file location) to #loop. -->
<p:for-each href="#warc-xml" select="/warc/record[headers/header[@name='Content-Type'] = 'application/http; msgtype=response' and content/status/status = 200]" root="root" id="loop">
<!-- Find the index entry whose uri equals the WARC-Target-URI header of the record; the children
of the matching resource(s) are copied into a new <resource> element. -->
<p:processor name="oxf:xslt">
<p:input name="data" href="aggregate('root', current(), #index)" debug="aggregate"/>
<p:input name="config">
<resource xsl:version="2.0">
<xsl:copy-of select="/root/index/resource[uri = /root/record/headers/header[@name = 'WARC-Target-URI']]/*"/>
</resource>
</p:input>
<p:output name="data" id="index-entry" debug="index-entry"/>
</p:processor>
<p:choose href="#index-entry">
<p:when test="/resource/embeds">
<!-- The resource has embedded content and must be rewritten -->
<!-- Call the corresponding pipeline -->
<!-- The pipeline is loaded from oxf:/actions/mediatypes/warc-{type}.xpl, where {type} is the
<type> of the index entry. -->
<p:processor name="oxf:url-generator">
<p:input name="config" transform="oxf:xslt" href="#index-entry">
<config xsl:version="2.0">
<url>
<xsl:text>oxf:/actions/mediatypes/warc-</xsl:text>
<xsl:value-of select="/resource/type"/>
<xsl:text>.xpl</xsl:text>
</url>
</config>
</p:input>
<p:output name="data" id="pipeline"/>
</p:processor>
<p:processor name="oxf:pipeline">
<p:input name="config" href="#pipeline"/>
<p:input name="record" href="current()"/>
<p:input name="index" href="#index"/>
<p:input name="index-entry" href="#index-entry"/>
<p:output name="rewritten" id="document" debug="rewritten"/>
</p:processor>
</p:when>
<p:otherwise>
<!-- The resource can be stored -->
<p:processor name="oxf:identity">
<p:input name="data" href="current()#xpointer(/record/content/document)"/>
<p:output name="data" id="document"/>
</p:processor>
</p:otherwise>
</p:choose>
<!-- Store the (possibly rewritten) document in a request-scoped temp file -->
<p:processor name="oxf:file-serializer">
<p:input name="config">
<config>
<scope>request</scope>
</config>
</p:input>
<p:input name="data" href="#document"/>
<p:output name="data" id="doc-location" debug="doc-location"/>
</p:processor>
<!-- Result of the iteration: <doc> wrapping the index entry and the file location -->
<p:processor name="oxf:identity">
<p:input name="data" href="aggregate('doc', #index-entry, #doc-location)"/>
<p:output name="data" ref="loop"/>
</p:processor>
</p:for-each>
<!-- Store the WARC in a request-scoped temp file -->
<p:processor name="oxf:file-serializer">
<p:input name="config">
<config>
<scope>request</scope>
</config>
</p:input>
<p:input name="data" href="#warc"/>
<p:output name="data" id="warc-location" debug="warc-location"/>
</p:processor>
<!-- Build the zip. The aggregate order matters: /root/url[1] is the WARC location and
/root/url[2] the log location; the loop results are under /root/root/doc. -->
<p:processor name="oxf:zip">
<p:input name="data" transform="oxf:unsafe-xslt" href="aggregate('root', #warc-location, #log-location, #loop)">
<files xsl:version="2.0" file-name="archive.zip">
<file name="archive/archive.warc">
<xsl:value-of select="/root/url[1]"/>
</file>
<file name="archive/archive.log">
<xsl:value-of select="/root/url[2]"/>
</file>
<!-- Each stored document is added under rewritten/ with the local name of its index entry. -->
<xsl:for-each select="/root/root/doc">
<file name="rewritten/{resource/local-name}">
<xsl:value-of select="url"/>
</file>
</xsl:for-each>
</files>
</p:input>
<p:output name="data" id="zip"/>
</p:processor>
<!-- Write the zip to the fixed path /tmp/archive.zip.
NOTE(review): the path is hard-coded, so every run writes to the same file; confirm this is intended. -->
<p:processor name="oxf:file-serializer">
<p:input name="config">
<config>
<file>/tmp/archive.zip</file>
</config>
</p:input>
<p:input name="data" href="#zip"/>
</p:processor>
<!-- <p:choose href="#heritrix-job">
<p:when test="/job/crawlControllerState='FINISHED'">
<!-\- The job is finished, we can get its archive... -\->
<!-\- Scan the directory to find the name of the WARC file -\->
<p:processor name="oxf:url-generator">
<p:input name="config" transform="oxf:xslt" href="#heritrix-job">
<config xsl:version="2.0">
<url>
<xsl:value-of select="/job/configFiles/value[key='warcWriter.storePaths[0]']/url"/>
</url>
<authentication>
<username>
<xsl:value-of select="doc('oxf:/config.xml')/config/heritrix/username"/>
</username>
<password>
<xsl:value-of select="doc('oxf:/config.xml')/config/heritrix/password"/>
</password>
<preemptive>false</preemptive>
</authentication>
</config>
</p:input>
<p:output name="data" id="warc-dir-list" debug="warc-dir-list"/>
</p:processor>
<!-\- Next action: package -\->
<p:processor name="oxf:pipeline">
<p:input name="config" href="/data-access.xpl"/>
<p:input name="data" transform="oxf:xslt" href="aggregate('root', #data, #warc-dir-list)">
<config xsl:version="2.0">
<relpath>queue.xml</relpath>
<operation>write</operation>
<type>xquery</type>
<parameter name="uuid" type="string">
<xsl:value-of select="/root/action/@uuid"/>
</parameter>
<parameter name="url" type="string">
<xsl:value-of select="/root/action/@url"/>
</parameter>
<parameter name="directory" type="string">
<xsl:value-of select="/root/action/@directory"/>
</parameter>
<parameter name="heritrix-job-url" type="string">
<xsl:value-of select="/root/action/@heritrix-job-url"/>
</parameter>
<parameter name="priority" type="string">
<xsl:value-of select="/root/action/@priority"/>
</parameter>
<parameter name="warc-url" type="string">
<xsl:value-of select="/root/html/body/a[ends-with(., '.warc')][1]/@href"/>
</parameter>
</config>
</p:input>
<p:input name="param">
<xquery><![CDATA[
declare namespace util = "http://exist-db.org/xquery/util";
for $q in /queue return
update
insert <action priority=$(priority) uuid="{util:uuid()}" type="package-heritrix-warc" url=$(url) directory=$(directory) heritrix-job-url=$(heritrix-job-url) warc-url=$(warc-url)/>
into $q,
for $a in /queue/action where $a/@uuid = $(uuid) return
update
delete $a
]]></xquery>
</p:input>
<p:output name="data" id="response" debug="response"/>
</p:processor>
<p:processor name="oxf:null-serializer">
<p:input name="data" href="#response"/>
</p:processor>
</p:when>
<p:otherwise>
<!-\- The job is not finished yet, we'll check later on... -\->
<p:processor name="oxf:pipeline">
<p:input name="config" href="/data-access.xpl"/>
<p:input name="data" transform="oxf:xslt" href="#data">
<config xsl:version="2.0">
<relpath>queue.xml</relpath>
<operation>write</operation>
<type>xquery</type>
<parameter name="uuid" type="string">
<xsl:value-of select="/action/@uuid"/>
</parameter>
<parameter name="next-time" type="string">
<xsl:value-of select="current-dateTime() + xs:dayTimeDuration('PT1M')"/>
</parameter>
</config>
</p:input>
<p:input name="param">
<xquery><![CDATA[
for $a in /queue/action where $a/@uuid = $(uuid) return
update value $a/@after with $(next-time)
]]></xquery>
</p:input>
<p:output name="data" id="response" debug="response"/>
</p:processor>
<p:processor name="oxf:null-serializer">
<p:input name="data" href="#response"/>
</p:processor>
</p:otherwise>
</p:choose>
<!-\- <p:processor name="oxf:unsafe-xslt">
<p:input name="data" href="aggregate('root', #data, #heritrix-engine, #heritrix-unpaused)"/>
<p:input name="config">
<config xsl:version="2.0">
<relpath>queue.xml</relpath>
<operation>write</operation>
<type>xquery</type>
<parameter name="directory" type="string">
<xsl:value-of select="translate(/root/action/@uuid, '-', '/')"/>
<xsl:text>/</xsl:text>
</parameter>
<parameter name="uuid" type="string">
<xsl:value-of select="/root/action/@uuid"/>
</parameter>
<parameter name="url" type="string">
<xsl:value-of select="/root/action/@url"/>
</parameter>
<parameter name="priority-warc" type="string">
<xsl:value-of select="/root/action/@priority + 1"/>
</parameter>
<parameter name="next-time" type="string">
<xsl:value-of select="current-dateTime() + xs:dayTimeDuration('PT1M')"/>
</parameter>
<parameter name="heritrix-job-url" type="string">
<xsl:value-of select="/root/engine/jobs/value[shortName=/root/action/@uuid]/url"/>
</parameter>
</config>
</p:input>
<p:output name="data" id="data-access-data"/>
</p:processor>
<p:processor name="oxf:pipeline">
<p:input name="config" href="/data-access.xpl"/>
<p:input name="data" transform="oxf:xslt" href="#data-access-data">
<config xsl:version="2.0">
<relpath>
<xsl:value-of select="/config/parameter[@name='directory']"/>
<xsl:text>index.xml</xsl:text>
</relpath>
<operation>write</operation>
<type>document</type>
</config>
</p:input>
<p:input name="param" transform="oxf:xslt" href="#data-access-data">
<archive-set xsl:version="2.0" url="{/config/parameter[@name='url']}" uuid="{/config/parameter[@name='uuid']}">
<heritrix-job url="{/config/parameter[@name='heritrix-job-url']}"/>
</archive-set>
</p:input>
<p:output name="data" id="response2" debug="response2"/>
</p:processor>
<p:processor name="oxf:null-serializer">
<p:input name="data" href="#response2"/>
</p:processor>
<p:processor name="oxf:pipeline">
<p:input name="config" href="/data-access.xpl"/>
<p:input name="data" href="#data-access-data"/>
<p:input name="param">
<xquery><![CDATA[
declare namespace util = "http://exist-db.org/xquery/util";
for $q in /queue return
update
insert <action priority=$(priority-warc) uuid="{util:uuid()}" type="get-heritrix-warc" url=$(url) directory=$(directory) heritrix-job-url=$(heritrix-job-url) after=$(next-time)/>
into $q,
for $a in /queue/action where $a/@uuid = $(uuid) return
update
delete $a
]]></xquery>
</p:input>
<p:output name="data" id="response" debug="response"/>
</p:processor>
<p:processor name="oxf:null-serializer">
<p:input name="data" href="#response"/>
</p:processor>-\->
-->
</p:config>

View File

@ -1,51 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:xd="http://www.oxygenxml.com/ns/doc/xsl" exclude-result-prefixes="xs xd"
version="2.0">
<xd:doc scope="stylesheet">
<xd:desc>
<xd:p><xd:b>Created on:</xd:b> Apr 26, 2012</xd:p>
<xd:p><xd:b>Author:</xd:b> vdv</xd:p>
<xd:p>See https://webarchive.jira.com/wiki/display/Heritrix/Logs</xd:p>
</xd:desc>
</xd:doc>
<!--
    Parse the text of the crawl log into <log>/<entry> elements.
    Each line starts with three fixed-width columns:
      columns  1-24 : date and time
      columns 26-30 : fetch status code
      columns 32-41 : document size
    and continues, from column 43, with space-separated fields.
-->
<xsl:template match="/document">
    <log>
        <!-- Accept LF and CRLF line ends (a stray CR would otherwise stick to the last field)
             and skip blank lines. -->
        <xsl:for-each select="tokenize(., '\r?\n')[normalize-space(.) != '']">
            <entry>
                <date-time>
                    <xsl:value-of select="substring(., 1, 24)"/>
                </date-time>
                <code>
                    <xsl:value-of select="normalize-space(substring(., 26, 5))"/>
                </code>
                <size>
                    <!-- The size column starts right after the separator that follows the code
                         (column 31), i.e. at column 32; starting at 33 drops the first digit of
                         a ten digit size. -->
                    <xsl:value-of select="normalize-space(substring(., 32, 10))"/>
                </size>
                <xsl:variable name="tail" select="substring(., 43)"/>
                <xsl:variable name="tokens" select="tokenize($tail, ' ')"/>
                <uri>
                    <xsl:value-of select="$tokens[1]"/>
                </uri>
                <discovery-path>
                    <xsl:value-of select="$tokens[2]"/>
                </discovery-path>
                <referer>
                    <xsl:value-of select="$tokens[3]"/>
                </referer>
                <content-type>
                    <xsl:value-of select="$tokens[4]"/>
                </content-type>
                <!-- The fifth token is not used. -->
                <timestamp>
                    <xsl:value-of select="$tokens[6]"/>
                </timestamp>
                <sha1-digest>
                    <xsl:value-of select="$tokens[7]"/>
                </sha1-digest>
            </entry>
        </xsl:for-each>
    </log>
</xsl:template>
</xsl:stylesheet>

View File

@ -1,119 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:owk="http://owark.org/xslt/" xmlns:xd="http://www.oxygenxml.com/ns/doc/xsl"
exclude-result-prefixes="xs xd owk" version="2.0">
<xd:doc scope="stylesheet">
<xd:desc>
<xd:p><xd:b>Created on:</xd:b> Apr 26, 2012</xd:p>
<xd:p><xd:b>Author:</xd:b> vdv</xd:p>
<xd:p>Create a resource index with links and local names from the Heritrix crawl log in XML format</xd:p>
</xd:desc>
</xd:doc>
<xsl:variable name="source" select="/"/>
<xsl:variable name="common-extensions">
<content-type name="application/x-shockwave-flash">
<extension>swf</extension>
</content-type>
<content-type name="application/xhtml+xml">
<extension>html</extension>
</content-type>
</xsl:variable>
<xsl:key name="extension" match="content-type" use="@name"/>
<!--
    Return the suffix to append to a file name so that it has an extension.
    entry      : the log entry; its content-type is used to choose the extension
    last-token : the last path segment of the URI
    Returns '' when the segment already contains a '.', otherwise '.' followed by the extension
    registered in $common-extensions, the media subtype, or 'unknown' (in that order).
-->
<xsl:function name="owk:add-extension" as="xs:string">
    <xsl:param name="entry" as="element(entry)"/>
    <xsl:param name="last-token" as="xs:string"/>
    <xsl:variable name="mime" select="$entry/content-type"/>
    <xsl:variable name="known" select="key('extension', $mime, $common-extensions)"/>
    <xsl:choose>
        <xsl:when test="contains($last-token, '.')">
            <xsl:sequence select="''"/>
        </xsl:when>
        <xsl:when test="$known">
            <xsl:sequence select="concat('.', $known)"/>
        </xsl:when>
        <xsl:when test="contains($mime, '/')">
            <xsl:sequence select="concat('.', substring-after($mime, '/'))"/>
        </xsl:when>
        <xsl:otherwise>
            <xsl:sequence select="'.unknown'"/>
        </xsl:otherwise>
    </xsl:choose>
</xsl:function>
<!--
    Compute the local file name of a log entry.
    An entry whose discovery-path is '-' is named 'index.html'. Any other entry is named
    {third '/'-separated token of the URI}/{last path segment, or 'index' when it is empty},
    completed by owk:add-extension; the query string is ignored.
-->
<xsl:function name="owk:local-name" as="xs:string">
    <xsl:param name="entry" as="element(entry)"/>
    <xsl:variable name="uri" select="$entry/uri"/>
    <xsl:variable name="path" select="if (contains($uri, '?')) then substring-before($uri, '?') else $uri"/>
    <xsl:variable name="segments" select="tokenize($path, '/')"/>
    <xsl:variable name="leaf" select="$segments[last()]"/>
    <xsl:choose>
        <xsl:when test="$entry/discovery-path='-'">
            <xsl:sequence select="'index.html'"/>
        </xsl:when>
        <xsl:otherwise>
            <xsl:sequence select="concat($segments[3], '/', if ($leaf = '') then 'index' else $leaf, owk:add-extension($entry, $leaf))"/>
        </xsl:otherwise>
    </xsl:choose>
</xsl:function>
<!--
    Make the local name of an entry unique within the log.
    When exactly one entry of the 'entry-by-name' key has this local name it is returned as is.
    Otherwise a counter (number of preceding sibling entries with the same local name, plus one)
    is inserted before the first '.' of the part that follows the first '/'.
    NOTE(review): a local name without '/' (such as 'index.html') that is not unique would
    yield '/-N.'; confirm that this cannot happen for the logs processed here.
-->
<xsl:function name="owk:unique-local-name" as="xs:string">
    <xsl:param name="entry" as="element(entry)"/>
    <xsl:variable name="local-name" select="owk:local-name($entry)"/>
    <xsl:variable name="directory" select="substring-before($local-name, '/')"/>
    <xsl:variable name="file" select="substring-after($local-name, '/')"/>
    <xsl:choose>
        <xsl:when test="count(key('entry-by-name', $local-name, $source)) = 1">
            <xsl:sequence select="$local-name"/>
        </xsl:when>
        <xsl:otherwise>
            <xsl:variable name="rank" select="count($entry/preceding-sibling::entry[owk:local-name(.) = $local-name]) + 1"/>
            <xsl:sequence select="concat($directory, '/', substring-before($file, '.'), '-', $rank, '.', substring-after($file, '.'))"/>
        </xsl:otherwise>
    </xsl:choose>
</xsl:function>
<!-- Index of the http and https entries by their (non unique) local name; used by owk:unique-local-name. -->
<xsl:key name="entry-by-name" match="entry[substring-before(uri, '://') = ('http', 'https')]" use="owk:local-name(.)"/>
<!-- Root template: the index holds one resource per http or https entry whose code is 200. -->
<xsl:template match="/log">
<index>
<xsl:apply-templates select="entry[substring-before(uri, '://') = ('http', 'https') and code = 200]"/>
</index>
</xsl:template>
<!--
    Build the <resource> description of a log entry: its URI (flagged as seed when the
    discovery-path is '-'), its unique local name, a short type derived from the content type,
    the URIs it is a redirect target of (mode "redirect") and the resources it embeds
    (entries referring to it whose discovery-path ends with 'E').
-->
<xsl:template match="entry">
    <xsl:variable name="is-seed" select="discovery-path='-'"/>
    <xsl:variable name="mime" select="content-type"/>
    <resource>
        <uri seed="{$is-seed}">
            <xsl:value-of select="uri"/>
        </uri>
        <local-name>
            <xsl:value-of select="owk:unique-local-name(.)"/>
        </local-name>
        <type>
            <!-- HTML and XHTML are both 'html', plain text is 'text', anything else is the media subtype -->
            <xsl:value-of select="if ($mime = ('text/html', 'application/xhtml+xml')) then 'html' else if ($mime = 'text/plain') then 'text' else substring-after($mime, '/')"/>
        </type>
        <xsl:apply-templates select="." mode="redirect"/>
        <xsl:apply-templates select="/log/entry[referer = current()/uri and ends-with(discovery-path, 'E')]" mode="embedding"/>
    </resource>
</xsl:template>
<!-- Mode "redirect", default case: nothing is produced. -->
<xsl:template match="*" mode="redirect"/>
<!-- Mode "redirect", entry whose discovery-path ends with 'R': emit a <same-as> holding the referer,
then continue with the entries whose uri is that referer, so that a chain of such entries yields
one <same-as> per step.
NOTE(review): the seed attribute tests the discovery-path of the matched entry, which ends with 'R'
here, so it is always "false"; confirm whether the referer's entry was meant. -->
<xsl:template match="entry[ends-with(discovery-path, 'R')]" mode="redirect">
<same-as seed="{discovery-path='-'}">
<xsl:value-of select="referer"/>
</same-as>
<xsl:apply-templates select="/log/entry[uri = current()/referer]" mode="redirect"/>
</xsl:template>
<!-- Mode "embedding": emit an <embeds> element holding the URI of the entry. -->
<xsl:template match="entry" mode="embedding">
<embeds>
<xsl:value-of select="uri"/>
</embeds>
</xsl:template>
</xsl:stylesheet>

View File

@ -1,103 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:xd="http://www.oxygenxml.com/ns/doc/xsl" exclude-result-prefixes="xs xd"
version="2.0">
<xd:doc scope="stylesheet">
<xd:desc>
<xd:p><xd:b>Created on:</xd:b> Apr 13, 2012</xd:p>
<xd:p><xd:b>Author:</xd:b> vdv</xd:p>
<xd:p>Template library to produce WARC documents</xd:p>
</xd:desc>
</xd:doc>
<!-- Line terminator emitted after every header line: carriage return + line feed. -->
<xsl:variable name="CRLF" select="'&#13;&#10;'"/>
<!-- Version token written as the first line of each record header. -->
<xsl:variable name="version">WARC/0.18</xsl:variable>
<!-- Emits one line terminator; usable both as a named template and for a <CRLF/> element in warc mode. -->
<xsl:template match="CRLF" name="CRLF" mode="warc">
<xsl:value-of select="$CRLF"/>
</xsl:template>
<!-- Emits the version line ($version followed by CRLF); named template and <version/> element in warc mode. -->
<xsl:template match="version" name="version" mode="warc">
<xsl:value-of select="$version"/>
<xsl:value-of select="$CRLF"/>
</xsl:template>
<!-- Serializes a <field> (children <name> and <value>) as "name: value" followed by CRLF. -->
<xsl:template match="field" mode="warc">
<xsl:value-of select="name"/>
<xsl:text>: </xsl:text>
<xsl:value-of select="value"/>
<xsl:value-of select="$CRLF"/>
</xsl:template>
<!-- Serializes a <line> as its string value followed by CRLF. -->
<xsl:template match="line" mode="warc">
<xsl:value-of select="."/>
<xsl:value-of select="$CRLF"/>
</xsl:template>
<!--
  Serializes one <record>: two CRLFs, the header fields, a computed
  Content-Length field, a blank line, then the rendered block.
  $document-length (tunnel parameter, default 0) is added to the length of the
  rendered block to form the Content-Length value.
  NOTE(review): string-length() counts characters, not octets; the value is
  only exact if every character of the rendered block is written as a single
  byte - confirm against the output encoding.
-->
<xsl:template match="record" mode="warc">
  <xsl:param name="document-length" as="xs:integer" select="0" tunnel="yes"/>
  <!-- Render the block first so that its length is known before the header is finished. -->
  <xsl:variable name="rendered-block">
    <xsl:apply-templates select="block" mode="warc"/>
  </xsl:variable>
  <xsl:variable name="length-field">
    <field>
      <name>Content-Length</name>
      <value>
        <xsl:value-of select="string-length($rendered-block) + $document-length"/>
      </value>
    </field>
  </xsl:variable>
  <xsl:call-template name="CRLF"/>
  <xsl:call-template name="CRLF"/>
  <xsl:apply-templates select="header" mode="warc"/>
  <!-- The temporary <field> goes through the regular field template. -->
  <xsl:apply-templates select="$length-field" mode="warc"/>
  <xsl:call-template name="CRLF"/>
  <xsl:value-of select="$rendered-block"/>
</xsl:template>
<!-- Renders the children of a <block> in warc mode (text nodes are dropped by the text() template of that mode). -->
<xsl:template match="block" mode="warc">
<xsl:apply-templates mode="warc"/>
</xsl:template>
<!-- Renders a record <header>: the version line followed by each child element in warc mode. -->
<xsl:template match="header" mode="warc">
<xsl:call-template name="version"/>
<xsl:apply-templates select="*" mode="warc"/>
</xsl:template>
<!--
  Converts a <request> into the intermediate vocabulary: a <line> holding the
  request line ("METHOD LOCATION HTTP/1.0") followed by one <field> per header.
-->
<xsl:template match="request" mode="warc-http">
  <!-- TODO: get the HTTP version -->
  <line><xsl:value-of select="method"/><xsl:text> </xsl:text><xsl:value-of select="location"/> HTTP/1.0</line>
  <xsl:apply-templates select="header" mode="warc-http"/>
</xsl:template>
<!--
  Converts a <response> into the intermediate vocabulary: a <line> holding the
  status line ("HTTP/1.1 CODE OK") followed by one <field> per header.
  The protocol version and the " OK" reason phrase are fixed, whatever the code.
-->
<xsl:template match="response" mode="warc-http">
  <!-- TODO: get the HTTP version and status-->
  <line>HTTP/1.1 <xsl:value-of select="code"/> OK</line>
  <xsl:apply-templates select="header" mode="warc-http"/>
</xsl:template>
<!-- Converts an HTTP <header name="..."> into a <field>: @name becomes <name>, the string value becomes <value>. -->
<xsl:template match="header" mode="warc-http">
<field>
<name>
<xsl:value-of select="@name"/>
</name>
<value>
<xsl:value-of select="."/>
</value>
</field>
</xsl:template>
<!-- Suppresses the built-in copying of text nodes in both modes, so only explicit templates produce output. -->
<xsl:template match="text()" mode="warc warc-http"/>
</xsl:stylesheet>

View File

@ -1,15 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<!--
  Application configuration.
  exist-root and exist-db are concatenated with a relative path by
  data-access.xpl to build database URLs.
  NOTE(review): this file carries a user name in the database URL and a
  clear-text user name/password for the heritrix section - keep real
  credentials out of version control.
-->
<config>
<exist-root>http://admin@localhost:8080/orbeon/exist/rest/db/</exist-root>
<exist-db>owark/</exist-db>
<user-agent>Mozilla/5.0 (compatible; owark/0.3; http://owark.org/)</user-agent>
<heritrix>
<rest-api>https://localhost:8443/engine</rest-api>
<username>admin</username>
<password>envierse</password>
</heritrix>
</config>

View File

@ -1,6 +0,0 @@
<!-- Declares the custom processor name owk:from-warc-converter and the Java class that implements it. -->
<processors xmlns:owk="http://owark.org/orbeon/processors">
<processor name="owk:from-warc-converter">
<class name="org.owark.orbeon.FromWarcConverter"/>
</processor>
</processors>

View File

@ -1,160 +0,0 @@
<!--
Database access
-->
<p:config xmlns:p="http://www.orbeon.com/oxf/pipeline" xmlns:oxf="http://www.orbeon.com/oxf/processors" xmlns:xforms="http://www.w3.org/2002/xforms" xmlns:xxforms="http://orbeon.org/oxf/xml/xforms" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:exist="http://exist.sourceforge.net/NS/exist" xmlns:saxon="http://saxon.sf.net/">
<p:param name="param" type="input"/>
<!-- XQuery request or default document to read when not found -->
<p:param name="data" type="input"/>
<!-- Request description :
<config>
<relpath>Relative path</relpath>
<operation>read|write</operation>
<type>xquery|document</type>
<parameter></parameter>
<parameter></parameter>
</config>
-->
<p:param name="data" type="output"/>
<p:choose href="#data">
<p:when test="/config/type = 'document' and /config/operation='read'">
<!-- Document read: GET {exist-root}{exist-db}{relpath}; when the fetch raises an exception,
     the "param" input is returned instead of the fetched document. -->
<p:processor name="oxf:xslt">
<p:input name="data" href="#data"/>
<p:input name="config.xml" href="oxf:/config.xml"/>
<p:input name="config">
<xsl:stylesheet version="2.0">
<xsl:template match="/">
<!-- $config is the application configuration; the context document is the request description. -->
<xsl:variable name="config" select="doc('input:config.xml')/config"/>
<xforms:submission method="get" replace="none" action="{$config/exist-root}{$config/exist-db}{/config/relpath}"/>
</xsl:template>
</xsl:stylesheet>
</p:input>
<p:output name="data" id="submission"/>
</p:processor>
<p:processor name="oxf:xforms-submission">
<p:input name="submission" href="#submission"/>
<p:input name="request" href="#param"/>
<p:output name="response" id="document"/>
</p:processor>
<p:processor name="oxf:exception-catcher">
<p:input name="data" href="#document"/>
<p:output name="data" id="document-exception"/>
</p:processor>
<!-- An <exceptions> root means the submission failed: fall back to the "param" input. -->
<p:choose href="#document-exception">
<p:when test="/exceptions">
<p:processor name="oxf:identity">
<p:input name="data" href="#param"/>
<p:output name="data" ref="data"/>
</p:processor>
</p:when>
<p:otherwise>
<p:processor name="oxf:identity">
<p:input name="data" href="#document-exception"/>
<p:output name="data" ref="data"/>
</p:processor>
</p:otherwise>
</p:choose>
</p:when>
<p:when test="/config/type = 'document' and /config/operation='write'">
<!-- Document write: PUT the "param" input to {exist-root}{exist-db}{relpath};
     the submission response becomes the pipeline output. -->
<p:processor name="oxf:xslt">
<p:input name="data" href="#data"/>
<p:input name="config.xml" href="oxf:/config.xml"/>
<p:input name="config">
<xsl:stylesheet version="2.0">
<xsl:template match="/">
<xsl:variable name="config" select="doc('input:config.xml')/config"/>
<xforms:submission method="put" replace="none" action="{$config/exist-root}{$config/exist-db}{/config/relpath}"/>
</xsl:template>
</xsl:stylesheet>
</p:input>
<p:output name="data" id="submission"/>
</p:processor>
<p:processor name="oxf:xforms-submission">
<p:input name="submission" href="#submission"/>
<p:input name="request" href="#param"/>
<p:output name="response" ref="data"/>
</p:processor>
</p:when>
<p:when test="/config/type = 'xquery' ">
<!-- XQuery: the string value of the "param" input is the query text. Each $(name) placeholder
     is replaced by the matching /config/parameter[@name] of the request description, then the
     query is sent as a GET with _howmany=10000 and the URI-encoded, space-normalized query.
     This branch is selected on the type alone; the operation element is not tested. -->
<p:processor name="oxf:unsafe-xslt">
<p:input name="data" href="#data"/>
<p:input name="config.xml" href="oxf:/config.xml"/>
<p:input name="param" href="#param"/>
<p:input name="config">
<xsl:stylesheet version="2.0">
<!-- Named output definition used by saxon:serialize() for node-set parameters. -->
<xsl:output name="output" method="xml" omit-xml-declaration="yes"/>
<xsl:template match="/">
<xsl:variable name="query">
<xsl:variable name="data" select="/"/>
<xsl:analyze-string select="string(doc('input:param'))" regex="\$\((\i\c*)\)" flags="">
<xsl:matching-substring>
<xsl:variable name="parameter" select="$data/config/parameter[@name = regex-group(1)]"/>
<!-- node-set parameters are serialized as XML; other values get & and ' replaced by entity references. -->
<xsl:variable name="sanitized" select="if ($parameter/@type = 'node-set') then saxon:serialize($parameter/*, 'output') else replace(replace($parameter, '&amp;', '&amp;amp;'), '''', '&amp;apos;')"/>
<xsl:choose>
<!-- A placeholder without a matching parameter aborts the transformation. -->
<xsl:when test="not($parameter)">
<xsl:message terminate="yes">Parameter <xsl:value-of select="regex-group(1)"/> not found in query <xsl:value-of select="doc('input:param')"
/></xsl:message>
</xsl:when>
<!-- string: emitted as a single-quoted literal. -->
<xsl:when test="$parameter/@type='string'">
<xsl:text>'</xsl:text>
<xsl:value-of select="$sanitized"/>
<xsl:text>'</xsl:text>
</xsl:when>
<!-- node-set: the serialized markup is inserted as is. -->
<xsl:when test="$parameter/@type='node-set'">
<xsl:copy-of select="$sanitized"/>
</xsl:when>
<!-- Any other type: emitted as a constructor call, type('value').
     NOTE(review): @type is copied verbatim into the query - confirm it never comes from untrusted input. -->
<xsl:otherwise>
<xsl:value-of select="$parameter/@type"/>
<xsl:text>('</xsl:text>
<xsl:value-of select="$sanitized"/>
<xsl:text>')</xsl:text>
</xsl:otherwise>
</xsl:choose>
</xsl:matching-substring>
<xsl:non-matching-substring>
<xsl:value-of select="."/>
</xsl:non-matching-substring>
</xsl:analyze-string>
</xsl:variable>
<!-- The expanded query is written to the message output on every call. -->
<xsl:message>
<xsl:value-of select="$query"/>
</xsl:message>
<xsl:variable name="config" select="doc('input:config.xml')/config"/>
<xforms:submission method="get" replace="none"
action="{$config/exist-root}{$config/exist-db}{/config/relpath}?_howmany=10000&amp;_query={encode-for-uri(normalize-space($query))}"/>
</xsl:template>
</xsl:stylesheet>
</p:input>
<p:output name="data" id="submission"/>
</p:processor>
<p:processor name="oxf:xforms-submission">
<p:input name="submission" href="#submission"/>
<p:input name="request" href="#param"/>
<p:output name="response" ref="data"/>
</p:processor>
</p:when>
<p:otherwise>
<!-- Any other type/operation combination: answer with a <not-implemented/> document. -->
<p:processor name="oxf:identity">
<p:input name="data">
<not-implemented/>
</p:input>
<p:output name="data" ref="data"/>
</p:processor>
</p:otherwise>
</p:choose>
</p:config>

View File

@ -1,37 +0,0 @@
<!--
Copyright (C) 2004 Orbeon, Inc.
This program is free software; you can redistribute it and/or modify it under the terms of the
GNU Lesser General Public License as published by the Free Software Foundation; either version
2.1 of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU Lesser General Public License for more details.
The full text of the license is available at http://www.gnu.org/copyleft/lesser.html
-->
<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/" xmlns="http://jakarta.apache.org/log4j/">
<!-- This is the standard log appender to the console (System.err, per the Target parameter) -->
<appender name="ConsoleAppender" class="org.apache.log4j.ConsoleAppender">
<param name="Target" value="System.err"/>
<layout class="org.apache.log4j.PatternLayout">
<param name="ConversionPattern" value="%d{ISO8601} %-5p %c %x - %m%n"/>
</layout>
<!-- Events below INFO are filtered out by this appender, even though the loggers below are set to debug. -->
<filter class="org.apache.log4j.varia.LevelRangeFilter">
<param name="LevelMin" value="INFO"/>
</filter>
</appender>
<!-- XForms engine activity -->
<category name="org.orbeon.oxf.xforms.processor.XFormsServer">
<priority value="debug"/>
</category>
<!-- This is the root logger -->
<root>
<priority value="debug"/>
<appender-ref ref="ConsoleAppender"/>
</root>
</log4j:configuration>

View File

@ -1,65 +0,0 @@
<!--
Database creation
-->
<p:config xmlns:p="http://www.orbeon.com/oxf/pipeline" xmlns:oxf="http://www.orbeon.com/oxf/processors" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:xforms="http://www.w3.org/2002/xforms"
xmlns:xxforms="http://orbeon.org/oxf/xml/xforms" xmlns:exist="http://exist.sourceforge.net/NS/exist" xmlns:pipeline="java:org.orbeon.oxf.processor.pipeline.PipelineFunctionLibrary">
<!-- Step 1: write a minimal index.xhtml document through data-access.xpl. -->
<p:processor name="oxf:pipeline">
<p:input name="config" href="data-access.xpl"/>
<p:input name="data">
<config>
<relpath>index.xhtml</relpath>
<operation>write</operation>
<type>document</type>
</config>
</p:input>
<p:input name="param">
<html xml:lang="fr" >
<head>
<title>Owark DB</title>
</head>
<body>
<p>Owark db</p>
</body>
</html>
</p:input>
<p:output name="data" id="response" debug="response"/>
</p:processor>
<!-- The response is only consumed so that the step above is executed. -->
<p:processor name="oxf:null-serializer">
<p:input name="data" href="#response"/>
</p:processor>
<!-- Step 2: write an empty <queue/> as queue.xml. -->
<p:processor name="oxf:pipeline">
<p:input name="config" href="data-access.xpl"/>
<p:input name="data">
<config>
<relpath>queue.xml</relpath>
<operation>write</operation>
<type>document</type>
</config>
</p:input>
<p:input name="param">
<queue/>
</p:input>
<p:output name="data" id="response2" debug="response2"/>
</p:processor>
<p:processor name="oxf:null-serializer">
<p:input name="data" href="#response2"/>
</p:processor>
<!-- Indexes -->
<!--<p:processor name="oxf:pipeline">
<p:input name="config" href="create-indexes.xpl"/>
</p:processor>
-->
</p:config>

View File

@ -1,39 +0,0 @@
<!--
Post an archive request
-->
<p:config xmlns:p="http://www.orbeon.com/oxf/pipeline" xmlns:oxf="http://www.orbeon.com/oxf/processors" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:xforms="http://www.w3.org/2002/xforms"
xmlns:xxforms="http://orbeon.org/oxf/xml/xforms" xmlns:exist="http://exist.sourceforge.net/NS/exist" xmlns:pipeline="java:org.orbeon.oxf.processor.pipeline.PipelineFunctionLibrary">
<!-- Runs an update query against queue.xml through data-access.xpl: it inserts one
     heritrix-archive-set action (priority 0, generated uuid, fixed url http://xmlfr.org/) into each /queue. -->
<p:processor name="oxf:pipeline">
<p:input name="config" href="data-access.xpl"/>
<p:input name="data">
<config>
<relpath>queue.xml</relpath>
<operation>write</operation>
<type>xquery</type>
</config>
</p:input>
<p:input name="param">
<xquery><![CDATA[
declare namespace util = "http://exist-db.org/xquery/util";
for $q in /queue return
update
insert <action priority="0" uuid="{util:uuid()}" type="heritrix-archive-set" url="http://xmlfr.org/"/>
into $q
]]></xquery>
</p:input>
<p:output name="data" id="response" debug="response"/>
</p:processor>
<!-- The response is only consumed so that the step above is executed. -->
<p:processor name="oxf:null-serializer">
<p:input name="data" href="#response"/>
</p:processor>
</p:config>

View File

@ -1,27 +0,0 @@
<!--
Copyright (C) 2004 Orbeon, Inc.
This program is free software; you can redistribute it and/or modify it under the terms of the
GNU Lesser General Public License as published by the Free Software Foundation; either version
2.1 of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU Lesser General Public License for more details.
The full text of the license is available at http://www.gnu.org/copyleft/lesser.html
-->
<properties xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:oxf="http://www.orbeon.com/oxf/processors">
<!-- Logging configuration file and cache size. -->
<property as="xs:anyURI" name="oxf.log4j-config" value="oxf:/default-log4j.xml"/>
<property as="xs:integer" name="oxf.cache.size" value="200"/>
<!-- SSL settings for outgoing HTTP: keystore location and password.
     NOTE(review): the keystore password is stored in clear text and the hostname verifier is
     set to allow-all, which presumably disables host name checks - confirm this is limited to local use. -->
<property as="xs:anyURI" name="oxf.http.ssl.keystore.uri" value="file:/var/local/heritrix-3.1.0/heritrix.keystore"/>
<property as="xs:string" name="oxf.http.ssl.keystore.password" value="heritrix"/>
<property as="xs:string" name="oxf.http.ssl.hostname-verifier" value="allow-all"/>
<!--<property as="xs:NMTOKENS" name="oxf.xforms.logging.debug"
value="document model submission control event action analysis server server-body html resolver utils
submission-details submission-body"/>-->
</properties>

View File

@ -1,21 +0,0 @@
<!--
Reinstall the database
-->
<p:config xmlns:p="http://www.orbeon.com/oxf/pipeline" xmlns:oxf="http://www.orbeon.com/oxf/processors" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:xforms="http://www.w3.org/2002/xforms"
xmlns:xxforms="http://orbeon.org/oxf/xml/xforms" xmlns:exist="http://exist.sourceforge.net/NS/exist" xmlns:pipeline="java:org.orbeon.oxf.processor.pipeline.PipelineFunctionLibrary">
<!-- Runs uninstall-db.xpl, then install-db.xpl.
     NOTE(review): nothing connects the two steps, so this presumably relies on them being run in document order - confirm. -->
<p:processor name="oxf:pipeline">
<p:input name="config" href="uninstall-db.xpl" />
</p:processor>
<p:processor name="oxf:pipeline">
<p:input name="config" href="install-db.xpl" />
</p:processor>
</p:config>

View File

@ -1,54 +0,0 @@
<!--
Scheduler
-->
<p:config xmlns:p="http://www.orbeon.com/oxf/pipeline" xmlns:oxf="http://www.orbeon.com/oxf/processors" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:xforms="http://www.w3.org/2002/xforms"
xmlns:xxforms="http://orbeon.org/oxf/xml/xforms" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:exist="http://exist.sourceforge.net/NS/exist" xmlns:pipeline="java:org.orbeon.oxf.processor.pipeline.PipelineFunctionLibrary">
<!-- Step 1: select the runnable actions of queue.xml - those without @after or whose @after is
     in the past - keeping only the ones whose priority equals the highest priority among them. -->
<p:processor name="oxf:pipeline">
<p:input name="config" href="data-access.xpl"/>
<p:input name="data">
<config>
<relpath>queue.xml</relpath>
<operation>read</operation>
<type>xquery</type>
</config>
</p:input>
<p:input name="param">
<xquery><![CDATA[
/queue/action[not(@after) or xs:dateTime(@after) < current-dateTime()][@priority=max(/queue/action[not(@after) or xs:dateTime(@after) < current-dateTime()]/@priority)]
]]></xquery>
</p:input>
<p:output name="data" id="actions" debug="actions"/>
</p:processor>
<!-- Step 2: for each selected action, load oxf:/actions/{type}.xpl and run it with the action as data. -->
<p:for-each href="#actions" select="/*/action">
<p:processor name="oxf:url-generator">
<p:input name="config" transform="oxf:xslt" href="current()">
<config xsl:version="2.0">
<url>
<xsl:text>oxf:/actions/</xsl:text>
<!-- Remove / and \ for security reasons -->
<xsl:value-of select="translate(/action/@type, '/\', '')"/>
<xsl:text>.xpl</xsl:text>
</url>
</config>
</p:input>
<p:output name="data" id="pipeline"/>
</p:processor>
<p:processor name="oxf:pipeline">
<p:input name="config" href="#pipeline"/>
<p:input name="data" href="current()"/>
</p:processor>
</p:for-each>
</p:config>

View File

@ -1,104 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet version="2.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:exsl="http://exslt.org/common" extension-element-prefixes="exsl" xmlns:xsltu="http://xsltunit.org/0/"
xmlns:owk="http://owark.org/xslt/" exclude-result-prefixes="exsl">
<xsl:import href="../actions/mediatypes/common-rewrite.xsl"/>
<xsl:import href="xsltunit.xsl"/>
<xsl:output method="xml" version="1.0" encoding="UTF-8" indent="yes"/>
<xsl:variable name="index" select="doc('local-names.xml')/index"/>
<xsl:variable name="resource" select="$index/resource[uri='http://gmpg.org/xfn/11']"/>
<xsl:key name="log-by-uri" match="/log/entry" use="uri"/>
<!--
  Test suite for the URL helpers of common-rewrite.xsl (owk:is-relative,
  owk:safer-resolve-uri, owk:url-rewrite). Each xsltu:test compares an
  expected fragment (nodes1) with the computed one (nodes2).
  Every assertion carries its own id so that a failure in the report can be
  traced to a single call; previously five assertions shared 'is-relative'.
-->
<xsl:template match="/">
  <xsltu:tests>
    <!-- A path starting with '/' is relative. -->
    <xsltu:test id="is-relative1">
      <xsl:call-template name="xsltu:assertEqual">
        <xsl:with-param name="id" select="'is-relative1'"/>
        <xsl:with-param name="nodes1">
          <is-relative>true</is-relative>
        </xsl:with-param>
        <xsl:with-param name="nodes2">
          <is-relative>
            <xsl:value-of select="owk:is-relative('/foo')"/>
          </is-relative>
        </xsl:with-param>
      </xsl:call-template>
    </xsltu:test>
    <!-- A URI with a scheme is not relative. -->
    <xsltu:test id="is-relative2">
      <xsl:call-template name="xsltu:assertEqual">
        <xsl:with-param name="id" select="'is-relative2'"/>
        <xsl:with-param name="nodes1">
          <is-relative>false</is-relative>
        </xsl:with-param>
        <xsl:with-param name="nodes2">
          <is-relative>
            <xsl:value-of select="owk:is-relative('http://example.com/foo')"/>
          </is-relative>
        </xsl:with-param>
      </xsl:call-template>
    </xsltu:test>
    <!-- A relative reference is resolved against the base. -->
    <xsltu:test id="safer-resolve-uri1">
      <xsl:call-template name="xsltu:assertEqual">
        <xsl:with-param name="id" select="'safer-resolve-uri1'"/>
        <xsl:with-param name="nodes1">
          <uri>http://example.com/foo</uri>
        </xsl:with-param>
        <xsl:with-param name="nodes2">
          <uri>
            <xsl:value-of select="owk:safer-resolve-uri('/foo', 'http://example.com/')"/>
          </uri>
        </xsl:with-param>
      </xsl:call-template>
    </xsltu:test>
    <!-- An absolute URI is returned unchanged. -->
    <xsltu:test id="safer-resolve-uri2">
      <xsl:call-template name="xsltu:assertEqual">
        <xsl:with-param name="id" select="'safer-resolve-uri2'"/>
        <xsl:with-param name="nodes1">
          <uri>http://owark.org/foo</uri>
        </xsl:with-param>
        <xsl:with-param name="nodes2">
          <uri>
            <xsl:value-of select="owk:safer-resolve-uri('http://owark.org/foo', 'http://example.com/')"/>
          </uri>
        </xsl:with-param>
      </xsl:call-template>
    </xsltu:test>
    <!-- An absolute URI containing curly braces is returned unchanged as well. -->
    <xsltu:test id="safer-resolve-uri3">
      <xsl:call-template name="xsltu:assertEqual">
        <xsl:with-param name="id" select="'safer-resolve-uri3'"/>
        <xsl:with-param name="nodes1">
          <uri>http://owark.org/foo{{{{}}}}</uri>
        </xsl:with-param>
        <xsl:with-param name="nodes2">
          <uri>
            <xsl:value-of select="owk:safer-resolve-uri('http://owark.org/foo{{{{}}}}', 'http://example.com/')"/>
          </uri>
        </xsl:with-param>
      </xsl:call-template>
    </xsltu:test>
    <!-- URL rewriting: an unknown target resolves to an absolute URI, a known one to its local name. -->
    <xsltu:test id="url-rewrite">
      <xsl:call-template name="xsltu:assertEqual">
        <xsl:with-param name="id" select="'rewrite1'"/>
        <xsl:with-param name="nodes1">
          <uri>http://gmpg.org/foo</uri>
        </xsl:with-param>
        <xsl:with-param name="nodes2">
          <uri>
            <xsl:value-of select="owk:url-rewrite('/foo')"/>
          </uri>
        </xsl:with-param>
      </xsl:call-template>
      <xsl:call-template name="xsltu:assertEqual">
        <xsl:with-param name="id" select="'rewrite2'"/>
        <xsl:with-param name="nodes1">
          <uri>../gmpg.org/11-1.html</uri>
        </xsl:with-param>
        <xsl:with-param name="nodes2">
          <uri>
            <xsl:value-of select="owk:url-rewrite('/xfn/11')"/>
          </uri>
        </xsl:with-param>
      </xsl:call-template>
    </xsltu:test>
  </xsltu:tests>
</xsl:template>
</xsl:stylesheet>

File diff suppressed because it is too large Load Diff

View File

@ -1,28 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet version="2.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:exsl="http://exslt.org/common" extension-element-prefixes="exsl" xmlns:xsltu="http://xsltunit.org/0/"
xmlns:owk="http://owark.org/xslt/" exclude-result-prefixes="exsl">
<xsl:import href="../actions/resource-index.xslt"/>
<xsl:import href="xsltunit.xsl"/>
<xsl:output method="xml" version="1.0" encoding="UTF-8" indent="yes"/>
<xsl:variable name="local-names" select="doc('local-names.xml')/index"/>
<xsl:key name="log-by-uri" match="/log/entry" use="uri"/>
<!--
  For every resource listed in local-names.xml, checks that
  owk:unique-local-name() applied to the log entry with the same URI (looked
  up in $source through key 'log-by-uri') yields the recorded <local-name>.
-->
<xsl:template match="/">
  <xsltu:tests>
    <xsl:for-each select="$local-names/resource">
      <xsl:variable name="expected-uri" select="uri"/>
      <xsltu:test id="{$expected-uri}">
        <xsl:call-template name="xsltu:assertEqual">
          <xsl:with-param name="id" select="$expected-uri"/>
          <xsl:with-param name="nodes1">
            <local-name>
              <xsl:value-of select="owk:unique-local-name(key('log-by-uri', $expected-uri, $source))"/>
            </local-name>
          </xsl:with-param>
          <xsl:with-param name="nodes2">
            <xsl:copy-of select="local-name"/>
          </xsl:with-param>
        </xsl:call-template>
      </xsltu:test>
    </xsl:for-each>
  </xsltu:tests>
</xsl:template>
</xsl:stylesheet>

File diff suppressed because it is too large Load Diff

View File

@ -1,158 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:exsl="http://exslt.org/common"
extension-element-prefixes="exsl"
xmlns:xsltu="http://xsltunit.org/0/"
exclude-result-prefixes="exsl">
<!--
  Asserts that $nodes1 and $nodes2 are equal according to xsltu:diff.
  The assertion (reported under $id) passes when the diff contains no
  xsltu:no-match element; on failure the whole diff is used as the message.
-->
<xsl:template name="xsltu:assertEqual">
  <xsl:param name="id"/>
  <xsl:param name="nodes1"/>
  <xsl:param name="nodes2"/>
  <xsl:variable name="diff-fragment">
    <xsl:call-template name="xsltu:diff">
      <xsl:with-param name="nodes1" select="exsl:node-set($nodes1)"/>
      <xsl:with-param name="nodes2" select="exsl:node-set($nodes2)"/>
    </xsl:call-template>
  </xsl:variable>
  <!-- Convert the result tree fragment once and reuse it for both the test and the message. -->
  <xsl:variable name="diff" select="exsl:node-set($diff-fragment)"/>
  <xsl:call-template name="xsltu:assert">
    <xsl:with-param name="id" select="$id"/>
    <xsl:with-param name="test" select="not($diff//xsltu:no-match)"/>
    <xsl:with-param name="message" select="$diff"/>
  </xsl:call-template>
</xsl:template>
<!--
  Asserts that $nodes1 and $nodes2 differ according to xsltu:diff.
  The assertion (reported under $id) passes when the diff contains at least
  one xsltu:no-match element; otherwise a fixed message is reported.
-->
<xsl:template name="xsltu:assertNotEqual">
  <xsl:param name="id"/>
  <xsl:param name="nodes1"/>
  <xsl:param name="nodes2"/>
  <xsl:variable name="diff-fragment">
    <xsl:call-template name="xsltu:diff">
      <xsl:with-param name="nodes1" select="exsl:node-set($nodes1)"/>
      <xsl:with-param name="nodes2" select="exsl:node-set($nodes2)"/>
    </xsl:call-template>
  </xsl:variable>
  <xsl:variable name="mismatches" select="exsl:node-set($diff-fragment)//xsltu:no-match"/>
  <xsl:call-template name="xsltu:assert">
    <xsl:with-param name="id" select="$id"/>
    <xsl:with-param name="test" select="$mismatches"/>
    <xsl:with-param name="message">Should have been different!</xsl:with-param>
  </xsl:call-template>
</xsl:template>
<!--
  Emits an xsltu:assert element for $id with outcome "passed" when $test is
  true and "failed" otherwise; a failed assertion also carries a copy of
  $message inside xsltu:message.
-->
<xsl:template name="xsltu:assert">
  <xsl:param name="id"/>
  <xsl:param name="test"/>
  <xsl:param name="message"/>
  <xsltu:assert id="{$id}">
    <xsl:attribute name="outcome">
      <xsl:choose>
        <xsl:when test="$test">passed</xsl:when>
        <xsl:otherwise>failed</xsl:otherwise>
      </xsl:choose>
    </xsl:attribute>
    <xsl:if test="not($test)">
      <xsltu:message>
        <xsl:copy-of select="$message"/>
      </xsltu:message>
    </xsl:if>
  </xsltu:assert>
</xsl:template>
<!--
  Compares $nodes1 with $nodes2 and emits an xsltu:diff element that contains
  an xsltu:no-match (with copies of both sides) for the first kind of
  difference found at this level: names, attribute count, child element
  count, child text node count. When none of those differ, attributes, child
  elements and text nodes are compared individually through the xsltu:diff mode.
-->
<xsl:template name="xsltu:diff">
<xsl:param name="nodes1"/>
<xsl:param name="nodes2"/>
<xsltu:diff name="{name($nodes1)}">
<xsl:choose>
<!-- NOTE(review): self::* tests the context node of the caller, not $nodes1; the name check
     therefore only runs when the template is called with an element as context - confirm intended. -->
<xsl:when test="self::* and (local-name($nodes1) != local-name($nodes2) or namespace-uri($nodes1) != namespace-uri($nodes2))">
<xsltu:no-match diff="names">
<xsltu:node>
<xsl:copy-of select="$nodes1"/>
</xsltu:node>
<xsltu:node>
<xsl:copy-of select="$nodes2"/>
</xsltu:node>
</xsltu:no-match>
</xsl:when>
<!-- Different number of attributes. -->
<xsl:when test="count($nodes1/@*) != count($nodes2/@*)">
<xsltu:no-match diff="number of children attributes ({count($nodes1/@*)} versus {count($nodes2/@*)} )">
<xsltu:node>
<xsl:copy-of select="$nodes1"/>
</xsltu:node>
<xsltu:node>
<xsl:copy-of select="$nodes2"/>
</xsltu:node>
</xsltu:no-match>
</xsl:when>
<!-- Different number of child elements. -->
<xsl:when test="count($nodes1/*) != count($nodes2/*)">
<xsltu:no-match diff="number of children elements ({count($nodes1/*)} versus {count($nodes2/*)} )">
<xsltu:node>
<xsl:copy-of select="$nodes1"/>
</xsltu:node>
<xsltu:node>
<xsl:copy-of select="$nodes2"/>
</xsltu:node>
</xsltu:no-match>
</xsl:when>
<!-- Different number of child text nodes. -->
<xsl:when test="count($nodes1/text()) != count($nodes2/text())">
<xsltu:no-match diff="number of children text nodes ({count($nodes1/text())} versus {count($nodes2/text())} )">
<xsltu:node>
<xsl:copy-of select="$nodes1"/>
</xsltu:node>
<xsltu:node>
<xsl:copy-of select="$nodes2"/>
</xsltu:node>
</xsltu:no-match>
</xsl:when>
<!-- Same shape at this level: compare attributes, child elements and text nodes one by one. -->
<xsl:otherwise>
<xsl:apply-templates select="$nodes1/@*" mode="xsltu:diff">
<xsl:with-param name="nodes2" select="$nodes2"/>
</xsl:apply-templates>
<xsl:apply-templates select="$nodes1/*" mode="xsltu:diff">
<xsl:with-param name="nodes2" select="$nodes2"/>
</xsl:apply-templates>
<xsl:apply-templates select="$nodes1/text()" mode="xsltu:diff">
<xsl:with-param name="nodes2" select="$nodes2"/>
</xsl:apply-templates>
</xsl:otherwise>
</xsl:choose>
</xsltu:diff>
</xsl:template>
<!-- Diff mode, elements: pairs the current element with the child element of $nodes2 at the
     same position and recurses through xsltu:diff. -->
<xsl:template match="*" mode="xsltu:diff">
<xsl:param name="pos" select="position()"/>
<xsl:param name="nodes2"/>
<xsl:param name="node2" select="$nodes2/*[position()=$pos]"/>
<xsl:call-template name="xsltu:diff">
<xsl:with-param name="nodes1" select="."/>
<xsl:with-param name="nodes2" select="$node2"/>
</xsl:call-template>
</xsl:template>
<!-- Diff mode, text nodes: compares the current text node with the text child of $nodes2 at the
     same position and reports an xsltu:no-match when they are not equal.
     The $current parameter is declared but not used in this template. -->
<xsl:template match="text()" mode="xsltu:diff">
<xsl:param name="current" select="."/>
<xsl:param name="pos" select="position()"/>
<xsl:param name="nodes2"/>
<xsl:param name="node2" select="$nodes2/text()[position()=$pos]"/>
<xsl:if test="not(. = $node2)">
<xsltu:no-match>
<xsltu:node>
<xsl:copy-of select="."/>
</xsltu:node>
<xsltu:node>
<xsl:copy-of select="$node2"/>
</xsltu:node>
</xsltu:no-match>
</xsl:if>
</xsl:template>
<!-- Diff mode, attributes: compares the current attribute with the attribute of $nodes2 that has
     the same local name and namespace URI, and reports an xsltu:no-match when they are not equal
     (which includes the case where no such attribute exists).
     The $current parameter is declared but not used in this template. -->
<xsl:template match="@*" mode="xsltu:diff">
<xsl:param name="current" select="."/>
<xsl:param name="nodes2"/>
<xsl:param name="node2" select="$nodes2/@*[local-name() = local-name(current()) and namespace-uri() = namespace-uri(current())]"/>
<xsl:if test="not(. = $node2)">
<xsltu:no-match>
<xsltu:node>
<xsl:copy-of select="."/>
</xsltu:node>
<xsltu:node>
<xsl:copy-of select="$node2"/>
</xsltu:node>
</xsltu:no-match>
</xsl:if>
</xsl:template>
</xsl:stylesheet>

View File

@ -1,33 +0,0 @@
<!--
Remove the database
-->
<p:config xmlns:p="http://www.orbeon.com/oxf/pipeline"
xmlns:oxf="http://www.orbeon.com/oxf/processors"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:xforms="http://www.w3.org/2002/xforms"
xmlns:xxforms="http://orbeon.org/oxf/xml/xforms"
xmlns:exist="http://exist.sourceforge.net/NS/exist"
xmlns:pipeline="java:org.orbeon.oxf.processor.pipeline.PipelineFunctionLibrary">
<!-- Sends a DELETE to {exist-root}{exist-db}, both read from oxf:/config.xml, i.e. to the
     whole database collection rather than a single document. -->
<p:processor name="oxf:xforms-submission">
<p:input name="submission" href="oxf:/config.xml"
transform="oxf:xslt">
<xforms:submission xsl:version="2.0" method="delete"
action="{/config/exist-root}{/config/exist-db}"
/>
</p:input>
<p:input name="request">
<empty/>
</p:input>
<p:output name="response" id="response1"/>
</p:processor>
<!-- The response is only consumed so that the submission above is executed. -->
<p:processor name="oxf:null-serializer">
<p:input name="data" href="#response1"/>
</p:processor>
</p:config>