Deleting what doesn't belong to Wordpress
This commit is contained in:
parent
10c0d87b93
commit
21807536ca
|
@ -1,45 +0,0 @@
|
|||
<project name="owark" default="dist" basedir=".">
    <description>
        Owark build file
    </description>

    <!-- Global properties for this build -->
    <property name="src" location="java/src"/>
    <property name="build" location="build"/>
    <property name="dist" location="dist"/>
    <!-- Directory holding the Orbeon Forms jars needed at compile time.
         The default is the location this build file always used; override it
         on the command line with -Dorbeon.lib=/path/to/WEB-INF/lib (Ant keeps
         the first definition of a property, so the command line wins). -->
    <property name="orbeon.lib" location="/home/vdv/projects/orbeon-forms/build/orbeon-war/WEB-INF/lib"/>

    <target name="init">
        <!-- Create the time stamp -->
        <tstamp/>
        <!-- Create the build directory structure used by compile -->
        <mkdir dir="${build}"/>
    </target>

    <target name="compile" depends="init"
            description="compile the source " >
        <!-- Compile the java code from ${src} into ${build} -->
        <javac srcdir="${src}" destdir="${build}">
            <classpath>
                <pathelement location="java/lib/heritrix-commons-3.1.0.jar"/>
                <pathelement location="java/lib/archive-overlay-commons-httpclient-3.1.jar"/>
                <pathelement location="${orbeon.lib}/commons-fileupload-1.2.2.jar"/>
                <pathelement location="${orbeon.lib}/orbeon.jar"/>
            </classpath>
        </javac>
    </target>

    <target name="dist" depends="compile"
            description="generate the distribution" >
        <!-- Create the distribution directory -->
        <mkdir dir="${dist}/lib"/>

        <!-- Put everything in ${build} into ${dist}/lib/owark.jar -->
        <jar jarfile="${dist}/lib/owark.jar" basedir="${build}"/>
    </target>

    <target name="clean"
            description="clean up" >
        <!-- Delete the ${build} and ${dist} directory trees -->
        <delete dir="${build}"/>
        <delete dir="${dist}"/>
    </target>
</project>
|
|
@ -1,146 +0,0 @@
|
|||
/**
|
||||
* Copyright (C) 2012 Eric van der Vlist.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it under the terms of the
|
||||
* GNU Lesser General Public License as published by the Free Software Foundation; either version
|
||||
* 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
|
||||
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See the GNU Lesser General Public License for more details.
|
||||
*
|
||||
* The full text of the license is available at http://www.gnu.org/copyleft/lesser.html
|
||||
*/
|
||||
package org.owark.orbeon;
|
||||
|
||||
import org.apache.commons.fileupload.FileItem;
|
||||
import org.orbeon.oxf.pipeline.api.PipelineContext;
|
||||
import org.orbeon.oxf.pipeline.api.XMLReceiver;
|
||||
import org.orbeon.oxf.processor.ProcessorImpl;
|
||||
import org.orbeon.oxf.processor.ProcessorInputOutputInfo;
|
||||
import org.orbeon.oxf.processor.ProcessorOutput;
|
||||
import org.orbeon.oxf.processor.ProcessorUtils;
|
||||
import org.orbeon.oxf.processor.serializer.BinaryTextXMLReceiver;
|
||||
import org.orbeon.oxf.util.NetUtils;
|
||||
import org.orbeon.oxf.xml.ContentHandlerHelper;
|
||||
import org.orbeon.oxf.xml.XMLConstants;
|
||||
import org.orbeon.oxf.xml.XMLUtils;
|
||||
import org.owark.warc.*;
|
||||
import org.xml.sax.Attributes;
|
||||
import org.xml.sax.helpers.AttributesImpl;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
|
||||
/**
|
||||
* This processor converts a WARC archive into an XML representation
|
||||
*/
|
||||
|
||||
public class FromWarcConverter extends ProcessorImpl {
|
||||
|
||||
static public String WARC_ELEMENT_ROOT_NAME = "warc";
|
||||
static public String RECORD_ELEMENT_NAME = "record";
|
||||
static public String HEADERS_ELEMENT_NAME = "headers";
|
||||
static public String HEADER_ELEMENT_NAME = "header";
|
||||
static public String NAME_ATTRIBUTE_NAME = "name";
|
||||
static public String CONTENT_ELEMENT_NAME = "content";
|
||||
|
||||
public FromWarcConverter() {
|
||||
addInputInfo(new ProcessorInputOutputInfo(INPUT_DATA));
|
||||
addOutputInfo(new ProcessorInputOutputInfo(OUTPUT_DATA));
|
||||
}
|
||||
|
||||
@Override
|
||||
public ProcessorOutput createOutput(String outputName) {
|
||||
final ProcessorOutput output = new ProcessorOutputImpl(FromWarcConverter.this,outputName) {
|
||||
|
||||
@Override
|
||||
protected void readImpl(PipelineContext pipelineContext, XMLReceiver xmlReceiver) {
|
||||
// Get FileItem
|
||||
try {
|
||||
ContentHandlerHelper helper = new ContentHandlerHelper(xmlReceiver);
|
||||
helper.startDocument();
|
||||
helper.startElement(WARC_ELEMENT_ROOT_NAME);
|
||||
final FileItem fileItem = NetUtils.prepareFileItem(NetUtils.REQUEST_SCOPE);
|
||||
// Read to OutputStream
|
||||
readInputAsSAX(pipelineContext, INPUT_DATA, new BinaryTextXMLReceiver(null, fileItem.getOutputStream(), true, false, null, false, false, null, false));
|
||||
// as an archive
|
||||
final WarcParser warcParser = new WarcParser(fileItem.getInputStream());
|
||||
while (warcParser.hasNext()) {
|
||||
helper.startElement(RECORD_ELEMENT_NAME);
|
||||
helper.startElement(HEADERS_ELEMENT_NAME);
|
||||
WarcRecord record = warcParser.next();
|
||||
WarcRecordHeader recordHeader = record.getHeader();
|
||||
while (recordHeader.hasNext()) {
|
||||
WarcField field = recordHeader.next();
|
||||
helper.startElement(HEADER_ELEMENT_NAME, new String[] {NAME_ATTRIBUTE_NAME, field.getKey()});
|
||||
helper.text(field.getValue());
|
||||
helper.endElement();
|
||||
}
|
||||
helper.endElement();
|
||||
helper.startElement(CONTENT_ELEMENT_NAME);
|
||||
WarcRecordContent content = record.getContent();
|
||||
if (content.hasRequestLine()) {
|
||||
helper.startElement("request");
|
||||
WarcRecordContent.HttpRequestLine request = content.getRequestLine();
|
||||
helper.element("method", request.getMethod());
|
||||
helper.element("uri", request.getUri());
|
||||
helper.element("version", request.getVersion());
|
||||
helper.endElement();
|
||||
} else if (content.hasStatusLine()) {
|
||||
helper.startElement("status");
|
||||
WarcRecordContent.HttpStatusLine status = content.getStatusLine();
|
||||
helper.element("version", status.getVersion());
|
||||
helper.element("status", status.getStatus());
|
||||
helper.element("reason", status.getReason());
|
||||
helper.endElement();
|
||||
}
|
||||
if (content.hasFields()) {
|
||||
helper.startElement(HEADERS_ELEMENT_NAME);
|
||||
while (content.hasNext()) {
|
||||
WarcField field = content.next();
|
||||
helper.startElement(HEADER_ELEMENT_NAME, new String[] {NAME_ATTRIBUTE_NAME, field.getKey()});
|
||||
helper.text(field.getValue());
|
||||
helper.endElement();
|
||||
}
|
||||
helper.endElement();
|
||||
}
|
||||
if (! content.endOfContent()) {
|
||||
helper.startPrefixMapping("xsi", "http://www.w3.org/2001/XMLSchema-instance");
|
||||
helper.startPrefixMapping("xs", "http://www.w3.org/2001/XMLSchema");
|
||||
String contentType = content.getPayloadContentType();
|
||||
AttributesImpl attributes = new AttributesImpl();
|
||||
attributes.addAttribute("", "content-type", "content-type", "CDATA", contentType);
|
||||
if (contentType.startsWith("text/") || contentType.matches(".*application/[^;]*xml.*")) {
|
||||
attributes.addAttribute(XMLConstants.XSI_URI, "type", "xsi:type", "CDATA", "xs:string");
|
||||
String encoding = content.getPayloadEncoding();
|
||||
if (encoding == null) {
|
||||
encoding = "utf-8";
|
||||
}
|
||||
helper.startElement(ProcessorUtils.DEFAULT_TEXT_DOCUMENT_ELEMENT, attributes);
|
||||
XMLUtils.readerToCharacters(new InputStreamReader(content, encoding), xmlReceiver);
|
||||
helper.endElement();
|
||||
} else {
|
||||
attributes.addAttribute(XMLConstants.XSI_URI, "type", "xsi:type", "CDATA", "xs:base64Binary");
|
||||
helper.startElement(ProcessorUtils.DEFAULT_BINARY_DOCUMENT_ELEMENT, attributes);
|
||||
XMLUtils.inputStreamToBase64Characters(new BufferedInputStream(content), xmlReceiver);
|
||||
helper.endElement();
|
||||
}
|
||||
}
|
||||
record.skipToEnd();
|
||||
helper.endElement();
|
||||
helper.endElement();
|
||||
}
|
||||
helper.endElement();
|
||||
helper.endDocument();
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
|
||||
}
|
||||
|
||||
}
|
||||
};
|
||||
addOutput(outputName, output);
|
||||
return output;
|
||||
}
|
||||
}
|
|
@ -1,49 +0,0 @@
|
|||
/**
|
||||
* Copyright (C) 2012 Eric van der Vlist.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it under the terms of the
|
||||
* GNU Lesser General Public License as published by the Free Software Foundation; either version
|
||||
* 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
|
||||
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See the GNU Lesser General Public License for more details.
|
||||
*
|
||||
* The full text of the license is available at http://www.gnu.org/copyleft/lesser.html
|
||||
*/
|
||||
package org.owark.warc;
|
||||
|
||||
/**
 * A single {@code name: value} field, as found in WARC record headers,
 * {@code application/warc-fields} blocks and HTTP headers.
 *
 * <p>The field is split on the first colon; both the name and the value are
 * trimmed. A line that contains no colon is kept as a field whose key is the
 * trimmed line and whose value is the empty string.</p>
 */
public class WarcField {

    private final String line;
    private final String key;
    private final String value;

    /**
     * Parses a field line.
     *
     * @param line the raw line, without its line terminator; must not be {@code null}
     * @throws NullPointerException if {@code line} is {@code null}
     */
    public WarcField(String line) {
        this.line = line;
        int sep = line.indexOf(':');
        if (sep < 0) {
            // Malformed field without separator: keep it rather than failing
            // with a StringIndexOutOfBoundsException on substring(0, -1).
            this.key = line.trim();
            this.value = "";
        } else {
            this.key = line.substring(0, sep).trim();
            this.value = line.substring(sep + 1).trim();
        }
    }

    /**
     * @return the trimmed text found before the first colon
     */
    public String getKey() {
        return key;
    }

    /**
     * @return the raw line this field was built from
     */
    public String getLine() {
        return line;
    }

    /**
     * @return the trimmed text found after the first colon
     */
    public String getValue() {
        return value;
    }

}
|
|
@ -1,123 +0,0 @@
|
|||
/**
|
||||
* Copyright (C) 2012 Eric van der Vlist.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it under the terms of the
|
||||
* GNU Lesser General Public License as published by the Free Software Foundation; either version
|
||||
* 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
|
||||
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See the GNU Lesser General Public License for more details.
|
||||
*
|
||||
* The full text of the license is available at http://www.gnu.org/copyleft/lesser.html
|
||||
*/
|
||||
package org.owark.warc;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.Iterator;
|
||||
|
||||
/**
|
||||
* Read WARC files
|
||||
*/
|
||||
public class WarcParser implements Iterator<WarcRecord> {
|
||||
|
||||
public static int BUFFER_SIZE = 1024;
|
||||
public static String CRLF = "\r\n";
|
||||
public static String CRLFCRLF = CRLF + CRLF;
|
||||
public static String MAGIC = "WARC/";
|
||||
private InputStream is;
|
||||
private byte[] buffer = new byte[BUFFER_SIZE];
|
||||
private int index = 0;
|
||||
private int limit = -1;
|
||||
private String magic;
|
||||
private int recordCount;
|
||||
|
||||
|
||||
public WarcParser(InputStream is) {
|
||||
this.is = is;
|
||||
resetBuffer();
|
||||
}
|
||||
|
||||
public String getMagic() throws IOException, WarcException {
|
||||
return this.magic;
|
||||
}
|
||||
|
||||
private void resetBuffer() {
|
||||
index = 0;
|
||||
}
|
||||
|
||||
private void readUntil(String stringPattern) throws IOException, WarcException {
|
||||
boolean matches = true;
|
||||
for (int i=0; i< stringPattern.length() && limit != 0; i++) {
|
||||
int c = read();
|
||||
buffer[index ++] = (byte) c;
|
||||
if (stringPattern.codePointAt(i) != c) {
|
||||
matches = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (matches) {
|
||||
return;
|
||||
}
|
||||
readUntil(stringPattern);
|
||||
}
|
||||
|
||||
protected String readLine() throws IOException, WarcException {
|
||||
readUntil(CRLF);
|
||||
String line = new String(buffer, 0, index - CRLF.length(), "UTF-8");
|
||||
resetBuffer();
|
||||
return line;
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
limit = -1;
|
||||
do {
|
||||
try {
|
||||
magic = readLine();
|
||||
} catch (Exception e) {
|
||||
return false;
|
||||
}
|
||||
} while (! magic.startsWith(MAGIC));
|
||||
return true;
|
||||
}
|
||||
|
||||
public WarcRecord next() {
|
||||
recordCount ++;
|
||||
return new WarcRecord(this);
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
}
|
||||
|
||||
public void setLimit(int limit) {
|
||||
this.limit = limit;
|
||||
}
|
||||
|
||||
public boolean isLimitReached() {
|
||||
return limit == 0;
|
||||
}
|
||||
|
||||
public int read() throws IOException {
|
||||
if (limit == 0) {
|
||||
return -1;
|
||||
}
|
||||
if (limit > 0) {
|
||||
limit--;
|
||||
}
|
||||
int c = is.read();
|
||||
//System.out.print((char) c);
|
||||
return c;
|
||||
}
|
||||
|
||||
public int getRecordCount() {
|
||||
return recordCount;
|
||||
}
|
||||
|
||||
class WarcException extends Exception {}
|
||||
class BufferOverflowException extends WarcException {}
|
||||
class BadMagicException extends WarcException {}
|
||||
|
||||
}
|
|
@ -1,84 +0,0 @@
|
|||
/**
|
||||
* Copyright (C) 2012 Eric van der Vlist.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it under the terms of the
|
||||
* GNU Lesser General Public License as published by the Free Software Foundation; either version
|
||||
* 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
|
||||
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See the GNU Lesser General Public License for more details.
|
||||
*
|
||||
* The full text of the license is available at http://www.gnu.org/copyleft/lesser.html
|
||||
*/
|
||||
package org.owark.warc;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: vdv
|
||||
* Date: 25 avr. 2012
|
||||
* Time: 17:29:35
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class WarcRecord {
|
||||
|
||||
private WarcParser warcParser;
|
||||
private WarcRecordHeader header;
|
||||
private WarcRecordContent content;
|
||||
|
||||
public WarcRecord(WarcParser warcParser) {
|
||||
this.warcParser = warcParser;
|
||||
}
|
||||
|
||||
public Object getMagic() throws IOException, WarcParser.WarcException {
|
||||
return warcParser.getMagic();
|
||||
}
|
||||
|
||||
public WarcRecordHeader getHeader() {
|
||||
if (header == null) {
|
||||
header = new WarcRecordHeader(this);
|
||||
}
|
||||
return header;
|
||||
}
|
||||
|
||||
public String readLine() throws IOException, WarcParser.WarcException {
|
||||
return warcParser.readLine();
|
||||
}
|
||||
|
||||
public String getType() {
|
||||
return header.getType();
|
||||
}
|
||||
|
||||
public String getContentType() {
|
||||
return header.getContentType();
|
||||
}
|
||||
|
||||
public WarcRecordContent getContent() {
|
||||
if (content == null) {
|
||||
warcParser.setLimit(getContentLength());
|
||||
content = new WarcRecordContent(this);
|
||||
}
|
||||
return content;
|
||||
}
|
||||
|
||||
public int getContentLength() {
|
||||
return header.getContentLength();
|
||||
}
|
||||
|
||||
public boolean isLimitReached() {
|
||||
return warcParser.isLimitReached();
|
||||
}
|
||||
|
||||
public int read() throws IOException {
|
||||
return warcParser.read();
|
||||
}
|
||||
|
||||
public void skipToEnd() throws IOException {
|
||||
getHeader();
|
||||
header.skipToEnd();
|
||||
getContent();
|
||||
content.skip(getContentLength());
|
||||
}
|
||||
}
|
|
@ -1,215 +0,0 @@
|
|||
/**
|
||||
* Copyright (C) 2012 Eric van der Vlist.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it under the terms of the
|
||||
* GNU Lesser General Public License as published by the Free Software Foundation; either version
|
||||
* 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
|
||||
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See the GNU Lesser General Public License for more details.
|
||||
*
|
||||
* The full text of the license is available at http://www.gnu.org/copyleft/lesser.html
|
||||
*/
|
||||
|
||||
package org.owark.warc;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.Iterator;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: vdv
|
||||
* Date: 25 avr. 2012
|
||||
* Time: 19:00:47
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class WarcRecordContent extends InputStream implements Iterator<WarcField> {
|
||||
|
||||
private WarcRecord warcRecord;
|
||||
private Exception e;
|
||||
private String line;
|
||||
private String payloadContentType;
|
||||
|
||||
public WarcRecordContent(WarcRecord warcRecord) {
|
||||
this.warcRecord = warcRecord;
|
||||
}
|
||||
|
||||
public boolean hasFields() {
|
||||
return warcRecord.getContentType().equals("application/warc-fields") || isHTTP();
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
try {
|
||||
line = warcRecord.readLine();
|
||||
} catch (Exception e) {
|
||||
this.e = e;
|
||||
}
|
||||
return ! (warcRecord.isLimitReached() || line.equals(""));
|
||||
}
|
||||
|
||||
public WarcField next() {
|
||||
if (line == null) {
|
||||
try {
|
||||
line = warcRecord.readLine();
|
||||
} catch (Exception e) {
|
||||
this.e = e;
|
||||
}
|
||||
}
|
||||
if (line.equals("")) {
|
||||
line = null;
|
||||
return null;
|
||||
}
|
||||
WarcField field = new WarcField(line);
|
||||
if (field.getKey().equals("Content-Type")) {
|
||||
this.payloadContentType = field.getValue();
|
||||
}
|
||||
line = null;
|
||||
return field;
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
//To change body of implemented methods use File | Settings | File Templates.
|
||||
}
|
||||
|
||||
@Override
|
||||
public int read() throws IOException {
|
||||
return warcRecord.read();
|
||||
}
|
||||
|
||||
public boolean isHTTP() {
|
||||
return warcRecord.getContentType().startsWith("application/http");
|
||||
}
|
||||
|
||||
public boolean isRequest() {
|
||||
return warcRecord.getType().equals("request");
|
||||
}
|
||||
|
||||
public HttpStatusLine getStatusLine() throws IOException, WarcParser.WarcException {
|
||||
return new HttpStatusLine(warcRecord.readLine());
|
||||
}
|
||||
|
||||
public boolean hasStatusLine() {
|
||||
return isHTTP() && ! isRequest();
|
||||
}
|
||||
|
||||
public boolean hasRequestLine() {
|
||||
return isHTTP() && isRequest();
|
||||
}
|
||||
|
||||
public boolean endOfContent() {
|
||||
return warcRecord.isLimitReached();
|
||||
}
|
||||
|
||||
public HttpRequestLine getRequestLine() throws IOException, WarcParser.WarcException {
|
||||
return new HttpRequestLine(warcRecord.readLine());
|
||||
}
|
||||
|
||||
public long getContentLength() {
|
||||
return warcRecord.getContentLength();
|
||||
}
|
||||
|
||||
public String getPayloadContentType() {
|
||||
String contentType = getPayloadContentHeader();
|
||||
if (contentType != null && contentType.contains(";")) {
|
||||
contentType = contentType.substring(0, contentType.indexOf(";"));
|
||||
}
|
||||
return contentType;
|
||||
}
|
||||
|
||||
public String getPayloadContentHeader() {
|
||||
String contentType = warcRecord.getContentType();
|
||||
if (contentType.equals("application/warc-fields") || contentType.equals("application/http; msgtype=request")) {
|
||||
return null;
|
||||
}
|
||||
if (contentType.equals("application/http; msgtype=response")) {
|
||||
contentType = this.payloadContentType;
|
||||
}
|
||||
return contentType;
|
||||
}
|
||||
|
||||
public String getPayloadEncoding() {
|
||||
String contentType = getPayloadContentHeader();
|
||||
if (contentType == null) {
|
||||
return contentType;
|
||||
}
|
||||
Pattern pattern = Pattern.compile(".*;\\s*charset\\s*=\\s*([^;]+).*");
|
||||
Matcher matcher = pattern.matcher(contentType);
|
||||
if (matcher.matches()) {
|
||||
return matcher.group(1).toLowerCase();
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
public class HttpStatusLine {
|
||||
|
||||
private String line;
|
||||
private String version;
|
||||
private String status;
|
||||
private String reason;
|
||||
|
||||
|
||||
public String getLine() {
|
||||
return line;
|
||||
}
|
||||
|
||||
public String getVersion() {
|
||||
return version;
|
||||
}
|
||||
|
||||
public String getStatus() {
|
||||
return status;
|
||||
}
|
||||
|
||||
public String getReason() {
|
||||
return reason;
|
||||
}
|
||||
|
||||
|
||||
protected HttpStatusLine(String line) {
|
||||
this.line = line;
|
||||
String[] tokens = line.split(" ", 3);
|
||||
this.version = tokens[0];
|
||||
this.status = tokens[1];
|
||||
this.reason = tokens[2];
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public class HttpRequestLine {
|
||||
|
||||
private String line;
|
||||
private String version;
|
||||
private String method;
|
||||
private String uri;
|
||||
|
||||
public String getLine() {
|
||||
return line;
|
||||
}
|
||||
|
||||
public String getVersion() {
|
||||
return version;
|
||||
}
|
||||
|
||||
public String getMethod() {
|
||||
return method;
|
||||
}
|
||||
|
||||
public String getUri() {
|
||||
return uri;
|
||||
}
|
||||
|
||||
public HttpRequestLine(String line) {
|
||||
this.line = line;
|
||||
String[] tokens = line.split(" ", 3);
|
||||
this.method = tokens[0];
|
||||
this.uri = tokens[1];
|
||||
this.version = tokens[2];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,103 +0,0 @@
|
|||
/**
|
||||
* Copyright (C) 2012 Eric van der Vlist.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it under the terms of the
|
||||
* GNU Lesser General Public License as published by the Free Software Foundation; either version
|
||||
* 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
|
||||
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See the GNU Lesser General Public License for more details.
|
||||
*
|
||||
* The full text of the license is available at http://www.gnu.org/copyleft/lesser.html
|
||||
*/
|
||||
package org.owark.warc;
|
||||
|
||||
import java.util.Hashtable;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: vdv
|
||||
* Date: 25 avr. 2012
|
||||
* Time: 17:50:01
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class WarcRecordHeader implements Iterator<WarcField> {
|
||||
|
||||
|
||||
public static String WARC_TYPE = "WARC-Type";
|
||||
public static String CONTENT_TYPE = "Content-Type";
|
||||
public static String CONTENT_LENGTH = "Content-Length";
|
||||
|
||||
private WarcRecord warcRecord;
|
||||
private String line;
|
||||
private Exception e;
|
||||
private Map<String,String> headers;
|
||||
private boolean endOfHeader = false;
|
||||
|
||||
|
||||
public WarcRecordHeader(WarcRecord warcRecord) {
|
||||
this.warcRecord = warcRecord;
|
||||
headers = new Hashtable<String, String>();
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
if (endOfHeader) {
|
||||
return false;
|
||||
}
|
||||
if (line == null) {
|
||||
try {
|
||||
line = warcRecord.readLine();
|
||||
} catch (Exception e) {
|
||||
this.e = e;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (line.equals("")) {
|
||||
endOfHeader = true;
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
public WarcField next() {
|
||||
if (endOfHeader) {
|
||||
return null;
|
||||
}
|
||||
if (line == null) {
|
||||
try {
|
||||
line = warcRecord.readLine();
|
||||
} catch (Exception e) {
|
||||
this.e = e;
|
||||
return null;
|
||||
}
|
||||
}
|
||||
WarcField item = new WarcField(line);
|
||||
line = null;
|
||||
headers.put(item.getKey(), item.getValue());
|
||||
return item;
|
||||
}
|
||||
|
||||
public String getType() {
|
||||
return headers.get(WARC_TYPE);
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
}
|
||||
|
||||
public String getContentType() {
|
||||
return headers.get(CONTENT_TYPE);
|
||||
}
|
||||
|
||||
public int getContentLength() {
|
||||
return Integer.parseInt(headers.get(CONTENT_LENGTH));
|
||||
}
|
||||
|
||||
public void skipToEnd() {
|
||||
while (hasNext()) {
|
||||
next();
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,306 +0,0 @@
|
|||
/**
|
||||
* Copyright (C) 2012 Eric van der Vlist.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it under the terms of the
|
||||
* GNU Lesser General Public License as published by the Free Software Foundation; either version
|
||||
* 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
|
||||
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See the GNU Lesser General Public License for more details.
|
||||
*
|
||||
* The full text of the license is available at http://www.gnu.org/copyleft/lesser.html
|
||||
*/
|
||||
package org.owark.warc;
|
||||
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.*;
|
||||
|
||||
/**
|
||||
* Test cases for WarcParser
|
||||
*/
|
||||
public class WarcParserTest {
|
||||
|
||||
private static WarcParser warcParser;
|
||||
|
||||
@Test
|
||||
public void testDyomedea() throws IOException, WarcParser.WarcException {
|
||||
|
||||
// WARC
|
||||
|
||||
File file = new File("/home/vdv/projects/owark/archiver/java/test/org/owark/warc/dyomedea.warc");
|
||||
WarcParser warcParser = new WarcParser(new FileInputStream(file));
|
||||
Assert.assertEquals(true, warcParser.hasNext());
|
||||
|
||||
// RECORD (warcinfo)
|
||||
|
||||
WarcRecord record = warcParser.next();
|
||||
Assert.assertEquals("WARC/1.0", warcParser.getMagic());
|
||||
Assert.assertNotNull(record);
|
||||
Assert.assertEquals("WARC/1.0", record.getMagic());
|
||||
|
||||
// HEADER
|
||||
|
||||
WarcRecordHeader header = record.getHeader();
|
||||
Assert.assertNotNull(header);
|
||||
Assert.assertNull(header.getType());
|
||||
Assert.assertEquals(true, header.hasNext());
|
||||
WarcField headerItem = header.next();
|
||||
Assert.assertNotNull(headerItem);
|
||||
Assert.assertEquals(WarcRecordHeader.WARC_TYPE, headerItem.getKey());
|
||||
Assert.assertEquals("warcinfo", headerItem.getValue());
|
||||
Assert.assertEquals("warcinfo", header.getType());
|
||||
Assert.assertEquals("warcinfo", record.getType());
|
||||
Assert.assertEquals(true, header.hasNext());
|
||||
headerItem = header.next();
|
||||
Assert.assertNotNull(headerItem);
|
||||
Assert.assertEquals("WARC-Date", headerItem.getKey());
|
||||
Assert.assertEquals("2012-04-23T10:05:24Z", headerItem.getValue());
|
||||
headerItem = header.next();
|
||||
headerItem = header.next();
|
||||
headerItem = header.next();
|
||||
headerItem = header.next();
|
||||
Assert.assertNotNull(headerItem);
|
||||
Assert.assertEquals("Content-Length", headerItem.getKey());
|
||||
Assert.assertEquals("369", headerItem.getValue());
|
||||
Assert.assertEquals(false, header.hasNext());
|
||||
headerItem = header.next();
|
||||
Assert.assertNull(headerItem);
|
||||
Assert.assertEquals("application/warc-fields", record.getContentType());
|
||||
Assert.assertEquals(369, record.getContentLength());
|
||||
|
||||
// Content
|
||||
|
||||
WarcRecordContent content = record.getContent();
|
||||
Assert.assertNotNull(content);
|
||||
Assert.assertEquals(true, content.hasFields());
|
||||
Assert.assertEquals(false, content.isHTTP());
|
||||
Assert.assertEquals(false, content.hasStatusLine());
|
||||
Assert.assertEquals(false, content.hasRequestLine());
|
||||
Assert.assertEquals(true, content.hasNext());
|
||||
WarcField field = content.next();
|
||||
Assert.assertEquals(false, content.endOfContent());
|
||||
Assert.assertNotNull(field);
|
||||
Assert.assertEquals("software", field.getKey());
|
||||
Assert.assertEquals("Heritrix/3.1.0 http://crawler.archive.org", field.getValue());
|
||||
field = content.next();
|
||||
field = content.next();
|
||||
field = content.next();
|
||||
field = content.next();
|
||||
field = content.next();
|
||||
field = content.next();
|
||||
field = content.next();
|
||||
field = content.next();
|
||||
Assert.assertNotNull(field);
|
||||
Assert.assertEquals("http-header-user-agent", field.getKey());
|
||||
Assert.assertEquals("Mozilla/5.0 (compatible; heritrix/3.1.0 +http://owark.org)", field.getValue());
|
||||
Assert.assertEquals(false, content.hasNext());
|
||||
Assert.assertNull(content.getPayloadContentType());
|
||||
Assert.assertNull(content.getPayloadContentHeader());
|
||||
Assert.assertNull(content.getPayloadEncoding());
|
||||
Assert.assertEquals(true, content.endOfContent());
|
||||
|
||||
// Next record (DNS response)
|
||||
|
||||
Assert.assertEquals(true, warcParser.hasNext());
|
||||
record = warcParser.next();
|
||||
Assert.assertNotNull(record);
|
||||
|
||||
// Header
|
||||
|
||||
header = record.getHeader();
|
||||
Assert.assertNotNull(header);
|
||||
Assert.assertNull(header.getType());
|
||||
Assert.assertEquals(true, header.hasNext());
|
||||
headerItem = header.next();
|
||||
Assert.assertNotNull(headerItem);
|
||||
Assert.assertEquals(WarcRecordHeader.WARC_TYPE, headerItem.getKey());
|
||||
Assert.assertEquals("response", headerItem.getValue());
|
||||
|
||||
header.skipToEnd();
|
||||
|
||||
// Content
|
||||
|
||||
content = record.getContent();
|
||||
Assert.assertNotNull(content);
|
||||
Assert.assertEquals(false, content.hasFields());
|
||||
Assert.assertEquals(false, content.isHTTP());
|
||||
Assert.assertEquals(false, content.hasStatusLine());
|
||||
Assert.assertEquals(false, content.hasRequestLine());
|
||||
Assert.assertEquals(false, content.endOfContent());
|
||||
BufferedReader reader = new BufferedReader(new InputStreamReader(content, "UTF-8"));
|
||||
String line = reader.readLine();
|
||||
Assert.assertEquals("20120423100524", line);
|
||||
line = reader.readLine();
|
||||
Assert.assertEquals("dyomedea.com.\t\t1800\tIN\tA\t95.142.167.137", line);
|
||||
line = reader.readLine();
|
||||
Assert.assertEquals(true, content.endOfContent());
|
||||
Assert.assertEquals("text/dns", content.getPayloadContentType());
|
||||
Assert.assertEquals("text/dns", content.getPayloadContentHeader());
|
||||
Assert.assertNull(content.getPayloadEncoding());
|
||||
Assert.assertNull(line);
|
||||
|
||||
// Next record (HTTP response)
|
||||
|
||||
Assert.assertEquals(true, warcParser.hasNext());
|
||||
record = warcParser.next();
|
||||
Assert.assertNotNull(record);
|
||||
|
||||
// Header
|
||||
|
||||
header = record.getHeader();
|
||||
Assert.assertNotNull(header);
|
||||
Assert.assertNull(header.getType());
|
||||
Assert.assertEquals(true, header.hasNext());
|
||||
headerItem = header.next();
|
||||
Assert.assertNotNull(headerItem);
|
||||
Assert.assertEquals(WarcRecordHeader.WARC_TYPE, headerItem.getKey());
|
||||
Assert.assertEquals("response", headerItem.getValue());
|
||||
|
||||
header.skipToEnd();
|
||||
|
||||
// Content
|
||||
|
||||
content = record.getContent();
|
||||
Assert.assertNotNull(content);
|
||||
Assert.assertEquals(true, content.hasFields());
|
||||
Assert.assertEquals(true, content.isHTTP());
|
||||
Assert.assertEquals(false, content.isRequest());
|
||||
Assert.assertEquals(true, content.hasStatusLine());
|
||||
Assert.assertEquals(false, content.hasRequestLine());
|
||||
WarcRecordContent.HttpStatusLine status = content.getStatusLine();
|
||||
Assert.assertNotNull(status);
|
||||
Assert.assertEquals("HTTP/1.1 404 Introuvable", status.getLine());
|
||||
Assert.assertEquals("HTTP/1.1", status.getVersion());
|
||||
Assert.assertEquals("404", status.getStatus());
|
||||
Assert.assertEquals("Introuvable", status.getReason());
|
||||
field = content.next();
|
||||
Assert.assertNotNull(field);
|
||||
Assert.assertEquals("Date", field.getKey());
|
||||
Assert.assertEquals("Mon, 23 Apr 2012 10:05:27 GMT", field.getValue());
|
||||
field = content.next();
|
||||
field = content.next();
|
||||
field = content.next();
|
||||
field = content.next();
|
||||
field = content.next();
|
||||
field = content.next();
|
||||
Assert.assertNotNull(field);
|
||||
Assert.assertEquals("Connection", field.getKey());
|
||||
Assert.assertEquals("close", field.getValue());
|
||||
Assert.assertEquals(false, content.hasNext());
|
||||
Assert.assertEquals(false, content.endOfContent());
|
||||
reader = new BufferedReader(new InputStreamReader(content, "UTF-8"));
|
||||
line = reader.readLine();
|
||||
Assert.assertEquals("<html><head><title>Apache Tomcat/6.0.24 - Rapport d'erreur</title>", line.substring(0, line.indexOf("<style>")));
|
||||
line = reader.readLine();
|
||||
Assert.assertNull(line);
|
||||
Assert.assertEquals("text/html", content.getPayloadContentType());
|
||||
Assert.assertEquals("text/html;charset=utf-8", content.getPayloadContentHeader());
|
||||
Assert.assertEquals("utf-8", content.getPayloadEncoding());
|
||||
Assert.assertEquals(true, content.endOfContent());
|
||||
|
||||
|
||||
// Next record
|
||||
|
||||
Assert.assertEquals(true, warcParser.hasNext());
|
||||
record = warcParser.next();
|
||||
Assert.assertNotNull(record);
|
||||
|
||||
// Header
|
||||
|
||||
header = record.getHeader();
|
||||
Assert.assertNotNull(header);
|
||||
Assert.assertNull(header.getType());
|
||||
Assert.assertEquals(true, header.hasNext());
|
||||
headerItem = header.next();
|
||||
Assert.assertNotNull(headerItem);
|
||||
Assert.assertEquals(WarcRecordHeader.WARC_TYPE, headerItem.getKey());
|
||||
Assert.assertEquals("request", headerItem.getValue());
|
||||
|
||||
header.skipToEnd();
|
||||
|
||||
// Content
|
||||
|
||||
content = record.getContent();
|
||||
Assert.assertNotNull(content);
|
||||
Assert.assertEquals(true, content.hasFields());
|
||||
Assert.assertEquals(true, content.isHTTP());
|
||||
Assert.assertEquals(true, content.isRequest());
|
||||
Assert.assertEquals(false, content.hasStatusLine());
|
||||
Assert.assertEquals(true, content.hasRequestLine());
|
||||
WarcRecordContent.HttpRequestLine request = content.getRequestLine();
|
||||
Assert.assertEquals("GET /robots.txt HTTP/1.0", request.getLine());
|
||||
Assert.assertEquals("GET", request.getMethod());
|
||||
Assert.assertEquals("/robots.txt", request.getUri());
|
||||
Assert.assertEquals("HTTP/1.0", request.getVersion());
|
||||
field = content.next();
|
||||
Assert.assertNotNull(field);
|
||||
Assert.assertEquals("User-Agent", field.getKey());
|
||||
Assert.assertEquals("Mozilla/5.0 (compatible; heritrix/3.1.0 +http://owark.org)", field.getValue());
|
||||
field = content.next();
|
||||
field = content.next();
|
||||
field = content.next();
|
||||
Assert.assertNotNull(field);
|
||||
Assert.assertEquals("Host", field.getKey());
|
||||
Assert.assertEquals("dyomedea.com", field.getValue());
|
||||
Assert.assertEquals(false, content.hasNext());
|
||||
Assert.assertEquals(true, content.endOfContent());
|
||||
|
||||
|
||||
// Skip record
|
||||
|
||||
Assert.assertEquals(true, warcParser.hasNext());
|
||||
record = warcParser.next();
|
||||
Assert.assertNotNull(record);
|
||||
record.skipToEnd();
|
||||
Assert.assertEquals(true, warcParser.hasNext());
|
||||
record = warcParser.next();
|
||||
|
||||
// Header
|
||||
|
||||
header = record.getHeader();
|
||||
Assert.assertNotNull(header);
|
||||
Assert.assertNull(header.getType());
|
||||
Assert.assertEquals(true, header.hasNext());
|
||||
headerItem = header.next();
|
||||
Assert.assertNotNull(headerItem);
|
||||
Assert.assertEquals(WarcRecordHeader.WARC_TYPE, headerItem.getKey());
|
||||
Assert.assertEquals("response", headerItem.getValue());
|
||||
record.skipToEnd();
|
||||
|
||||
// Go to last record
|
||||
|
||||
while (warcParser.hasNext()) {
|
||||
record = warcParser.next();
|
||||
Assert.assertNotNull(record);
|
||||
record.skipToEnd();
|
||||
}
|
||||
|
||||
Assert.assertEquals(69, warcParser.getRecordCount());
|
||||
Assert.assertEquals("metadata", record.getType());
|
||||
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void skipToEnd() throws IOException, WarcParser.WarcException {
|
||||
File file = new File("/home/vdv/projects/owark/archiver/java/test/org/owark/warc/dyomedea.warc");
|
||||
WarcParser warcParser = new WarcParser(new FileInputStream(file));
|
||||
Assert.assertEquals(true, warcParser.hasNext());
|
||||
WarcRecord record = warcParser.next();
|
||||
WarcRecordHeader header = record.getHeader();
|
||||
while (header.hasNext()) {
|
||||
Assert.assertNotNull(header.next());
|
||||
}
|
||||
WarcRecordContent content = record.getContent();
|
||||
while (content.hasNext()) {
|
||||
Assert.assertNotNull(content.next());
|
||||
}
|
||||
record.skipToEnd();
|
||||
|
||||
|
||||
}
|
||||
|
||||
} ;
|
File diff suppressed because it is too large
Load Diff
|
@ -1,11 +0,0 @@
|
|||
Pipelines in this directory are called by the scheduler.
|
||||
|
||||
Their name is the name of the corresponding action.
|
||||
|
||||
Inputs:
|
||||
|
||||
* data: the action
|
||||
|
||||
Outputs: None
|
||||
|
||||
These pipelines must take care of removing the action from the queue once they are done.
|
|
@ -1,330 +0,0 @@
|
|||
<p:config xmlns:p="http://www.orbeon.com/oxf/pipeline" xmlns:oxf="http://www.orbeon.com/oxf/processors" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:saxon="http://saxon.sf.net/">
|
||||
|
||||
<p:param name="data" type="input"/>
|
||||
|
||||
<!-- Check whether the resource has already been archived for that set -->
|
||||
<p:processor name="oxf:pipeline">
|
||||
<p:input name="config" href="/data-access.xpl"/>
|
||||
<p:input name="data" transform="oxf:xslt" href="#data">
|
||||
<config xsl:version="2.0">
|
||||
<relpath>
|
||||
<xsl:value-of select="/action/@directory"/>
|
||||
<xsl:text>index.xml</xsl:text>
|
||||
</relpath>
|
||||
<operation>read</operation>
|
||||
<type>xquery</type>
|
||||
<parameter name="url" type="string">
|
||||
<xsl:value-of select="/action/@url"/>
|
||||
</parameter>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:input name="param">
|
||||
<xquery><![CDATA[
|
||||
|
||||
boolean(//archive[@url = $(url)])
|
||||
|
||||
]]></xquery>
|
||||
</p:input>
|
||||
<p:output name="data" id="duplicate" debug="duplicate"/>
|
||||
</p:processor>
|
||||
|
||||
<p:choose href="#duplicate">
|
||||
|
||||
<p:when test="/*/* = 'true'">
|
||||
<!-- Already archived, nothing to do -->
|
||||
<!-- Update the queue -->
|
||||
<p:processor name="oxf:pipeline">
|
||||
<p:input name="config" href="/data-access.xpl"/>
|
||||
<p:input name="data" transform="oxf:xslt" href="#data">
|
||||
<config xsl:version="2.0">
|
||||
<relpath>queue.xml</relpath>
|
||||
<operation>write</operation>
|
||||
<type>xquery</type>
|
||||
<parameter name="uuid" type="string">
|
||||
<xsl:value-of select="/action/@uuid"/>
|
||||
</parameter>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:input name="param">
|
||||
<xquery><![CDATA[
|
||||
|
||||
for $a in /queue/action where $a/@uuid = $(uuid) return
|
||||
update
|
||||
delete $a
|
||||
|
||||
]]></xquery>
|
||||
</p:input>
|
||||
<p:output name="data" id="response4" debug="response"/>
|
||||
</p:processor>
|
||||
<p:processor name="oxf:null-serializer">
|
||||
<p:input name="data" href="#response4"/>
|
||||
</p:processor>
|
||||
</p:when>
|
||||
|
||||
<p:otherwise>
|
||||
<!-- Otherwise, archive the resource... -->
|
||||
<!-- Fetch the resource -->
|
||||
<p:processor name="oxf:url-generator">
|
||||
<p:input name="config" transform="oxf:xslt" href="#data">
|
||||
<config xsl:version="2.0">
|
||||
<url>
|
||||
<xsl:value-of select="/action/@url"/>
|
||||
</url>
|
||||
<header>
|
||||
<name>User-Agent</name>
|
||||
<value>
|
||||
<xsl:value-of select="doc('oxf:/config.xml')/config/user-agent"/>
|
||||
</value>
|
||||
</header>
|
||||
<mode>archive</mode>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:output name="data" id="archive" debug="archive"/>
|
||||
</p:processor>
|
||||
|
||||
|
||||
<!-- Store the archive in the database -->
|
||||
<p:processor name="oxf:pipeline">
|
||||
<p:input name="config" href="/data-access.xpl"/>
|
||||
<p:input name="data" transform="oxf:xslt" href="#data">
|
||||
<config xsl:version="2.0">
|
||||
<relpath>
|
||||
<xsl:value-of select="/action/@directory"/>
|
||||
<xsl:value-of select="/action/@filename"/>
|
||||
</relpath>
|
||||
<operation>write</operation>
|
||||
<type>document</type>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:input name="param" href="#archive"/>
|
||||
<p:output name="data" id="response2"/>
|
||||
</p:processor>
|
||||
|
||||
<p:processor name="oxf:null-serializer">
|
||||
<p:input name="data" href="#response2"/>
|
||||
</p:processor>
|
||||
|
||||
|
||||
<!-- Test the type of document to see if it needs to be rewritten -->
|
||||
<p:choose href="#archive">
|
||||
|
||||
<!-- HTML or CSS document: need to update the links... -->
|
||||
<p:when test="/archive/response/document/@content-type=('text/html', 'text/css')">
|
||||
|
||||
<!-- Call the corresponding pipeline to extract the links and rewrite them -->
|
||||
<p:processor name="oxf:url-generator">
|
||||
<p:input name="config" transform="oxf:xslt" href="#archive">
|
||||
<config xsl:version="2.0">
|
||||
<url>
|
||||
<xsl:text>oxf:/actions/mediatypes/</xsl:text>
|
||||
<xsl:value-of select="substring-after(/archive/response/document/@content-type, '/')"/>
|
||||
<xsl:text>.xpl</xsl:text>
|
||||
</url>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:output name="data" id="pipeline"/>
|
||||
</p:processor>
|
||||
|
||||
<p:processor name="oxf:pipeline">
|
||||
<p:input name="config" href="#pipeline"/>
|
||||
<p:input name="archive" href="#archive"/>
|
||||
<p:output name="rewritten" id="rewritten"/>
|
||||
<p:output name="links" id="links"/>
|
||||
</p:processor>
|
||||
|
||||
<!-- It's a hack so that the document is not submitted as text through the xforms:submit processor... -->
|
||||
<p:processor name="oxf:xslt">
|
||||
<p:input name="config">
|
||||
<document xsl:version="2.0">
|
||||
<xsl:copy-of select="/"/>
|
||||
</document>
|
||||
</p:input>
|
||||
<p:input name="data" href="#rewritten"/>
|
||||
<p:output name="data" id="rewritten-embedded"/>
|
||||
</p:processor>
|
||||
|
||||
<!-- Store the rewritten document in the database -->
|
||||
<p:processor name="oxf:pipeline">
|
||||
<p:input name="config" href="/data-access.xpl"/>
|
||||
<p:input name="data" transform="oxf:xslt" href="#data">
|
||||
<config xsl:version="2.0">
|
||||
<relpath>
|
||||
<xsl:value-of select="/action/@directory"/>
|
||||
<xsl:text>rewritten-</xsl:text>
|
||||
<xsl:value-of select="/action/@filename"/>
|
||||
</relpath>
|
||||
<operation>write</operation>
|
||||
<type>document</type>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:input name="param" href="#rewritten-embedded"/>
|
||||
<p:output name="data" id="response3"/>
|
||||
</p:processor>
|
||||
<p:processor name="oxf:null-serializer">
|
||||
<p:input name="data" href="#response3"/>
|
||||
</p:processor>
|
||||
|
||||
|
||||
|
||||
<!-- Update the archive index -->
|
||||
<p:processor name="oxf:pipeline">
|
||||
<p:input name="config" href="/data-access.xpl"/>
|
||||
<p:input name="data" transform="oxf:xslt" href="#data">
|
||||
<config xsl:version="2.0">
|
||||
<relpath>
|
||||
<xsl:value-of select="/action/@directory"/>
|
||||
<xsl:text>index.xml</xsl:text>
|
||||
</relpath>
|
||||
<operation>write</operation>
|
||||
<type>xquery</type>
|
||||
<parameter name="url" type="string">
|
||||
<xsl:value-of select="/action/@url"/>
|
||||
</parameter>
|
||||
<parameter name="filename" type="string">
|
||||
<xsl:value-of select="/action/@filename"/>
|
||||
</parameter>
|
||||
<parameter name="filename-rewritten" type="string">
|
||||
<xsl:text>rewritten-</xsl:text>
|
||||
<xsl:value-of select="/action/@filename"/>
|
||||
</parameter>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:input name="param">
|
||||
<xquery><![CDATA[
|
||||
for $as in /archive-set
|
||||
return
|
||||
update
|
||||
insert <archive url=$(url) href=$(filename) href-rewritten=$(filename-rewritten) dateTime="{current-dateTime()}"/>
|
||||
into $as
|
||||
]]></xquery>
|
||||
</p:input>
|
||||
<p:output name="data" id="response1"/>
|
||||
</p:processor>
|
||||
<p:processor name="oxf:null-serializer">
|
||||
<p:input name="data" href="#response1"/>
|
||||
</p:processor>
|
||||
|
||||
<!-- Update the queue -->
|
||||
<p:processor name="oxf:pipeline">
|
||||
<p:input name="config" href="/data-access.xpl"/>
|
||||
<p:input name="data" transform="oxf:xslt" href="aggregate('root', #data, #links)">
|
||||
<config xsl:version="2.0">
|
||||
<relpath>queue.xml</relpath>
|
||||
<operation>write</operation>
|
||||
<type>xquery</type>
|
||||
<parameter name="directory" type="string">
|
||||
<xsl:value-of select="/root/action/@directory"/>
|
||||
</parameter>
|
||||
<parameter name="uuid" type="string">
|
||||
<xsl:value-of select="/root/action/@uuid"/>
|
||||
</parameter>
|
||||
<parameter name="priority" type="string">
|
||||
<xsl:value-of select="/root/action/@priority"/>
|
||||
</parameter>
|
||||
<parameter name="links" type="node-set">
|
||||
<xsl:copy-of select="/root/links"/>
|
||||
</parameter>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:input name="param">
|
||||
<xquery><![CDATA[
|
||||
declare namespace util = "http://exist-db.org/xquery/util";
|
||||
declare variable $links := $(links);
|
||||
|
||||
for $q in /queue[$links/link/@abs-href]
|
||||
return
|
||||
update
|
||||
insert
|
||||
for $href in distinct-values($links/link/@abs-href)
|
||||
let $link := $links/link[@abs-href = $href][1]
|
||||
return <action priority=$(priority) uuid="{util:uuid()}" type="archive-resource" url="{$link/@abs-href}" directory=$(directory) filename="{$link/@filename}"/>
|
||||
into $q,
|
||||
|
||||
for $a in /queue/action where $a/@uuid = $(uuid) return
|
||||
update
|
||||
delete $a
|
||||
|
||||
]]></xquery>
|
||||
</p:input>
|
||||
<p:output name="data" id="response4" debug="response"/>
|
||||
</p:processor>
|
||||
<p:processor name="oxf:null-serializer">
|
||||
<p:input name="data" href="#response4"/>
|
||||
</p:processor>
|
||||
|
||||
|
||||
</p:when>
|
||||
|
||||
<!-- Otherwise: no need to rewrite -->
|
||||
<p:otherwise>
|
||||
<!-- Update the archive index -->
|
||||
<p:processor name="oxf:pipeline">
|
||||
<p:input name="config" href="/data-access.xpl"/>
|
||||
<p:input name="data" transform="oxf:xslt" href="#data">
|
||||
<config xsl:version="2.0">
|
||||
<relpath>
|
||||
<xsl:value-of select="/action/@directory"/>
|
||||
<xsl:text>index.xml</xsl:text>
|
||||
</relpath>
|
||||
<operation>write</operation>
|
||||
<type>xquery</type>
|
||||
<parameter name="url" type="string">
|
||||
<xsl:value-of select="/action/@url"/>
|
||||
</parameter>
|
||||
<parameter name="filename" type="string">
|
||||
<xsl:value-of select="/action/@filename"/>
|
||||
</parameter>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:input name="param">
|
||||
<xquery><![CDATA[
|
||||
for $as in /archive-set
|
||||
return
|
||||
update
|
||||
insert <archive url=$(url) href=$(filename) dateTime="{current-dateTime()}"/>
|
||||
into $as
|
||||
]]></xquery>
|
||||
</p:input>
|
||||
<p:output name="data" id="response1"/>
|
||||
</p:processor>
|
||||
<p:processor name="oxf:null-serializer">
|
||||
<p:input name="data" href="#response1"/>
|
||||
</p:processor>
|
||||
|
||||
<!-- Update the queue -->
|
||||
<p:processor name="oxf:pipeline">
|
||||
<p:input name="config" href="/data-access.xpl"/>
|
||||
<p:input name="data" transform="oxf:xslt" href="#data">
|
||||
<config xsl:version="2.0">
|
||||
<relpath>queue.xml</relpath>
|
||||
<operation>write</operation>
|
||||
<type>xquery</type>
|
||||
<parameter name="uuid" type="string">
|
||||
<xsl:value-of select="/action/@uuid"/>
|
||||
</parameter>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:input name="param">
|
||||
<xquery><![CDATA[
|
||||
|
||||
for $a in /queue/action where $a/@uuid = $(uuid) return
|
||||
update
|
||||
delete $a
|
||||
|
||||
]]></xquery>
|
||||
</p:input>
|
||||
<p:output name="data" id="response4" debug="response"/>
|
||||
</p:processor>
|
||||
<p:processor name="oxf:null-serializer">
|
||||
<p:input name="data" href="#response4"/>
|
||||
</p:processor>
|
||||
|
||||
</p:otherwise>
|
||||
|
||||
</p:choose>
|
||||
</p:otherwise>
|
||||
</p:choose>
|
||||
|
||||
|
||||
|
||||
</p:config>
|
|
@ -1,96 +0,0 @@
|
|||
|
||||
<!--
|
||||
|
||||
Create a new archive
|
||||
|
||||
-->
|
||||
|
||||
<p:config xmlns:p="http://www.orbeon.com/oxf/pipeline" xmlns:oxf="http://www.orbeon.com/oxf/processors" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:xforms="http://www.w3.org/2002/xforms"
|
||||
xmlns:xxforms="http://orbeon.org/oxf/xml/xforms" xmlns:exist="http://exist.sourceforge.net/NS/exist" xmlns:saxon="http://saxon.sf.net/" xmlns:pipeline="java:org.orbeon.oxf.processor.pipeline.PipelineFunctionLibrary">
|
||||
|
||||
<p:param name="data" type="input"/>
|
||||
|
||||
<p:processor name="oxf:unsafe-xslt">
|
||||
<p:input name="data" href="#data"/>
|
||||
<p:input name="config">
|
||||
<config xsl:version="2.0">
|
||||
<relpath>queue.xml</relpath>
|
||||
<operation>write</operation>
|
||||
<type>xquery</type>
|
||||
<parameter name="directory" type="string">
|
||||
<xsl:value-of select="translate(/action/@uuid, '-', '/')"/>
|
||||
<xsl:text>/</xsl:text>
|
||||
</parameter>
|
||||
<parameter name="filename" type="string">
|
||||
<xsl:value-of select="saxon:string-to-hexBinary(/action/@url, 'utf-8')"/>
|
||||
<xsl:text>.xml</xsl:text>
|
||||
</parameter>
|
||||
<parameter name="uuid" type="string">
|
||||
<xsl:value-of select="/action/@uuid"/>
|
||||
</parameter>
|
||||
<parameter name="url" type="string">
|
||||
<xsl:value-of select="/action/@url"/>
|
||||
</parameter>
|
||||
<parameter name="priority-resource" type="string">
|
||||
<xsl:value-of select="/action/@priority + 2"/>
|
||||
</parameter>
|
||||
<parameter name="priority-package" type="string">
|
||||
<xsl:value-of select="/action/@priority + 1"/>
|
||||
</parameter>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:output name="data" id="data-access-data"/>
|
||||
</p:processor>
|
||||
|
||||
<p:processor name="oxf:pipeline">
|
||||
<p:input name="config" href="/data-access.xpl"/>
|
||||
<p:input name="data" href="#data-access-data"/>
|
||||
<p:input name="param">
|
||||
<xquery><![CDATA[
|
||||
declare namespace util = "http://exist-db.org/xquery/util";
|
||||
|
||||
for $q in /queue return
|
||||
update
|
||||
insert (<action priority=$(priority-resource) uuid="{util:uuid()}" type="archive-resource" url=$(url) directory=$(directory) filename=$(filename)/>,
|
||||
<action priority=$(priority-package) uuid="{util:uuid()}" type="package-archive" directory=$(directory)/>)
|
||||
into $q,
|
||||
|
||||
for $a in /queue/action where $a/@uuid = $(uuid) return
|
||||
update
|
||||
delete $a
|
||||
|
||||
]]></xquery>
|
||||
</p:input>
|
||||
<p:output name="data" id="response" debug="response"/>
|
||||
</p:processor>
|
||||
|
||||
<p:processor name="oxf:null-serializer">
|
||||
<p:input name="data" href="#response"/>
|
||||
</p:processor>
|
||||
|
||||
<p:processor name="oxf:pipeline">
|
||||
<p:input name="config" href="/data-access.xpl"/>
|
||||
<p:input name="data" transform="oxf:xslt" href="#data-access-data">
|
||||
<config xsl:version="2.0">
|
||||
<relpath>
|
||||
<xsl:value-of select="/config/parameter[@name='directory']"/>
|
||||
<xsl:text>index.xml</xsl:text>
|
||||
</relpath>
|
||||
<operation>write</operation>
|
||||
<type>document</type>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:input name="param" transform="oxf:xslt" href="#data">
|
||||
<archive-set xsl:version="2.0">
|
||||
<xsl:copy-of select="/action/@url|/action/@uuid"/>
|
||||
</archive-set>
|
||||
</p:input>
|
||||
<p:output name="data" id="response2" debug="response2"/>
|
||||
</p:processor>
|
||||
|
||||
<p:processor name="oxf:null-serializer">
|
||||
<p:input name="data" href="#response2"/>
|
||||
</p:processor>
|
||||
|
||||
|
||||
</p:config>
|
|
@ -1,695 +0,0 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!--
|
||||
HERITRIX 3 CRAWL JOB CONFIGURATION FILE
|
||||
|
||||
This is a relatively minimal configuration suitable for many crawls.
|
||||
|
||||
Commented-out beans and properties are provided as an example; values
|
||||
shown in comments reflect the actual defaults which are in effect
|
||||
if not otherwise specified. (To change from the default
|
||||
behavior, uncomment AND alter the shown values.)
|
||||
-->
|
||||
<beans xmlns="http://www.springframework.org/schema/beans"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xmlns:context="http://www.springframework.org/schema/context"
|
||||
xmlns:aop="http://www.springframework.org/schema/aop"
|
||||
xmlns:tx="http://www.springframework.org/schema/tx"
|
||||
xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans-3.0.xsd
|
||||
http://www.springframework.org/schema/aop http://www.springframework.org/schema/aop/spring-aop-3.0.xsd
|
||||
http://www.springframework.org/schema/tx http://www.springframework.org/schema/tx/spring-tx-3.0.xsd
|
||||
http://www.springframework.org/schema/context http://www.springframework.org/schema/context/spring-context-3.0.xsd">
|
||||
|
||||
<context:annotation-config/>
|
||||
|
||||
<!--
|
||||
OVERRIDES
|
||||
Values elsewhere in the configuration may be replaced ('overridden')
|
||||
by a Properties map declared in a PropertiesOverrideConfigurer,
|
||||
using a dotted-bean-path to address individual bean properties.
|
||||
This allows us to collect a few of the most-often changed values
|
||||
in an easy-to-edit format here at the beginning of the model
|
||||
configuration.
|
||||
-->
|
||||
<!-- overrides from a text property list -->
|
||||
<bean id="simpleOverrides" class="org.springframework.beans.factory.config.PropertyOverrideConfigurer">
|
||||
<property name="properties">
|
||||
<value>
|
||||
# This Properties map is specified in the Java 'property list' text format
|
||||
# http://java.sun.com/javase/6/docs/api/java/util/Properties.html#load%28java.io.Reader%29
|
||||
|
||||
metadata.operatorContactUrl=http://owark.org
|
||||
metadata.jobName=basic
|
||||
metadata.description=Basic crawl starting with useful defaults
|
||||
|
||||
##..more?..##
|
||||
</value>
|
||||
</property>
|
||||
</bean>
|
||||
|
||||
<!-- overrides from declared <prop> elements, more easily allowing
|
||||
multiline values or even declared beans -->
|
||||
<bean id="longerOverrides" class="org.springframework.beans.factory.config.PropertyOverrideConfigurer">
|
||||
<property name="properties">
|
||||
<props>
|
||||
<prop key="seeds.textSource.value">
|
||||
|
||||
# URLS HERE
|
||||
<url xmlns=""/>
|
||||
|
||||
</prop>
|
||||
</props>
|
||||
</property>
|
||||
</bean>
|
||||
|
||||
<!-- CRAWL METADATA: including identification of crawler/operator -->
|
||||
<bean id="metadata" class="org.archive.modules.CrawlMetadata" autowire="byName">
|
||||
<property name="operatorContactUrl" value="[see override above]"/>
|
||||
<property name="jobName" value="[see override above]"/>
|
||||
<property name="description" value="[see override above]"/>
|
||||
<!-- <property name="robotsPolicyName" value="obey"/> -->
|
||||
<!-- <property name="operator" value=""/> -->
|
||||
<!-- <property name="operatorFrom" value=""/> -->
|
||||
<!-- <property name="organization" value=""/> -->
|
||||
<!-- <property name="audience" value=""/> -->
|
||||
<!-- <property name="userAgentTemplate"
|
||||
value="Mozilla/5.0 (compatible; heritrix/@VERSION@ +@OPERATOR_CONTACT_URL@)"/> -->
|
||||
|
||||
</bean>
|
||||
|
||||
<!-- SEEDS: crawl starting points
|
||||
ConfigString allows simple, inline specification of a moderate
|
||||
number of seeds; see below comment for example of using an
|
||||
arbitrarily-large external file. -->
|
||||
<bean id="seeds" class="org.archive.modules.seeds.TextSeedModule">
|
||||
<property name="textSource">
|
||||
<bean class="org.archive.spring.ConfigString">
|
||||
<property name="value">
|
||||
<value>
|
||||
# [see override above]
|
||||
</value>
|
||||
</property>
|
||||
</bean>
|
||||
</property>
|
||||
<!-- <property name='sourceTagSeeds' value='false'/> -->
|
||||
<!-- <property name='blockAwaitingSeedLines' value='-1'/> -->
|
||||
</bean>
|
||||
|
||||
<!-- SEEDS ALTERNATE APPROACH: specifying external seeds.txt file in
|
||||
the job directory, similar to the H1 approach.
|
||||
Use either the above, or this, but not both. -->
|
||||
<!--
|
||||
<bean id="seeds" class="org.archive.modules.seeds.TextSeedModule">
|
||||
<property name="textSource">
|
||||
<bean class="org.archive.spring.ConfigFile">
|
||||
<property name="path" value="seeds.txt" />
|
||||
</bean>
|
||||
</property>
|
||||
<property name='sourceTagSeeds' value='false'/>
|
||||
<property name='blockAwaitingSeedLines' value='-1'/>
|
||||
</bean>
|
||||
-->
|
||||
|
||||
<!-- SCOPE: rules for which discovered URIs to crawl; order is very
|
||||
important because last decision returned other than 'NONE' wins. -->
|
||||
<bean id="scope" class="org.archive.modules.deciderules.DecideRuleSequence">
|
||||
<!-- <property name="logToFile" value="false" /> -->
|
||||
<property name="rules">
|
||||
<list>
|
||||
<!-- Begin by REJECTing all... -->
|
||||
<bean class="org.archive.modules.deciderules.RejectDecideRule">
|
||||
</bean>
|
||||
<!-- ...then ACCEPT those within configured/seed-implied SURT prefixes... -->
|
||||
<bean class="org.archive.modules.deciderules.surt.SurtPrefixedDecideRule">
|
||||
<!-- <property name="seedsAsSurtPrefixes" value="true" /> -->
|
||||
<!-- <property name="alsoCheckVia" value="false" /> -->
|
||||
<!-- <property name="surtsSourceFile" value="" /> -->
|
||||
<!-- <property name="surtsDumpFile" value="${launchId}/surts.dump" /> -->
|
||||
<!-- <property name="surtsSource">
|
||||
<bean class="org.archive.spring.ConfigString">
|
||||
<property name="value">
|
||||
<value>
|
||||
# example.com
|
||||
# http://www.example.edu/path1/
|
||||
# +http://(org,example,
|
||||
</value>
|
||||
</property>
|
||||
</bean>
|
||||
</property> -->
|
||||
</bean>
|
||||
<!-- ...but REJECT those more than a configured link-hop-count from start... -->
|
||||
<bean class="org.archive.modules.deciderules.TooManyHopsDecideRule">
|
||||
<property name="maxHops" value="0" />
|
||||
</bean>
|
||||
<!-- ...but ACCEPT those reached through a limited number of transclusion hops (see maxTransHops / maxSpeculativeHops below)... -->
|
||||
<bean class="org.archive.modules.deciderules.TransclusionDecideRule">
|
||||
<!-- <property name="maxTransHops" value="2" /> -->
|
||||
<!-- <property name="maxSpeculativeHops" value="1" /> -->
|
||||
</bean>
|
||||
<!-- ...but REJECT those from a configurable (initially empty) set of REJECT SURTs... -->
|
||||
<bean class="org.archive.modules.deciderules.surt.SurtPrefixedDecideRule">
|
||||
<property name="decision" value="REJECT"/>
|
||||
<property name="seedsAsSurtPrefixes" value="false"/>
|
||||
<property name="surtsDumpFile" value="${launchId}/negative-surts.dump" />
|
||||
<!-- <property name="surtsSource">
|
||||
<bean class="org.archive.spring.ConfigFile">
|
||||
<property name="path" value="negative-surts.txt" />
|
||||
</bean>
|
||||
</property> -->
|
||||
</bean>
|
||||
<!-- ...and REJECT those from a configurable (initially empty) set of URI regexes... -->
|
||||
<bean class="org.archive.modules.deciderules.MatchesListRegexDecideRule">
|
||||
<property name="decision" value="REJECT"/>
|
||||
<!-- <property name="listLogicalOr" value="true" /> -->
|
||||
<!-- <property name="regexList">
|
||||
<list>
|
||||
</list>
|
||||
</property> -->
|
||||
</bean>
|
||||
<!-- ...and REJECT those with suspicious repeating path-segments... -->
|
||||
<bean class="org.archive.modules.deciderules.PathologicalPathDecideRule">
|
||||
<!-- <property name="maxRepetitions" value="2" /> -->
|
||||
</bean>
|
||||
<!-- ...and REJECT those with more than threshold number of path-segments... -->
|
||||
<bean class="org.archive.modules.deciderules.TooManyPathSegmentsDecideRule">
|
||||
<!-- <property name="maxPathDepth" value="20" /> -->
|
||||
</bean>
|
||||
<!-- ...but always ACCEPT those marked as prerequisite for another URI... -->
|
||||
<bean class="org.archive.modules.deciderules.PrerequisiteAcceptDecideRule">
|
||||
</bean>
|
||||
<!-- ...but always REJECT those with unsupported URI schemes -->
|
||||
<bean class="org.archive.modules.deciderules.SchemeNotInSetDecideRule">
|
||||
</bean>
|
||||
</list>
|
||||
</property>
|
||||
</bean>
|
||||
|
||||
<!--
|
||||
PROCESSING CHAINS
|
||||
Much of the crawler's work is specified by the sequential
|
||||
application of swappable Processor modules. These Processors
|
||||
are collected into three 'chains'. The CandidateChain is applied
|
||||
to URIs being considered for inclusion, before a URI is enqueued
|
||||
for collection. The FetchChain is applied to URIs when their
|
||||
turn for collection comes up. The DispositionChain is applied
|
||||
after a URI is fetched and analyzed/link-extracted.
|
||||
-->
|
||||
|
||||
<!-- CANDIDATE CHAIN -->
|
||||
<!-- first, processors are declared as top-level named beans -->
|
||||
<bean id="candidateScoper" class="org.archive.crawler.prefetch.CandidateScoper">
|
||||
</bean>
|
||||
<bean id="preparer" class="org.archive.crawler.prefetch.FrontierPreparer">
|
||||
<!-- <property name="preferenceDepthHops" value="-1" /> -->
|
||||
<!-- <property name="preferenceEmbedHops" value="1" /> -->
|
||||
<!-- <property name="canonicalizationPolicy">
|
||||
<ref bean="canonicalizationPolicy" />
|
||||
</property> -->
|
||||
<!-- <property name="queueAssignmentPolicy">
|
||||
<ref bean="queueAssignmentPolicy" />
|
||||
</property> -->
|
||||
<!-- <property name="uriPrecedencePolicy">
|
||||
<ref bean="uriPrecedencePolicy" />
|
||||
</property> -->
|
||||
<!-- <property name="costAssignmentPolicy">
|
||||
<ref bean="costAssignmentPolicy" />
|
||||
</property> -->
|
||||
</bean>
|
||||
<!-- now, processors are assembled into ordered CandidateChain bean -->
|
||||
<bean id="candidateProcessors" class="org.archive.modules.CandidateChain">
|
||||
<property name="processors">
|
||||
<list>
|
||||
<!-- apply scoping rules to each individual candidate URI... -->
|
||||
<ref bean="candidateScoper"/>
|
||||
<!-- ...then prepare those ACCEPTed to be enqueued to frontier. -->
|
||||
<ref bean="preparer"/>
|
||||
</list>
|
||||
</property>
|
||||
</bean>
|
||||
|
||||
<!-- FETCH CHAIN -->
|
||||
<!-- first, processors are declared as top-level named beans -->
|
||||
<bean id="preselector" class="org.archive.crawler.prefetch.Preselector">
|
||||
<!-- <property name="recheckScope" value="false" /> -->
|
||||
<!-- <property name="blockAll" value="false" /> -->
|
||||
<!-- <property name="blockByRegex" value="" /> -->
|
||||
<!-- <property name="allowByRegex" value="" /> -->
|
||||
</bean>
|
||||
<bean id="preconditions" class="org.archive.crawler.prefetch.PreconditionEnforcer">
|
||||
<!-- <property name="ipValidityDurationSeconds" value="21600" /> -->
|
||||
<!-- <property name="robotsValidityDurationSeconds" value="86400" /> -->
|
||||
<!-- <property name="calculateRobotsOnly" value="false" /> -->
|
||||
</bean>
|
||||
<bean id="fetchDns" class="org.archive.modules.fetcher.FetchDNS">
|
||||
<!-- <property name="acceptNonDnsResolves" value="false" /> -->
|
||||
<!-- <property name="digestContent" value="true" /> -->
|
||||
<!-- <property name="digestAlgorithm" value="sha1" /> -->
|
||||
</bean>
|
||||
<bean id="fetchWhois" class="org.archive.modules.fetcher.FetchWhois">
|
||||
<property name="specialQueryTemplates">
|
||||
<map>
|
||||
<entry key="whois.verisign-grs.com" value="domain %s" />
|
||||
<entry key="whois.arin.net" value="z + %s" />
|
||||
<entry key="whois.denic.de" value="-T dn %s" />
|
||||
</map>
|
||||
</property>
|
||||
</bean>
|
||||
<bean id="fetchHttp" class="org.archive.modules.fetcher.FetchHTTP">
|
||||
<!-- <property name="useHTTP11" value="false" /> -->
|
||||
<!-- <property name="maxLengthBytes" value="0" /> -->
|
||||
<!-- <property name="timeoutSeconds" value="1200" /> -->
|
||||
<!-- <property name="maxFetchKBSec" value="0" /> -->
|
||||
<!-- <property name="defaultEncoding" value="ISO-8859-1" /> -->
|
||||
<!-- <property name="shouldFetchBodyRule">
|
||||
<bean class="org.archive.modules.deciderules.AcceptDecideRule"/>
|
||||
</property> -->
|
||||
<!-- <property name="soTimeoutMs" value="20000" /> -->
|
||||
<!-- <property name="sendIfModifiedSince" value="true" /> -->
|
||||
<!-- <property name="sendIfNoneMatch" value="true" /> -->
|
||||
<!-- <property name="sendConnectionClose" value="true" /> -->
|
||||
<!-- <property name="sendReferer" value="true" /> -->
|
||||
<!-- <property name="sendRange" value="false" /> -->
|
||||
<!-- <property name="ignoreCookies" value="false" /> -->
|
||||
<!-- <property name="sslTrustLevel" value="OPEN" /> -->
|
||||
<!-- <property name="acceptHeaders">
|
||||
<list>
|
||||
<value>Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8</value>
|
||||
</list>
|
||||
</property>
|
||||
-->
|
||||
<!-- <property name="httpBindAddress" value="" /> -->
|
||||
<!-- <property name="httpProxyHost" value="" /> -->
|
||||
<!-- <property name="httpProxyPort" value="0" /> -->
|
||||
<!-- <property name="httpProxyUser" value="" /> -->
|
||||
<!-- <property name="httpProxyPassword" value="" /> -->
|
||||
<!-- <property name="digestContent" value="true" /> -->
|
||||
<!-- <property name="digestAlgorithm" value="sha1" /> -->
|
||||
</bean>
|
||||
<bean id="extractorHttp" class="org.archive.modules.extractor.ExtractorHTTP">
|
||||
</bean>
|
||||
<bean id="extractorHtml" class="org.archive.modules.extractor.ExtractorHTML">
|
||||
<!-- <property name="extractJavascript" value="true" /> -->
|
||||
<!-- <property name="extractValueAttributes" value="true" /> -->
|
||||
<!-- <property name="ignoreFormActionUrls" value="false" /> -->
|
||||
<!-- <property name="extractOnlyFormGets" value="true" /> -->
|
||||
<!-- <property name="treatFramesAsEmbedLinks" value="true" /> -->
|
||||
<!-- <property name="ignoreUnexpectedHtml" value="true" /> -->
|
||||
<!-- <property name="maxElementLength" value="1024" /> -->
|
||||
<!-- <property name="maxAttributeNameLength" value="1024" /> -->
|
||||
<!-- <property name="maxAttributeValueLength" value="16384" /> -->
|
||||
</bean>
|
||||
<bean id="extractorCss" class="org.archive.modules.extractor.ExtractorCSS">
|
||||
</bean>
|
||||
<bean id="extractorJs" class="org.archive.modules.extractor.ExtractorJS">
|
||||
</bean>
|
||||
<bean id="extractorSwf" class="org.archive.modules.extractor.ExtractorSWF">
|
||||
</bean>
|
||||
<!-- now, processors are assembled into ordered FetchChain bean -->
|
||||
<bean id="fetchProcessors" class="org.archive.modules.FetchChain">
|
||||
<property name="processors">
|
||||
<list>
|
||||
<!-- re-check scope, if so enabled... -->
|
||||
<ref bean="preselector"/>
|
||||
<!-- ...then verify or trigger prerequisite URIs fetched, allow crawling... -->
|
||||
<ref bean="preconditions"/>
|
||||
<!-- ...fetch if DNS URI... -->
|
||||
<ref bean="fetchDns"/>
|
||||
<ref bean="fetchWhois"/>
|
||||
<!-- ...fetch if HTTP URI... -->
|
||||
<ref bean="fetchHttp"/>
|
||||
<!-- ...extract outlinks from HTTP headers... -->
|
||||
<ref bean="extractorHttp"/>
|
||||
<!-- ...extract outlinks from HTML content... -->
|
||||
<ref bean="extractorHtml"/>
|
||||
<!-- ...extract outlinks from CSS content... -->
|
||||
<ref bean="extractorCss"/>
|
||||
<!-- ...extract outlinks from Javascript content... -->
|
||||
<ref bean="extractorJs"/>
|
||||
<!-- ...extract outlinks from Flash content... -->
|
||||
<ref bean="extractorSwf"/>
|
||||
</list>
|
||||
</property>
|
||||
</bean>
|
||||
|
||||
<!-- DISPOSITION CHAIN -->
|
||||
<!-- first, processors are declared as top-level named beans -->
|
||||
<bean id="warcWriter" class="org.archive.modules.writer.WARCWriterProcessor">
|
||||
<property name="compress" value="false" />
|
||||
<!-- <property name="prefix" value="IAH" /> -->
|
||||
<!-- <property name="suffix" value="${HOSTNAME}" /> -->
|
||||
<!-- <property name="maxFileSizeBytes" value="1000000000" /> -->
|
||||
<!-- <property name="poolMaxActive" value="1" /> -->
|
||||
<!-- <property name="MaxWaitForIdleMs" value="500" /> -->
|
||||
<!-- <property name="skipIdenticalDigests" value="false" /> -->
|
||||
<!-- <property name="maxTotalBytesToWrite" value="0" /> -->
|
||||
<!-- <property name="directory" value="${launchId}" /> -->
|
||||
<!-- <property name="storePaths">
|
||||
<list>
|
||||
<value>warcs</value>
|
||||
</list>
|
||||
</property> -->
|
||||
<!-- <property name="writeRequests" value="true" /> -->
|
||||
<!-- <property name="writeMetadata" value="true" /> -->
|
||||
<!-- <property name="writeRevisitForIdenticalDigests" value="true" /> -->
|
||||
<!-- <property name="writeRevisitForNotModified" value="true" /> -->
|
||||
</bean>
|
||||
<bean id="candidates" class="org.archive.crawler.postprocessor.CandidatesProcessor">
|
||||
<!-- <property name="seedsRedirectNewSeeds" value="true" /> -->
|
||||
</bean>
|
||||
<bean id="disposition" class="org.archive.crawler.postprocessor.DispositionProcessor">
|
||||
<!-- <property name="delayFactor" value="5.0" /> -->
|
||||
<!-- <property name="minDelayMs" value="3000" /> -->
|
||||
<!-- <property name="respectCrawlDelayUpToSeconds" value="300" /> -->
|
||||
<!-- <property name="maxDelayMs" value="30000" /> -->
|
||||
<!-- <property name="maxPerHostBandwidthUsageKbSec" value="0" /> -->
|
||||
</bean>
|
||||
<!-- <bean id="rescheduler" class="org.archive.crawler.postprocessor.ReschedulingProcessor">
|
||||
<property name="rescheduleDelaySeconds" value="-1" />
|
||||
</bean> -->
|
||||
<!-- now, processors are assembled into ordered DispositionChain bean -->
|
||||
<bean id="dispositionProcessors" class="org.archive.modules.DispositionChain">
|
||||
<property name="processors">
|
||||
<list>
|
||||
<!-- write to aggregate archival files... -->
|
||||
<ref bean="warcWriter"/>
|
||||
<!-- ...send each outlink candidate URI to CandidateChain,
|
||||
and enqueue those ACCEPTed to the frontier... -->
|
||||
<ref bean="candidates"/>
|
||||
<!-- ...then update stats, shared-structures, frontier decisions -->
|
||||
<ref bean="disposition"/>
|
||||
<!-- <ref bean="rescheduler" /> -->
|
||||
</list>
|
||||
</property>
|
||||
</bean>
|
||||
|
||||
<!-- CRAWLCONTROLLER: Control interface, unifying context -->
|
||||
<bean id="crawlController"
|
||||
class="org.archive.crawler.framework.CrawlController">
|
||||
<!-- <property name="maxToeThreads" value="25" /> -->
|
||||
<!-- <property name="pauseAtStart" value="true" /> -->
|
||||
<!-- <property name="runWhileEmpty" value="false" /> -->
|
||||
<!-- <property name="recorderInBufferBytes" value="524288" /> -->
|
||||
<!-- <property name="recorderOutBufferBytes" value="16384" /> -->
|
||||
<!-- <property name="scratchDir" value="scratch" /> -->
|
||||
</bean>
|
||||
|
||||
<!-- FRONTIER: Record of all URIs discovered and queued-for-collection -->
|
||||
<bean id="frontier"
|
||||
class="org.archive.crawler.frontier.BdbFrontier">
|
||||
<!-- <property name="queueTotalBudget" value="-1" /> -->
|
||||
<!-- <property name="balanceReplenishAmount" value="3000" /> -->
|
||||
<!-- <property name="errorPenaltyAmount" value="100" /> -->
|
||||
<!-- <property name="precedenceFloor" value="255" /> -->
|
||||
<!-- <property name="queuePrecedencePolicy">
|
||||
<bean class="org.archive.crawler.frontier.precedence.BaseQueuePrecedencePolicy" />
|
||||
</property> -->
|
||||
<!-- <property name="snoozeLongMs" value="300000" /> -->
|
||||
<!-- <property name="retryDelaySeconds" value="900" /> -->
|
||||
<!-- <property name="maxRetries" value="30" /> -->
|
||||
<!-- <property name="recoveryLogEnabled" value="true" /> -->
|
||||
<!-- <property name="maxOutlinks" value="6000" /> -->
|
||||
<!-- <property name="extractIndependently" value="false" /> -->
|
||||
<!-- <property name="outbound">
|
||||
<bean class="java.util.concurrent.ArrayBlockingQueue">
|
||||
<constructor-arg value="200"/>
|
||||
<constructor-arg value="true"/>
|
||||
</bean>
|
||||
</property> -->
|
||||
<!-- <property name="inbound">
|
||||
<bean class="java.util.concurrent.ArrayBlockingQueue">
|
||||
<constructor-arg value="40000"/>
|
||||
<constructor-arg value="true"/>
|
||||
</bean>
|
||||
</property> -->
|
||||
<!-- <property name="dumpPendingAtClose" value="false" /> -->
|
||||
</bean>
|
||||
|
||||
<!-- URI UNIQ FILTER: Used by frontier to remember already-included URIs -->
|
||||
<bean id="uriUniqFilter"
|
||||
class="org.archive.crawler.util.BdbUriUniqFilter">
|
||||
</bean>
|
||||
|
||||
<!--
|
||||
EXAMPLE SETTINGS OVERLAY SHEETS
|
||||
Sheets allow some settings to vary by context - usually by URI context,
|
||||
so that different sites or sections of sites can be treated differently.
|
||||
Here are some example Sheets for common purposes. The SheetOverlaysManager
|
||||
(below) automatically collects all Sheet instances declared among the
|
||||
original beans, but others can be added during the crawl via the scripting
|
||||
interface.
|
||||
-->
|
||||
|
||||
<!-- forceRetire: any URI to which this sheet's settings are applied
|
||||
will force its containing queue to 'retired' status. -->
|
||||
<bean id='forceRetire' class='org.archive.spring.Sheet'>
|
||||
<property name='map'>
|
||||
<map>
|
||||
<entry key='disposition.forceRetire' value='true'/>
|
||||
</map>
|
||||
</property>
|
||||
</bean>
|
||||
|
||||
<!-- smallBudget: any URI to which this sheet's settings are applied
|
||||
will give its containing queue small values for balanceReplenishAmount
|
||||
(causing it to have shorter 'active' periods while other queues are
|
||||
waiting) and queueTotalBudget (causing the queue to enter 'retired'
|
||||
status once that expenditure is reached by URI attempts and errors) -->
|
||||
<bean id='smallBudget' class='org.archive.spring.Sheet'>
|
||||
<property name='map'>
|
||||
<map>
|
||||
<entry key='frontier.balanceReplenishAmount' value='20'/>
|
||||
<entry key='frontier.queueTotalBudget' value='100'/>
|
||||
</map>
|
||||
</property>
|
||||
</bean>
|
||||
|
||||
<!-- veryPolite: any URI to which this sheet's settings are applied
|
||||
will cause its queue to take extra-long politeness snoozes -->
|
||||
<bean id='veryPolite' class='org.archive.spring.Sheet'>
|
||||
<property name='map'>
|
||||
<map>
|
||||
<entry key='disposition.delayFactor' value='10'/>
|
||||
<entry key='disposition.minDelayMs' value='10000'/>
|
||||
<entry key='disposition.maxDelayMs' value='1000000'/>
|
||||
<entry key='disposition.respectCrawlDelayUpToSeconds' value='3600'/>
|
||||
</map>
|
||||
</property>
|
||||
</bean>
|
||||
|
||||
<!-- highPrecedence: any URI to which this sheet's settings are applied
|
||||
will give its containing queue a slightly-higher than default
|
||||
queue precedence value. That queue will then be preferred over
|
||||
other queues for active crawling, never waiting behind lower-
|
||||
precedence queues. -->
|
||||
<!-- NOTE(review): the two entries below are identical to those of the
smallBudget sheet (balanceReplenishAmount and queueTotalBudget) and none of
them sets a precedence value, so this sheet does not do what its description
says. The intended precedence setting needs to be confirmed before this
sheet is used. -->
<bean id='highPrecedence' class='org.archive.spring.Sheet'>
|
||||
<property name='map'>
|
||||
<map>
|
||||
<entry key='frontier.balanceReplenishAmount' value='20'/>
|
||||
<entry key='frontier.queueTotalBudget' value='100'/>
|
||||
</map>
|
||||
</property>
|
||||
</bean>
|
||||
|
||||
<!--
|
||||
EXAMPLE SETTINGS OVERLAY SHEET-ASSOCIATION
|
||||
A SheetAssociation says certain URIs should have certain overlay Sheets
|
||||
applied. This example applies two sheets to URIs matching two SURT-prefixes.
|
||||
New associations may also be added mid-crawl using the scripting facility.
|
||||
-->
|
||||
|
||||
<!--
|
||||
<bean class='org.archive.crawler.spring.SurtPrefixesSheetAssociation'>
|
||||
<property name='surtPrefixes'>
|
||||
<list>
|
||||
<value>http://(org,example,</value>
|
||||
<value>http://(com,example,www,)/</value>
|
||||
</list>
|
||||
</property>
|
||||
<property name='targetSheetNames'>
|
||||
<list>
|
||||
<value>veryPolite</value>
|
||||
<value>smallBudget</value>
|
||||
</list>
|
||||
</property>
|
||||
</bean>
|
||||
-->
|
||||
|
||||
<!--
|
||||
OPTIONAL BUT RECOMMENDED BEANS
|
||||
-->
|
||||
|
||||
<!-- ACTIONDIRECTORY: disk directory for mid-crawl operations
|
||||
A running job watches this directory for new files with URIs,
|
||||
scripts, and other data to be processed during a crawl. -->
|
||||
<bean id="actionDirectory" class="org.archive.crawler.framework.ActionDirectory">
|
||||
<!-- <property name="actionDir" value="action" /> -->
|
||||
<!-- <property name="doneDir" value="${launchId}/actions-done" /> -->
|
||||
<!-- <property name="initialDelaySeconds" value="10" /> -->
|
||||
<!-- <property name="delaySeconds" value="30" /> -->
|
||||
</bean>
|
||||
|
||||
<!-- CRAWLLIMITENFORCER: stops crawl when it reaches configured limits -->
|
||||
<bean id="crawlLimiter" class="org.archive.crawler.framework.CrawlLimitEnforcer">
|
||||
<!-- <property name="maxBytesDownload" value="0" /> -->
|
||||
<!-- <property name="maxDocumentsDownload" value="0" /> -->
|
||||
<!-- <property name="maxTimeSeconds" value="0" /> -->
|
||||
</bean>
|
||||
|
||||
<!-- CHECKPOINTSERVICE: checkpointing assistance -->
|
||||
<bean id="checkpointService"
|
||||
class="org.archive.crawler.framework.CheckpointService">
|
||||
<!-- <property name="checkpointIntervalMinutes" value="-1"/> -->
|
||||
<!-- <property name="checkpointsDir" value="checkpoints"/> -->
|
||||
</bean>
|
||||
|
||||
<!--
|
||||
OPTIONAL BEANS
|
||||
Uncomment and expand as needed, or if non-default alternate
|
||||
implementations are preferred.
|
||||
-->
|
||||
|
||||
<!-- CANONICALIZATION POLICY -->
|
||||
<!--
|
||||
<bean id="canonicalizationPolicy"
|
||||
class="org.archive.modules.canonicalize.RulesCanonicalizationPolicy">
|
||||
<property name="rules">
|
||||
<list>
|
||||
<bean class="org.archive.modules.canonicalize.LowercaseRule" />
|
||||
<bean class="org.archive.modules.canonicalize.StripUserinfoRule" />
|
||||
<bean class="org.archive.modules.canonicalize.StripWWWNRule" />
|
||||
<bean class="org.archive.modules.canonicalize.StripSessionIDs" />
|
||||
<bean class="org.archive.modules.canonicalize.StripSessionCFIDs" />
|
||||
<bean class="org.archive.modules.canonicalize.FixupQueryString" />
|
||||
</list>
|
||||
</property>
|
||||
</bean>
|
||||
-->
|
||||
|
||||
|
||||
<!-- QUEUE ASSIGNMENT POLICY -->
|
||||
<!--
|
||||
<bean id="queueAssignmentPolicy"
|
||||
class="org.archive.crawler.frontier.SurtAuthorityQueueAssignmentPolicy">
|
||||
<property name="forceQueueAssignment" value="" />
|
||||
<property name="deferToPrevious" value="true" />
|
||||
<property name="parallelQueues" value="1" />
|
||||
</bean>
|
||||
-->
|
||||
|
||||
<!-- URI PRECEDENCE POLICY -->
|
||||
<!--
|
||||
<bean id="uriPrecedencePolicy"
|
||||
class="org.archive.crawler.frontier.precedence.CostUriPrecedencePolicy">
|
||||
</bean>
|
||||
-->
|
||||
|
||||
<!-- COST ASSIGNMENT POLICY -->
|
||||
<!--
|
||||
<bean id="costAssignmentPolicy"
|
||||
class="org.archive.crawler.frontier.UnitCostAssignmentPolicy">
|
||||
</bean>
|
||||
-->
|
||||
|
||||
<!-- CREDENTIAL STORE: HTTP authentication or FORM POST credentials -->
|
||||
<!--
|
||||
<bean id="credentialStore"
|
||||
class="org.archive.modules.credential.CredentialStore">
|
||||
</bean>
|
||||
-->
|
||||
|
||||
<!-- DISK SPACE MONITOR:
|
||||
Pauses the crawl if disk space at monitored paths falls below minimum threshold -->
|
||||
<!--
|
||||
<bean id="diskSpaceMonitor" class="org.archive.crawler.monitor.DiskSpaceMonitor">
|
||||
<property name="pauseThresholdMiB" value="500" />
|
||||
<property name="monitorConfigPaths" value="true" />
|
||||
<property name="monitorPaths">
|
||||
<list>
|
||||
<value>PATH</value>
|
||||
</list>
|
||||
</property>
|
||||
</bean>
|
||||
-->
|
||||
|
||||
<!--
|
||||
REQUIRED STANDARD BEANS
|
||||
It will be very rare to replace or reconfigure the following beans.
|
||||
-->
|
||||
|
||||
<!-- STATISTICSTRACKER: standard stats/reporting collector -->
|
||||
<bean id="statisticsTracker"
|
||||
class="org.archive.crawler.reporting.StatisticsTracker" autowire="byName">
|
||||
<!-- <property name="reports">
|
||||
<list>
|
||||
<bean id="crawlSummaryReport" class="org.archive.crawler.reporting.CrawlSummaryReport" />
|
||||
<bean id="seedsReport" class="org.archive.crawler.reporting.SeedsReport" />
|
||||
<bean id="hostsReport" class="org.archive.crawler.reporting.HostsReport" />
|
||||
<bean id="sourceTagsReport" class="org.archive.crawler.reporting.SourceTagsReport" />
|
||||
<bean id="mimetypesReport" class="org.archive.crawler.reporting.MimetypesReport" />
|
||||
<bean id="responseCodeReport" class="org.archive.crawler.reporting.ResponseCodeReport" />
|
||||
<bean id="processorsReport" class="org.archive.crawler.reporting.ProcessorsReport" />
|
||||
<bean id="frontierSummaryReport" class="org.archive.crawler.reporting.FrontierSummaryReport" />
|
||||
<bean id="frontierNonemptyReport" class="org.archive.crawler.reporting.FrontierNonemptyReport" />
|
||||
<bean id="toeThreadsReport" class="org.archive.crawler.reporting.ToeThreadsReport" />
|
||||
</list>
|
||||
</property> -->
|
||||
<!-- <property name="reportsDir" value="${launchId}/reports" /> -->
|
||||
<!-- <property name="liveHostReportSize" value="20" /> -->
|
||||
<!-- <property name="intervalSeconds" value="20" /> -->
|
||||
<!-- <property name="keepSnapshotsCount" value="5" /> -->
|
||||
<!-- <property name="liveHostReportSize" value="20" /> -->
|
||||
</bean>
|
||||
|
||||
<!-- CRAWLERLOGGERMODULE: shared logging facility -->
|
||||
<bean id="loggerModule"
|
||||
class="org.archive.crawler.reporting.CrawlerLoggerModule">
|
||||
<!-- <property name="path" value="${launchId}/logs" /> -->
|
||||
<!-- <property name="crawlLogPath" value="crawl.log" /> -->
|
||||
<!-- <property name="alertsLogPath" value="alerts.log" /> -->
|
||||
<!-- <property name="progressLogPath" value="progress-statistics.log" /> -->
|
||||
<!-- <property name="uriErrorsLogPath" value="uri-errors.log" /> -->
|
||||
<!-- <property name="runtimeErrorsLogPath" value="runtime-errors.log" /> -->
|
||||
<!-- <property name="nonfatalErrorsLogPath" value="nonfatal-errors.log" /> -->
|
||||
<!-- <property name="logExtraInfo" value="false" /> -->
|
||||
</bean>
|
||||
|
||||
<!-- SHEETOVERLAYSMANAGER: manager of sheets of contextual overlays
|
||||
Autowired to include any SheetForSurtPrefix or
|
||||
SheetForDecideRuled beans -->
|
||||
<bean id="sheetOverlaysManager" autowire="byType"
|
||||
class="org.archive.crawler.spring.SheetOverlaysManager">
|
||||
</bean>
|
||||
|
||||
<!-- BDBMODULE: shared BDB-JE disk persistence manager -->
|
||||
<bean id="bdb"
|
||||
class="org.archive.bdb.BdbModule">
|
||||
<!-- <property name="dir" value="state" /> -->
|
||||
<!-- <property name="cachePercent" value="60" /> -->
|
||||
<!-- <property name="useSharedCache" value="true" /> -->
|
||||
<!-- <property name="expectedConcurrency" value="25" /> -->
|
||||
</bean>
|
||||
|
||||
<!-- BDBCOOKIESTORAGE: disk-based cookie storage for FetchHTTP -->
|
||||
<bean id="cookieStorage"
|
||||
class="org.archive.modules.fetcher.BdbCookieStorage">
|
||||
<!-- <property name="cookiesLoadFile"><null/></property> -->
|
||||
<!-- <property name="cookiesSaveFile"><null/></property> -->
|
||||
<!-- <property name="bdb">
|
||||
<ref bean="bdb"/>
|
||||
</property> -->
|
||||
</bean>
|
||||
|
||||
<!-- SERVERCACHE: shared cache of server/host info -->
|
||||
<bean id="serverCache"
|
||||
class="org.archive.modules.net.BdbServerCache">
|
||||
<!-- <property name="bdb">
|
||||
<ref bean="bdb"/>
|
||||
</property> -->
|
||||
</bean>
|
||||
|
||||
<!-- CONFIG PATH CONFIGURER: required helper making crawl paths relative
|
||||
to crawler-beans.cxml file, and tracking crawl files for web UI -->
|
||||
<bean id="configPathConfigurer"
|
||||
class="org.archive.spring.ConfigPathConfigurer">
|
||||
</bean>
|
||||
|
||||
</beans>
|
|
@ -1,20 +0,0 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="2.0">
<!-- Copies the document crawler-beans-template.cxml, replacing every url
element found in it with the url attribute of the root action element of
the input document. -->
|
||||
|
||||
<!-- Root action element of the input document. It is captured in a global
variable because the templates below run against the template document
loaded with doc(), where the input document is no longer the context. -->
<xsl:variable name="action" select="/action"/>
|
||||
|
||||
<!-- Entry point: ignore the input tree and process the root element of the
template document instead. -->
<xsl:template match="/">
|
||||
<xsl:apply-templates select="doc('crawler-beans-template.cxml')/*"/>
|
||||
</xsl:template>
|
||||
|
||||
<!-- Identity template: copy every attribute and node unchanged, recursing
into children so that more specific templates can override the copy. -->
<xsl:template match="@* | node()">
|
||||
<xsl:copy>
|
||||
<xsl:apply-templates select="@* | node()"/>
|
||||
</xsl:copy>
|
||||
</xsl:template>
|
||||
|
||||
<!-- Placeholder substitution: a url element is not copied; the value of the
action's url attribute is written in its place. -->
<xsl:template match="url">
|
||||
<xsl:value-of select="$action/@url"/>
|
||||
</xsl:template>
|
||||
|
||||
</xsl:stylesheet>
|
|
@ -1,223 +0,0 @@
|
|||
|
||||
<!--
|
||||
|
||||
Check whether a Heritrix job has finished: if so, queue the packaging of its WARC archive; otherwise schedule another check one minute later
|
||||
|
||||
-->
|
||||
|
||||
<p:config xmlns:p="http://www.orbeon.com/oxf/pipeline" xmlns:oxf="http://www.orbeon.com/oxf/processors" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:xforms="http://www.w3.org/2002/xforms"
|
||||
xmlns:xxforms="http://orbeon.org/oxf/xml/xforms" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:exist="http://exist.sourceforge.net/NS/exist" xmlns:saxon="http://saxon.sf.net/"
|
||||
xmlns:pipeline="java:org.orbeon.oxf.processor.pipeline.PipelineFunctionLibrary">
|
||||
|
||||
<p:param name="data" type="input"/>
|
||||
|
||||
|
||||
<!-- Get the job -->
|
||||
<p:processor name="oxf:xforms-submission">
<!-- Sends an HTTP GET to the URL held in the heritrix-job-url attribute of
the input action. The user name and password are read from oxf:/config.xml,
preemptive authentication is switched off, and the Accept header is replaced
so that the response is requested as application/xml. -->
|
||||
<p:input name="submission" transform="oxf:xslt" href="#data">
|
||||
<xforms:submission xsl:version="2.0" method="get" action="{/action/@heritrix-job-url}" xxforms:username="{doc('oxf:/config.xml')/config/heritrix/username}"
|
||||
xxforms:password="{doc('oxf:/config.xml')/config/heritrix/password}" xxforms:preemptive-authentication="false">
|
||||
<xforms:header combine="replace">
|
||||
<xforms:name>Accept</xforms:name>
|
||||
<xforms:value>application/xml</xforms:value>
|
||||
</xforms:header>
|
||||
</xforms:submission>
|
||||
</p:input>
|
||||
<!-- The request instance is empty: the GET carries no data. -->
<p:input name="request">
|
||||
<instance/>
|
||||
</p:input>
|
||||
<!-- The response is published as #heritrix-job; the p:choose that follows
tests its crawlControllerState. -->
<p:output name="response" id="heritrix-job" debug="heritrix-job"/>
|
||||
</p:processor>
|
||||
|
||||
<p:choose href="#heritrix-job">
|
||||
<p:when test="/job/crawlControllerState='FINISHED'">
|
||||
<!-- The job is finished, we can get its archive... -->
|
||||
<!-- Scan the directory to find the name of the WARC file -->
|
||||
<p:processor name="oxf:url-generator">
|
||||
<p:input name="config" transform="oxf:xslt" href="#heritrix-job">
|
||||
<config xsl:version="2.0">
|
||||
<url>
|
||||
<xsl:value-of select="/job/configFiles/value[key='warcWriter.storePaths[0]']/url"/>
|
||||
</url>
|
||||
<authentication>
|
||||
<username>
|
||||
<xsl:value-of select="doc('oxf:/config.xml')/config/heritrix/username"/>
|
||||
</username>
|
||||
<password>
|
||||
<xsl:value-of select="doc('oxf:/config.xml')/config/heritrix/password"/>
|
||||
</password>
|
||||
<preemptive>false</preemptive>
|
||||
</authentication>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:output name="data" id="warc-dir-list" debug="warc-dir-list"/>
|
||||
</p:processor>
|
||||
<!-- Next action: package -->
|
||||
<p:processor name="oxf:pipeline">
<!-- Runs an XQuery update on queue.xml through /data-access.xpl: a new
action of type package-heritrix-warc is inserted into the queue and the
action being processed (matched by its uuid) is deleted. -->
|
||||
<p:input name="config" href="/data-access.xpl"/>
|
||||
<!-- The query parameters are computed from the aggregate of #data,
#warc-dir-list and #heritrix-job. Most are copied from the attributes of the
input action. warc-url is the href of the first a element under
/root/html/body whose text ends with '.warc'; log-url is the url of the first
configFiles value whose key is loggerModule.crawlLogPath. -->
<p:input name="data" transform="oxf:xslt" href="aggregate('root', #data, #warc-dir-list, #heritrix-job)">
|
||||
<config xsl:version="2.0">
|
||||
<relpath>queue.xml</relpath>
|
||||
<operation>write</operation>
|
||||
<type>xquery</type>
|
||||
<parameter name="uuid" type="string">
|
||||
<xsl:value-of select="/root/action/@uuid"/>
|
||||
</parameter>
|
||||
<parameter name="url" type="string">
|
||||
<xsl:value-of select="/root/action/@url"/>
|
||||
</parameter>
|
||||
<parameter name="directory" type="string">
|
||||
<xsl:value-of select="/root/action/@directory"/>
|
||||
</parameter>
|
||||
<parameter name="heritrix-job-url" type="string">
|
||||
<xsl:value-of select="/root/action/@heritrix-job-url"/>
|
||||
</parameter>
|
||||
<parameter name="priority" type="string">
|
||||
<xsl:value-of select="/root/action/@priority"/>
|
||||
</parameter>
|
||||
<parameter name="warc-url" type="string">
|
||||
<xsl:value-of select="/root/html/body/a[ends-with(., '.warc')][1]/@href"/>
|
||||
</parameter>
|
||||
<parameter name="log-url" type="string">
|
||||
<xsl:value-of select="/root/job/configFiles/value[key='loggerModule.crawlLogPath'][1]/url"/>
|
||||
</parameter>
|
||||
</config>
|
||||
</p:input>
|
||||
<!-- The new action receives a fresh uuid from util:uuid() and keeps the
priority, url, directory and heritrix-job-url of the current action.
NOTE(review): the $(name) placeholders are presumably replaced with the
parameters declared above by data-access.xpl, which is not visible here;
confirm how values are quoted and escaped. -->
<p:input name="param">
|
||||
<xquery><![CDATA[
|
||||
declare namespace util = "http://exist-db.org/xquery/util";
|
||||
|
||||
for $q in /queue return
|
||||
update
|
||||
insert <action priority=$(priority) uuid="{util:uuid()}" type="package-heritrix-warc" url=$(url) directory=$(directory) heritrix-job-url=$(heritrix-job-url) warc-url=$(warc-url) log-url=$(log-url)/>
|
||||
into $q,
|
||||
|
||||
for $a in /queue/action where $a/@uuid = $(uuid) return
|
||||
update
|
||||
delete $a
|
||||
|
||||
]]></xquery>
|
||||
</p:input>
|
||||
<p:output name="data" id="response" debug="response"/>
|
||||
</p:processor>
|
||||
<p:processor name="oxf:null-serializer">
|
||||
<p:input name="data" href="#response"/>
|
||||
</p:processor>
|
||||
</p:when>
|
||||
<p:otherwise>
|
||||
<!-- The job is not finished yet, we'll check later on... -->
|
||||
<p:processor name="oxf:pipeline">
<!-- Reschedules the current action: runs an XQuery update on queue.xml
through /data-access.xpl that changes the after attribute of the action
whose uuid matches the input action. -->
|
||||
<p:input name="config" href="/data-access.xpl"/>
|
||||
<!-- Two query parameters are computed from the input action: its uuid, and
next-time, which is the current date and time plus one minute (PT1M). -->
<p:input name="data" transform="oxf:xslt" href="#data">
|
||||
<config xsl:version="2.0">
|
||||
<relpath>queue.xml</relpath>
|
||||
<operation>write</operation>
|
||||
<type>xquery</type>
|
||||
<parameter name="uuid" type="string">
|
||||
<xsl:value-of select="/action/@uuid"/>
|
||||
</parameter>
|
||||
<parameter name="next-time" type="string">
|
||||
<xsl:value-of select="current-dateTime() + xs:dayTimeDuration('PT1M')"/>
|
||||
</parameter>
|
||||
</config>
|
||||
</p:input>
|
||||
<!-- NOTE(review): the $(name) placeholders are presumably replaced with the
parameters declared above by data-access.xpl, which is not visible here;
confirm. -->
<p:input name="param">
|
||||
<xquery><![CDATA[
|
||||
for $a in /queue/action where $a/@uuid = $(uuid) return
|
||||
update value $a/@after with $(next-time)
|
||||
|
||||
]]></xquery>
|
||||
</p:input>
|
||||
<p:output name="data" id="response" debug="response"/>
|
||||
</p:processor>
|
||||
<p:processor name="oxf:null-serializer">
|
||||
<p:input name="data" href="#response"/>
|
||||
</p:processor>
|
||||
</p:otherwise>
|
||||
</p:choose>
|
||||
|
||||
|
||||
<!-- <p:processor name="oxf:unsafe-xslt">
|
||||
<p:input name="data" href="aggregate('root', #data, #heritrix-engine, #heritrix-unpaused)"/>
|
||||
<p:input name="config">
|
||||
<config xsl:version="2.0">
|
||||
<relpath>queue.xml</relpath>
|
||||
<operation>write</operation>
|
||||
<type>xquery</type>
|
||||
<parameter name="directory" type="string">
|
||||
<xsl:value-of select="translate(/root/action/@uuid, '-', '/')"/>
|
||||
<xsl:text>/</xsl:text>
|
||||
</parameter>
|
||||
<parameter name="uuid" type="string">
|
||||
<xsl:value-of select="/root/action/@uuid"/>
|
||||
</parameter>
|
||||
<parameter name="url" type="string">
|
||||
<xsl:value-of select="/root/action/@url"/>
|
||||
</parameter>
|
||||
<parameter name="priority-warc" type="string">
|
||||
<xsl:value-of select="/root/action/@priority + 1"/>
|
||||
</parameter>
|
||||
<parameter name="next-time" type="string">
|
||||
<xsl:value-of select="current-dateTime() + xs:dayTimeDuration('PT1M')"/>
|
||||
</parameter>
|
||||
<parameter name="heritrix-job-url" type="string">
|
||||
<xsl:value-of select="/root/engine/jobs/value[shortName=/root/action/@uuid]/url"/>
|
||||
</parameter>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:output name="data" id="data-access-data"/>
|
||||
</p:processor>
|
||||
|
||||
|
||||
<p:processor name="oxf:pipeline">
|
||||
<p:input name="config" href="/data-access.xpl"/>
|
||||
<p:input name="data" transform="oxf:xslt" href="#data-access-data">
|
||||
<config xsl:version="2.0">
|
||||
<relpath>
|
||||
<xsl:value-of select="/config/parameter[@name='directory']"/>
|
||||
<xsl:text>index.xml</xsl:text>
|
||||
</relpath>
|
||||
<operation>write</operation>
|
||||
<type>document</type>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:input name="param" transform="oxf:xslt" href="#data-access-data">
|
||||
<archive-set xsl:version="2.0" url="{/config/parameter[@name='url']}" uuid="{/config/parameter[@name='uuid']}">
|
||||
<heritrix-job url="{/config/parameter[@name='heritrix-job-url']}"/>
|
||||
</archive-set>
|
||||
</p:input>
|
||||
<p:output name="data" id="response2" debug="response2"/>
|
||||
</p:processor>
|
||||
|
||||
<p:processor name="oxf:null-serializer">
|
||||
<p:input name="data" href="#response2"/>
|
||||
</p:processor>
|
||||
|
||||
|
||||
<p:processor name="oxf:pipeline">
|
||||
<p:input name="config" href="/data-access.xpl"/>
|
||||
<p:input name="data" href="#data-access-data"/>
|
||||
<p:input name="param">
|
||||
<xquery><![CDATA[
|
||||
declare namespace util = "http://exist-db.org/xquery/util";
|
||||
|
||||
for $q in /queue return
|
||||
update
|
||||
insert <action priority=$(priority-warc) uuid="{util:uuid()}" type="get-heritrix-warc" url=$(url) directory=$(directory) heritrix-job-url=$(heritrix-job-url) after=$(next-time)/>
|
||||
into $q,
|
||||
|
||||
for $a in /queue/action where $a/@uuid = $(uuid) return
|
||||
update
|
||||
delete $a
|
||||
|
||||
]]></xquery>
|
||||
</p:input>
|
||||
<p:output name="data" id="response" debug="response"/>
|
||||
</p:processor>
|
||||
|
||||
<p:processor name="oxf:null-serializer">
|
||||
<p:input name="data" href="#response"/>
|
||||
</p:processor>-->
|
||||
|
||||
|
||||
</p:config>
|
|
@ -1,196 +0,0 @@
|
|||
|
||||
<!--
|
||||
|
||||
Create a new archive through Heritrix
|
||||
|
||||
-->
|
||||
|
||||
<p:config xmlns:p="http://www.orbeon.com/oxf/pipeline" xmlns:oxf="http://www.orbeon.com/oxf/processors" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:xforms="http://www.w3.org/2002/xforms"
|
||||
xmlns:xxforms="http://orbeon.org/oxf/xml/xforms" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:exist="http://exist.sourceforge.net/NS/exist" xmlns:saxon="http://saxon.sf.net/"
|
||||
xmlns:pipeline="java:org.orbeon.oxf.processor.pipeline.PipelineFunctionLibrary">
|
||||
|
||||
<p:param name="data" type="input"/>
|
||||
|
||||
|
||||
<!-- Create a new Heritrix job-->
|
||||
<p:processor name="oxf:xforms-submission">
|
||||
<p:input name="submission" transform="oxf:xslt" href="oxf:/config.xml">
|
||||
<xforms:submission xsl:version="2.0" method="urlencoded-post" action="{/config/heritrix/rest-api}" xxforms:username="{/config/heritrix/username}"
|
||||
xxforms:password="{/config/heritrix/password}" xxforms:preemptive-authentication="false">
|
||||
<xforms:header combine="replace">
|
||||
<xforms:name>Accept</xforms:name>
|
||||
<xforms:value>application/xml</xforms:value>
|
||||
</xforms:header>
|
||||
</xforms:submission>
|
||||
</p:input>
|
||||
<p:input name="request" transform="oxf:xslt" href="#data">
|
||||
<instance xsl:version="2.0">
|
||||
<action>create</action>
|
||||
<createpath>
|
||||
<xsl:value-of select="/action/@uuid"/>
|
||||
</createpath>
|
||||
</instance>
|
||||
</p:input>
|
||||
<p:output name="response" id="heritrix-engine" debug="heritrix-engine"/>
|
||||
</p:processor>
|
||||
|
||||
<!-- Create a job configuration -->
|
||||
<p:processor name="oxf:xslt">
|
||||
<p:input name="data" href="#data"/>
|
||||
<p:input name="config" href="cxml.xslt"/>
|
||||
<p:output name="data" id="cxml"/>
|
||||
</p:processor>
|
||||
|
||||
<!-- Upload the job configuration -->
|
||||
<p:processor name="oxf:xforms-submission">
|
||||
<p:input name="submission" transform="oxf:xslt" href="aggregate('root', #data, #heritrix-engine)">
|
||||
<xforms:submission xsl:version="2.0" method="put" action="{/root/engine/jobs/value[shortName=/root/action/@uuid]/primaryConfigUrl}"
|
||||
xxforms:username="{doc('oxf:/config.xml')/config/heritrix/username}" xxforms:password="{doc('oxf:/config.xml')/config/heritrix/password}" xxforms:preemptive-authentication="false"/>
|
||||
</p:input>
|
||||
<p:input name="request" href="#cxml"/>
|
||||
<p:output name="response" id="cxml-response" debug="cxml-response"/>
|
||||
</p:processor>
|
||||
|
||||
|
||||
<!-- Build the job -->
|
||||
<p:processor name="oxf:xforms-submission">
|
||||
<p:input name="submission" transform="oxf:xslt" href="aggregate('root', #data, #heritrix-engine, #cxml-response)">
|
||||
<xforms:submission xsl:version="2.0" method="urlencoded-post" action="{/root/engine/jobs/value[shortName=/root/action/@uuid]/url}"
|
||||
xxforms:username="{doc('oxf:/config.xml')/config/heritrix/username}" xxforms:password="{doc('oxf:/config.xml')/config/heritrix/password}" xxforms:preemptive-authentication="false">
|
||||
<xforms:header combine="replace">
|
||||
<xforms:name>Accept</xforms:name>
|
||||
<xforms:value>application/xml</xforms:value>
|
||||
</xforms:header>
|
||||
</xforms:submission>
|
||||
</p:input>
|
||||
<p:input name="request" transform="oxf:xslt" href="#data">
|
||||
<instance xsl:version="2.0">
|
||||
<action>build</action>
|
||||
</instance>
|
||||
</p:input>
|
||||
<p:output name="response" id="heritrix-built" debug="heritrix-built"/>
|
||||
</p:processor>
|
||||
|
||||
<!-- Launch the job -->
|
||||
<p:processor name="oxf:xforms-submission">
|
||||
<p:input name="submission" transform="oxf:xslt" href="aggregate('root', #data, #heritrix-engine, #heritrix-built)">
|
||||
<xforms:submission xsl:version="2.0" method="urlencoded-post" action="{/root/engine/jobs/value[shortName=/root/action/@uuid]/url}"
|
||||
xxforms:username="{doc('oxf:/config.xml')/config/heritrix/username}" xxforms:password="{doc('oxf:/config.xml')/config/heritrix/password}" xxforms:preemptive-authentication="false">
|
||||
<xforms:header combine="replace">
|
||||
<xforms:name>Accept</xforms:name>
|
||||
<xforms:value>application/xml</xforms:value>
|
||||
</xforms:header>
|
||||
</xforms:submission>
|
||||
</p:input>
|
||||
<p:input name="request" transform="oxf:xslt" href="#data">
|
||||
<instance xsl:version="2.0">
|
||||
<action>launch</action>
|
||||
</instance>
|
||||
</p:input>
|
||||
<p:output name="response" id="heritrix-launched" debug="heritrix-launched"/>
|
||||
</p:processor>
|
||||
|
||||
<!-- Unpause the job -->
|
||||
<p:processor name="oxf:xforms-submission">
|
||||
<p:input name="submission" transform="oxf:xslt" href="aggregate('root', #data, #heritrix-engine, #heritrix-launched)">
|
||||
<xforms:submission xsl:version="2.0" method="urlencoded-post" action="{/root/engine/jobs/value[shortName=/root/action/@uuid]/url}"
|
||||
xxforms:username="{doc('oxf:/config.xml')/config/heritrix/username}" xxforms:password="{doc('oxf:/config.xml')/config/heritrix/password}" xxforms:preemptive-authentication="false">
|
||||
<xforms:header combine="replace">
|
||||
<xforms:name>Accept</xforms:name>
|
||||
<xforms:value>application/xml</xforms:value>
|
||||
</xforms:header>
|
||||
</xforms:submission>
|
||||
</p:input>
|
||||
<p:input name="request" transform="oxf:xslt" href="#data">
|
||||
<instance xsl:version="2.0">
|
||||
<action>unpause</action>
|
||||
</instance>
|
||||
</p:input>
|
||||
<p:output name="response" id="heritrix-unpaused" debug="heritrix-unpaused"/>
|
||||
</p:processor>
|
||||
|
||||
|
||||
<p:processor name="oxf:unsafe-xslt">
|
||||
<p:input name="data" href="aggregate('root', #data, #heritrix-engine, #heritrix-unpaused)"/>
|
||||
<p:input name="config">
|
||||
<config xsl:version="2.0">
|
||||
<relpath>queue.xml</relpath>
|
||||
<operation>write</operation>
|
||||
<type>xquery</type>
|
||||
<parameter name="directory" type="string">
|
||||
<xsl:value-of select="translate(/root/action/@uuid, '-', '/')"/>
|
||||
<xsl:text>/</xsl:text>
|
||||
</parameter>
|
||||
<parameter name="uuid" type="string">
|
||||
<xsl:value-of select="/root/action/@uuid"/>
|
||||
</parameter>
|
||||
<parameter name="url" type="string">
|
||||
<xsl:value-of select="/root/action/@url"/>
|
||||
</parameter>
|
||||
<parameter name="priority-warc" type="string">
|
||||
<xsl:value-of select="/root/action/@priority + 1"/>
|
||||
</parameter>
|
||||
<parameter name="next-time" type="string">
|
||||
<xsl:value-of select="current-dateTime() + xs:dayTimeDuration('PT1M')"/>
|
||||
</parameter>
|
||||
<parameter name="heritrix-job-url" type="string">
|
||||
<xsl:value-of select="/root/engine/jobs/value[shortName=/root/action/@uuid]/url"/>
|
||||
</parameter>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:output name="data" id="data-access-data"/>
|
||||
</p:processor>
|
||||
|
||||
|
||||
<p:processor name="oxf:pipeline">
|
||||
<p:input name="config" href="/data-access.xpl"/>
|
||||
<p:input name="data" transform="oxf:xslt" href="#data-access-data">
|
||||
<config xsl:version="2.0">
|
||||
<relpath>
|
||||
<xsl:value-of select="/config/parameter[@name='directory']"/>
|
||||
<xsl:text>index.xml</xsl:text>
|
||||
</relpath>
|
||||
<operation>write</operation>
|
||||
<type>document</type>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:input name="param" transform="oxf:xslt" href="#data-access-data">
|
||||
<archive-set xsl:version="2.0" url="{/config/parameter[@name='url']}" uuid="{/config/parameter[@name='uuid']}">
|
||||
<heritrix-job url="{/config/parameter[@name='heritrix-job-url']}"/>
|
||||
</archive-set>
|
||||
</p:input>
|
||||
<p:output name="data" id="response2" debug="response2"/>
|
||||
</p:processor>
|
||||
|
||||
<p:processor name="oxf:null-serializer">
|
||||
<p:input name="data" href="#response2"/>
|
||||
</p:processor>
|
||||
|
||||
|
||||
<p:processor name="oxf:pipeline">
|
||||
<p:input name="config" href="/data-access.xpl"/>
|
||||
<p:input name="data" href="#data-access-data"/>
|
||||
<p:input name="param">
|
||||
<xquery><![CDATA[
|
||||
declare namespace util = "http://exist-db.org/xquery/util";
|
||||
|
||||
for $q in /queue return
|
||||
update
|
||||
insert <action priority=$(priority-warc) uuid="{util:uuid()}" type="get-heritrix-warc" url=$(url) directory=$(directory) heritrix-job-url=$(heritrix-job-url) after=$(next-time)/>
|
||||
into $q,
|
||||
|
||||
for $a in /queue/action where $a/@uuid = $(uuid) return
|
||||
update
|
||||
delete $a
|
||||
|
||||
]]></xquery>
|
||||
</p:input>
|
||||
<p:output name="data" id="response" debug="response"/>
|
||||
</p:processor>
|
||||
|
||||
<p:processor name="oxf:null-serializer">
|
||||
<p:input name="data" href="#response"/>
|
||||
</p:processor>
|
||||
|
||||
|
||||
</p:config>
|
|
@ -1,13 +0,0 @@
|
|||
Pipelines in this directory are called by the archive-resource pipeline.
|
||||
|
||||
Each pipeline is named after a media subtype.
|
||||
|
||||
Inputs:
|
||||
|
||||
* archive: the archive
|
||||
|
||||
Outputs:
|
||||
|
||||
* rewritten: the rewritten version of the document
|
||||
* links: the list of rewritten links
|
||||
|
|
@ -1,32 +0,0 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:xd="http://www.oxygenxml.com/ns/doc/xsl" xmlns:owk="http://owark.org/xslt/"
|
||||
exclude-result-prefixes="xs xd owk" version="2.0">
|
||||
<xd:doc scope="stylesheet">
|
||||
<xd:desc>
|
||||
<xd:p><xd:b>Created on:</xd:b> May 4, 2012</xd:p>
|
||||
<xd:p><xd:b>Author:</xd:b> vdv</xd:p>
|
||||
<xd:p>Common functions and template for URL rewriting</xd:p>
|
||||
</xd:desc>
|
||||
</xd:doc>
|
||||
<!-- owk:is-relative: returns true unless the text before the first ':' in $url is exactly 'http' or 'https'.
     As a consequence, URLs using any other scheme (or no scheme at all) are reported as relative. -->
<xsl:function name="owk:is-relative" as="xs:boolean">
|
||||
<xsl:param name="url" as="xs:string"/>
|
||||
<xsl:sequence select="not(substring-before($url, ':') = ('http', 'https'))"/>
|
||||
</xsl:function>
|
||||
<!-- owk:safer-resolve-uri: resolves $relative against $hbase with resolve-uri() only when
     owk:is-relative($relative) is true; otherwise $relative is returned unchanged. -->
<xsl:function name="owk:safer-resolve-uri" as="xs:string">
|
||||
<xsl:param name="relative" as="xs:string"/>
|
||||
<xsl:param name="hbase" as="xs:string"/>
|
||||
<xsl:sequence select="if (owk:is-relative($relative)) then resolve-uri($relative, $hbase) else $relative"/>
|
||||
</xsl:function>
|
||||
<!-- owk:url-rewrite: maps $url to the local name recorded for it in the archive index, when there is one.
     Relies on the stylesheet-level variables $index, $resource and $base.
     Returns the matching resource's local-name (prefixed with '../' when $resource/uri/@seed is 'false'),
     or, when the index has no match, $url resolved against $base (fragment included). -->
<xsl:function name="owk:url-rewrite" as="xs:string">
|
||||
<xsl:param name="url" as="xs:string"/>
|
||||
<!-- Drop everything from the first '#'; the appended '#' makes this work when there is no fragment -->
<xsl:variable name="no-fragment" select="substring-before(concat($url, '#'), '#')"/>
|
||||
<xsl:variable name="abs" select="owk:safer-resolve-uri($no-fragment, $base) cast as xs:string"/>
|
||||
<!-- First index resource whose uri or same-as value equals the absolute URL -->
<xsl:variable name="local-name" select="$index/resource[(for $u in (uri, same-as) return $u cast as xs:string) = $abs][1]/local-name"/>
|
||||
<!-- Trace output: emits the looked-up local name on every call -->
<xsl:message>local-name: <xsl:value-of select="$local-name"/></xsl:message>
|
||||
<xsl:sequence select="if ($local-name) then concat(if ($resource/uri/@seed = 'false') then '../' else '', $local-name) else owk:safer-resolve-uri($url, $base)"/>
|
||||
</xsl:function>
|
||||
|
||||
<xsl:variable name="index" select="doc('input:index')/*"/>
|
||||
<xsl:variable name="resource" select="doc('input:index-entry')/resource"/>
|
||||
<xsl:variable name="base" select="$resource/uri"/>
|
||||
</xsl:stylesheet>
|
|
@ -1,106 +0,0 @@
|
|||
<p:config xmlns:p="http://www.orbeon.com/oxf/pipeline" xmlns:oxf="http://www.orbeon.com/oxf/processors" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:saxon="http://saxon.sf.net/">
|
||||
|
||||
<p:param name="archive" type="input"/>
|
||||
<p:param name="rewritten" type="output"/>
|
||||
<p:param name="links" type="output"/>
|
||||
|
||||
|
||||
<!-- Store the document -->
|
||||
<p:processor name="oxf:file-serializer">
|
||||
<p:input name="config">
|
||||
<config>
|
||||
<scope>session</scope>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:input name="data" href="#archive#xpointer(/archive/response/document)"/>
|
||||
<p:output name="data" id="url-written"/>
|
||||
</p:processor>
|
||||
|
||||
<!-- And read it as CSS -->
|
||||
<p:processor name="oxf:url-generator">
|
||||
<p:input name="config" transform="oxf:xslt" href="#url-written">
|
||||
<config xsl:version="2.0">
|
||||
<url>
|
||||
<xsl:value-of select="/*"/>
|
||||
</url>
|
||||
<content-type>text/css</content-type>
|
||||
<mode>text</mode>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:output name="data" id="css" debug="css"/>
|
||||
</p:processor>
|
||||
|
||||
<!-- Get a list of links to update -->
|
||||
<!-- TODO: support links in inline CSS -->
|
||||
<!-- TODO: support iframes and objects -->
|
||||
<p:processor name="oxf:unsafe-xslt">
|
||||
<p:input name="data" href="#css"/>
|
||||
<p:input name="request" href="#archive#xpointer(/archive/request)"/>
|
||||
<p:input name="config">
|
||||
<xsl:stylesheet version="2.0">
|
||||
<xsl:variable name="base" select="doc('input:request')/request/location"/>
|
||||
<xsl:template match="/">
|
||||
<links>
|
||||
<xsl:variable name="links" as="node()*">
|
||||
<!-- Collect every url(...) target in the CSS text; the double quotes inside the regex attribute
     must be written as &quot; because the attribute itself is delimited by double quotes. -->
<xsl:analyze-string select="document" regex="url\([&quot;']?([^)'&quot;]+)[&quot;']?\)" flags="">
|
||||
<xsl:matching-substring>
|
||||
<link href="{regex-group(1)}"/>
|
||||
</xsl:matching-substring>
|
||||
</xsl:analyze-string>
|
||||
</xsl:variable>
|
||||
<xsl:for-each-group select="$links" group-by="@href">
|
||||
<xsl:variable name="abs-href" select="resolve-uri(@href, $base)"/>
|
||||
<xsl:variable name="tokens" select="tokenize($abs-href, '/')"/>
|
||||
<xsl:variable name="last-token" select="$tokens[last()]"/>
|
||||
<xsl:variable name="tokens2" select="tokenize($last-token, '\.')"/>
|
||||
<xsl:variable name="extension" select="$tokens2[last()]"/>
|
||||
<link abs-href="{$abs-href}" new-href="{saxon:string-to-hexBinary(substring($abs-href, 1, string-length($abs-href) - string-length($extension) - 1), 'utf-8')}.{$extension}"
|
||||
filename="{saxon:string-to-hexBinary($abs-href, 'utf-8')}.xml">
|
||||
<xsl:copy-of select="@*"/>
|
||||
</link>
|
||||
</xsl:for-each-group>
|
||||
</links>
|
||||
</xsl:template>
|
||||
</xsl:stylesheet>
|
||||
</p:input>
|
||||
<p:output name="data" id="links-local" debug="links"/>
|
||||
</p:processor>
|
||||
|
||||
<p:processor name="oxf:identity">
|
||||
<p:input name="data" href="#links-local"/>
|
||||
<p:output name="data" ref="links"/>
|
||||
</p:processor>
|
||||
|
||||
<!-- Update the links -->
|
||||
<p:processor name="oxf:unsafe-xslt">
|
||||
<p:input name="data" href="#css"/>
|
||||
<p:input name="request" href="#archive#xpointer(/archive/request)"/>
|
||||
<p:input name="links" href="#links-local"/>
|
||||
<p:input name="config">
|
||||
<xsl:stylesheet version="2.0">
|
||||
<xsl:variable name="links" select="doc('input:links')/links"/>
|
||||
<xsl:variable name="base" select="doc('input:request')/request/location"/>
|
||||
<xsl:key name="link" match="link" use="@href"/>
|
||||
<xsl:template match="/document">
|
||||
<xsl:copy>
|
||||
<xsl:copy-of select="@*"/>
|
||||
<!-- Replace each url(...) target with the new-href recorded for it in the links input; all other text is
     copied through. The double quotes inside the regex attribute must be written as &quot; because the
     attribute itself is delimited by double quotes. -->
<xsl:analyze-string select="." regex="url\([&quot;']?([^)'&quot;]+)[&quot;']?\)" flags="">
|
||||
<xsl:matching-substring>
|
||||
<xsl:text>url(</xsl:text>
|
||||
<xsl:value-of select="$links/key('link', regex-group(1))/@new-href"/>
|
||||
<xsl:text>)</xsl:text>
|
||||
</xsl:matching-substring>
|
||||
<xsl:non-matching-substring>
|
||||
<xsl:copy-of select="."/>
|
||||
</xsl:non-matching-substring>
|
||||
</xsl:analyze-string>
|
||||
</xsl:copy>
|
||||
</xsl:template>
|
||||
</xsl:stylesheet>
|
||||
</p:input>
|
||||
<p:output name="data" ref="rewritten" debug="rewritten"/>
|
||||
</p:processor>
|
||||
|
||||
|
||||
|
||||
</p:config>
|
|
@ -1,123 +0,0 @@
|
|||
<p:config xmlns:p="http://www.orbeon.com/oxf/pipeline" xmlns:oxf="http://www.orbeon.com/oxf/processors" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:saxon="http://saxon.sf.net/">
|
||||
|
||||
<p:param name="archive" type="input"/>
|
||||
<p:param name="rewritten" type="output"/>
|
||||
<p:param name="links" type="output"/>
|
||||
|
||||
|
||||
<!-- Store the document -->
|
||||
<p:processor name="oxf:file-serializer">
|
||||
<p:input name="config">
|
||||
<config>
|
||||
<scope>session</scope>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:input name="data" href="#archive#xpointer(/archive/response/document)"/>
|
||||
<p:output name="data" id="url-written"/>
|
||||
</p:processor>
|
||||
|
||||
<!-- And read it as HTML -->
|
||||
<p:processor name="oxf:url-generator">
|
||||
<p:input name="config" transform="oxf:xslt" href="#url-written">
|
||||
<config xsl:version="2.0">
|
||||
<url>
|
||||
<xsl:value-of select="/*"/>
|
||||
</url>
|
||||
<mode>html</mode>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:output name="data" id="html" debug="html"/>
|
||||
</p:processor>
|
||||
|
||||
<!-- Get a list of links to update -->
|
||||
<!-- TODO: support links in inline CSS -->
|
||||
<!-- TODO: support iframes and objects -->
|
||||
<p:processor name="oxf:unsafe-xslt">
|
||||
<p:input name="data" href="#html"/>
|
||||
<p:input name="request" href="#archive#xpointer(/archive/request)"/>
|
||||
<p:input name="config">
|
||||
<xsl:stylesheet version="2.0">
|
||||
<xsl:variable name="base" select="doc('input:request')/request/location"/>
|
||||
<xsl:template match="/">
|
||||
<links>
|
||||
<xsl:variable name="links" as="node()*">
|
||||
<xsl:apply-templates/>
|
||||
</xsl:variable>
|
||||
<xsl:for-each-group select="$links" group-by="@href">
|
||||
<xsl:variable name="abs-href" select="resolve-uri(@href, $base)"/>
|
||||
<xsl:variable name="tokens" select="tokenize($abs-href, '/')"/>
|
||||
<xsl:variable name="last-token" select="$tokens[last()]"/>
|
||||
<xsl:variable name="tokens2" select="tokenize($last-token, '\.')"/>
|
||||
<xsl:variable name="extension" select="$tokens2[last()]"/>
|
||||
<link abs-href="{$abs-href}" new-href="{saxon:string-to-hexBinary(substring($abs-href, 1, string-length($abs-href) - string-length($extension) - 1), 'utf-8')}.{$extension}"
|
||||
filename="{saxon:string-to-hexBinary($abs-href, 'utf-8')}.xml">
|
||||
<xsl:copy-of select="@*"/>
|
||||
</link>
|
||||
</xsl:for-each-group>
|
||||
</links>
|
||||
</xsl:template>
|
||||
<xsl:template match="text()"/>
|
||||
<xsl:template match="link[@rel='stylesheet']">
|
||||
<link>
|
||||
<xsl:copy-of select="@*"/>
|
||||
</link>
|
||||
</xsl:template>
|
||||
<!-- Only images that carry a src attribute yield a link entry (same guard as the script[@src] template):
     an img without src would otherwise emit href="", which resolves to the page's own URL. -->
<xsl:template match="img[@src]">
|
||||
<link href="{@src}" type="image/*"/>
|
||||
</xsl:template>
|
||||
<xsl:template match="script[@src]">
|
||||
<link href="{@src}" type="{@type}"/>
|
||||
</xsl:template>
|
||||
</xsl:stylesheet>
|
||||
</p:input>
|
||||
<p:output name="data" id="links-local" debug="links"/>
|
||||
</p:processor>
|
||||
|
||||
<p:processor name="oxf:identity">
|
||||
<p:input name="data" href="#links-local"/>
|
||||
<p:output name="data" ref="links"/>
|
||||
</p:processor>
|
||||
|
||||
<!-- Update the links -->
|
||||
<p:processor name="oxf:unsafe-xslt">
|
||||
<p:input name="data" href="#html"/>
|
||||
<p:input name="request" href="#archive#xpointer(/archive/request)"/>
|
||||
<p:input name="links" href="#links-local"/>
|
||||
<p:input name="config">
|
||||
<xsl:stylesheet version="2.0">
|
||||
<xsl:variable name="links" select="doc('input:links')/links"/>
|
||||
<xsl:variable name="base" select="doc('input:request')/request/location"/>
|
||||
<xsl:key name="link" match="link" use="@href"/>
|
||||
<xsl:template match="@*|node()">
|
||||
<xsl:copy>
|
||||
<xsl:apply-templates select="@*|node()"/>
|
||||
</xsl:copy>
|
||||
</xsl:template>
|
||||
<xsl:template match="link[@rel='stylesheet']/@href|img/@src|script/@src">
|
||||
<xsl:attribute name="{name(.)}">
|
||||
<xsl:value-of select="$links/key('link', current())/@new-href"/>
|
||||
</xsl:attribute>
|
||||
</xsl:template>
|
||||
<xsl:template match="link[@rel!='stylesheet']/@href|a/@href">
|
||||
<xsl:attribute name="{name(.)}">
|
||||
<xsl:value-of select="resolve-uri(., $base)"/>
|
||||
</xsl:attribute>
|
||||
</xsl:template>
|
||||
</xsl:stylesheet>
|
||||
</p:input>
|
||||
<p:output name="data" id="html-rewritten" debug="rewritten"/>
|
||||
</p:processor>
|
||||
|
||||
<p:processor name="oxf:xml-converter">
|
||||
<p:input name="config">
|
||||
<config>
|
||||
<content-type>application/xml</content-type>
|
||||
<encoding>utf-8</encoding>
|
||||
<version>1.0</version>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:input name="data" href="#html-rewritten"/>
|
||||
<p:output name="data" ref="rewritten"/>
|
||||
</p:processor>
|
||||
|
||||
</p:config>
|
|
@ -1,69 +0,0 @@
|
|||
<p:config xmlns:p="http://www.orbeon.com/oxf/pipeline" xmlns:oxf="http://www.orbeon.com/oxf/processors" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:saxon="http://saxon.sf.net/">
|
||||
|
||||
<p:param name="record" type="input"/>
|
||||
<p:param name="index-entry" type="input"/>
|
||||
<p:param name="index" type="input"/>
|
||||
<p:param name="rewritten" type="output"/>
|
||||
|
||||
|
||||
<!-- Store the document -->
|
||||
<p:processor name="oxf:file-serializer">
|
||||
<p:input name="config">
|
||||
<config>
|
||||
<scope>session</scope>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:input name="data" href="#record#xpointer(/record/content/document)"/>
|
||||
<p:output name="data" id="url-written"/>
|
||||
</p:processor>
|
||||
|
||||
<!-- And read it as CSS -->
|
||||
<p:processor name="oxf:url-generator">
|
||||
<p:input name="config" transform="oxf:xslt" href="#url-written">
|
||||
<config xsl:version="2.0">
|
||||
<url>
|
||||
<xsl:value-of select="/*"/>
|
||||
</url>
|
||||
<content-type>text/css</content-type>
|
||||
<mode>text</mode>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:output name="data" id="css" debug="css"/>
|
||||
</p:processor>
|
||||
|
||||
|
||||
<!-- Update the links -->
|
||||
<p:processor name="oxf:unsafe-xslt">
|
||||
<p:input name="data" href="#css"/>
|
||||
<p:input name="index-entry" href="#index-entry"/>
|
||||
<p:input name="index" href="#index"/>
|
||||
<p:input name="config">
|
||||
<xsl:stylesheet version="2.0">
|
||||
<xsl:variable name="index" select="doc('input:index')/*"/>
|
||||
<xsl:variable name="resource" select="doc('input:index-entry')/resource"/>
|
||||
<xsl:variable name="base" select="$resource/uri"/>
|
||||
<xsl:template match="/document">
|
||||
<xsl:copy>
|
||||
<xsl:copy-of select="@*"/>
|
||||
<!-- Rewrite each url(...) target: when the fragment-less absolute URL is listed in the index, point to
     '../' + its local-name; otherwise emit the target resolved against $base. All other text is copied
     through. The double quotes inside the regex attribute must be written as &quot; because the
     attribute itself is delimited by double quotes. -->
<xsl:analyze-string select="." regex="url\([&quot;']?([^)'&quot;]+)[&quot;']?\)" flags="">
|
||||
<xsl:matching-substring>
|
||||
<xsl:text>url(</xsl:text>
|
||||
<xsl:variable name="abs" select="substring-before(concat(resolve-uri(regex-group(1), $base), '#'), '#')"/>
|
||||
<xsl:variable name="local-name" select="$index/resource[(uri, same-as) = $abs][1]/local-name"/>
|
||||
<xsl:value-of select="if ($local-name) then concat('../', $local-name) else resolve-uri(regex-group(1), $base)"/>
|
||||
<xsl:text>)</xsl:text>
|
||||
</xsl:matching-substring>
|
||||
<xsl:non-matching-substring>
|
||||
<xsl:copy-of select="."/>
|
||||
</xsl:non-matching-substring>
|
||||
</xsl:analyze-string>
|
||||
</xsl:copy>
|
||||
</xsl:template>
|
||||
</xsl:stylesheet>
|
||||
</p:input>
|
||||
<p:output name="data" ref="rewritten" debug="rewritten"/>
|
||||
</p:processor>
|
||||
|
||||
|
||||
|
||||
</p:config>
|
|
@ -1,109 +0,0 @@
|
|||
<p:config xmlns:p="http://www.orbeon.com/oxf/pipeline" xmlns:oxf="http://www.orbeon.com/oxf/processors" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:saxon="http://saxon.sf.net/">
|
||||
|
||||
<p:param name="record" type="input"/>
|
||||
<p:param name="index-entry" type="input"/>
|
||||
<p:param name="index" type="input"/>
|
||||
<p:param name="rewritten" type="output"/>
|
||||
|
||||
<!-- Try to guess the encoding... -->
|
||||
<p:processor name="oxf:xslt">
|
||||
<p:input name="data" href="#record"/>
|
||||
<p:input name="config">
|
||||
<encoding xsl:version="2.0">
|
||||
<xsl:choose>
|
||||
<xsl:when test="contains(/record/content/headers/header[@name='Content-Type'], 'charset=')">
|
||||
<xsl:value-of select="substring-before(concat(substring-after(/record/content/headers/header[@name='Content-Type'], 'charset='), ';'), ';')"/>
|
||||
<xsl:message>
|
||||
ENCODING :
|
||||
<xsl:value-of select="substring-before(concat(substring-after(/record/content/headers/header[@name='Content-Type'], 'charset='), ';'), ';')"/>
|
||||
</xsl:message>
|
||||
</xsl:when>
|
||||
<xsl:otherwise>utf-8</xsl:otherwise>
|
||||
</xsl:choose>
|
||||
</encoding>
|
||||
</p:input>
|
||||
<p:output name="data" id="encoding" debug="encoding"/>
|
||||
</p:processor>
|
||||
|
||||
<!-- Store the document -->
|
||||
<p:processor name="oxf:file-serializer">
|
||||
<p:input name="config" transform="oxf:xslt" href="#encoding">
|
||||
<config xsl:version="2.0">
|
||||
<scope>session</scope>
|
||||
<encoding>
|
||||
<xsl:value-of select="/encoding"/>
|
||||
</encoding>
|
||||
<force-encoding>true</force-encoding>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:input name="data" href="#record#xpointer(/record/content/document)"/>
|
||||
<p:output name="data" id="url-written" debug="url-written"/>
|
||||
</p:processor>
|
||||
|
||||
<!-- And read it as HTML -->
|
||||
<p:processor name="oxf:url-generator">
|
||||
<p:input name="config" transform="oxf:xslt" href="aggregate('root', #url-written, #encoding)">
|
||||
<config xsl:version="2.0">
|
||||
<url>
|
||||
<xsl:value-of select="/root/url"/>
|
||||
</url>
|
||||
<encoding>
|
||||
<xsl:value-of select="/root/encoding"/>
|
||||
</encoding>
|
||||
<force-encoding>true</force-encoding>
|
||||
<content-type>text/html</content-type>
|
||||
<force-content-type>true</force-content-type>
|
||||
<mode>html</mode>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:output name="data" id="html" debug="html"/>
|
||||
</p:processor>
|
||||
|
||||
<!-- Update the links -->
|
||||
<!-- TODO: support links in inline CSS -->
|
||||
<!-- TODO: support iframes and objects -->
|
||||
|
||||
|
||||
<p:processor name="oxf:unsafe-xslt">
|
||||
<p:input name="data" href="#html"/>
|
||||
<p:input name="index-entry" href="#index-entry"/>
|
||||
<p:input name="index" href="#index"/>
|
||||
<p:input name="config">
|
||||
<xsl:stylesheet version="2.0">
|
||||
<xsl:variable name="index" select="doc('input:index')/*"/>
|
||||
<xsl:variable name="resource" select="doc('input:index-entry')/resource"/>
|
||||
<xsl:variable name="base" select="$resource/uri"/>
|
||||
<xsl:template match="@*|node()">
|
||||
<xsl:copy>
|
||||
<xsl:apply-templates select="@*|node()"/>
|
||||
</xsl:copy>
|
||||
</xsl:template>
|
||||
<xsl:template match="link[@rel='stylesheet']/@href|img/@src|script/@src|embed/@src|@background">
|
||||
<xsl:attribute name="{name(.)}">
|
||||
<xsl:variable name="abs" select="substring-before(concat(resolve-uri(., $base), '#'), '#')"/>
|
||||
<xsl:variable name="local-name" select="$index/resource[(uri, same-as) = $abs][1]/local-name"/>
|
||||
<xsl:value-of select="if ($local-name) then concat(if ($resource/uri/@seed = 'false') then '../' else '', $local-name) else resolve-uri(., $base)"/>
|
||||
</xsl:attribute>
|
||||
</xsl:template>
|
||||
<xsl:template match="link[@rel!='stylesheet']/@href|a/@href">
|
||||
<xsl:attribute name="{name(.)}">
|
||||
<xsl:value-of select="resolve-uri(., $base)"/>
|
||||
</xsl:attribute>
|
||||
</xsl:template>
|
||||
</xsl:stylesheet>
|
||||
</p:input>
|
||||
<p:output name="data" id="html-rewritten" debug="rewritten"/>
|
||||
</p:processor>
|
||||
|
||||
<p:processor name="oxf:html-converter">
|
||||
<p:input name="config">
|
||||
<config>
|
||||
<content-type>text/html</content-type>
|
||||
<encoding>utf-8</encoding>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:input name="data" href="#html-rewritten"/>
|
||||
<p:output name="data" ref="rewritten"/>
|
||||
</p:processor>
|
||||
|
||||
</p:config>
|
|
@ -1,366 +0,0 @@
|
|||
<p:config xmlns:p="http://www.orbeon.com/oxf/pipeline" xmlns:oxf="http://www.orbeon.com/oxf/processors" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:saxon="http://saxon.sf.net/">
|
||||
|
||||
<p:param name="data" type="input"/>
|
||||
|
||||
<!-- Read the archive index -->
|
||||
<p:processor name="oxf:pipeline">
|
||||
<p:input name="config" href="/data-access.xpl"/>
|
||||
<p:input name="data" transform="oxf:xslt" href="#data">
|
||||
<config xsl:version="2.0">
|
||||
<relpath>
|
||||
<xsl:value-of select="/action/@directory"/>
|
||||
<xsl:text>index.xml</xsl:text>
|
||||
</relpath>
|
||||
<operation>read</operation>
|
||||
<type>document</type>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:input name="param">
|
||||
<empty/>
|
||||
</p:input>
|
||||
<p:output name="data" id="index" debug="index"/>
|
||||
</p:processor>
|
||||
|
||||
<!-- Create a WARC file -->
|
||||
<p:processor name="oxf:file-serializer">
|
||||
<p:input name="config">
|
||||
<config>
|
||||
<scope>request</scope>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:input name="data" transform="oxf:xslt" href="aggregate('root', #index, #data)">
|
||||
<xsl:stylesheet version="2.0">
|
||||
<xsl:import href="warc-lib.xsl"/>
|
||||
<xsl:template match="/">
|
||||
<xsl:variable name="content" as="node()*">
|
||||
<record>
|
||||
<header>
|
||||
<field>
|
||||
<name>WARC-Type</name>
|
||||
<value>warcinfo</value>
|
||||
</field>
|
||||
<field>
|
||||
<name>WARC-Date</name>
|
||||
<value>
|
||||
<xsl:value-of select="current-dateTime()"/>
|
||||
</value>
|
||||
</field>
|
||||
<field>
|
||||
<name>WARC-Record-ID</name>
|
||||
<!-- Record identifier: "<urn:uuid:" + @directory with its last character dropped and every '/' replaced by '-' + ">". The angle brackets are written as character references because a literal "<" in element content is not well-formed XML. -->
<value>
|
||||
<xsl:text>&lt;urn:uuid:</xsl:text>
|
||||
<xsl:value-of select="translate(substring(/root/action/@directory, 1, string-length(/root/action/@directory) - 1), '/', '-')"/>
|
||||
<xsl:text>&gt;</xsl:text>
|
||||
</value>
|
||||
</field>
|
||||
<field>
|
||||
<name>Content-Type</name>
|
||||
<value>application/warc-fields</value>
|
||||
</field>
|
||||
</header>
|
||||
<block>
|
||||
<field>
|
||||
<name>software</name>
|
||||
<value>Owark 0.3 http://owark.org</value>
|
||||
</field>
|
||||
<field>
|
||||
<name>format</name>
|
||||
<value>WARC file version 0.18</value>
|
||||
</field>
|
||||
</block>
|
||||
</record>
|
||||
<!--
|
||||
|
||||
|
||||
software: Heritrix 1.12.0 http://crawler.archive.org
|
||||
hostname: crawling017.archive.org
|
||||
ip: 207.241.227.234
|
||||
isPartOf: testcrawl-20050708
|
||||
description: testcrawl with WARC output
|
||||
operator: IA_Admin
|
||||
http-header-user-agent:
|
||||
Mozilla/5.0 (compatible; heritrix/1.4.0 +http://crawler.archive.org)
|
||||
format: WARC file version 0.18
|
||||
conformsTo:
|
||||
http://www.archive.org/documents/WarcFileFormat-0.18.html-->
|
||||
</xsl:variable>
|
||||
<document xsl:version="2.0" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="xs:string" content-type="text/plain">
|
||||
<xsl:apply-templates select="$content" mode="warc"/>
|
||||
</document>
|
||||
</xsl:template>
|
||||
</xsl:stylesheet>
|
||||
</p:input>
|
||||
<p:output name="data" id="warc" debug="warc"/>
|
||||
</p:processor>
|
||||
|
||||
<!-- Loop over the index to retrieve the documents -->
|
||||
|
||||
<p:for-each href="#index" select="/archive-set/archive" id="files" root="files">
|
||||
|
||||
<!-- Read the document -->
|
||||
<p:processor name="oxf:pipeline">
|
||||
<p:input name="config" href="/data-access.xpl"/>
|
||||
<p:input name="data" transform="oxf:xslt" href="aggregate('root', #data, current())">
|
||||
<config xsl:version="2.0">
|
||||
<relpath>
|
||||
<xsl:value-of select="/root/action/@directory"/>
|
||||
<xsl:value-of select="/root/archive/@href"/>
|
||||
</relpath>
|
||||
<operation>read</operation>
|
||||
<type>document</type>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:input name="param">
|
||||
<empty/>
|
||||
</p:input>
|
||||
<p:output name="data" id="document" debug="document"/>
|
||||
</p:processor>
|
||||
|
||||
<!-- Add the request and start of response records -->
|
||||
<p:processor name="oxf:file-serializer">
|
||||
<p:input name="config" transform="oxf:xslt" href="#warc">
|
||||
<config xsl:version="2.0">
|
||||
<file>
|
||||
<xsl:value-of select="substring-after(/url, 'file:')"/>
|
||||
</file>
|
||||
<make-directories>false</make-directories>
|
||||
<append>true</append>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:input name="data" transform="oxf:xslt" href="#document">
|
||||
<xsl:stylesheet version="2.0">
|
||||
<xsl:import href="warc-lib.xsl"/>
|
||||
<xsl:template match="/">
|
||||
<xsl:variable name="request" as="node()*">
|
||||
<!-- Request -->
|
||||
<record>
|
||||
<header>
|
||||
<field>
|
||||
<name>WARC-Type</name>
|
||||
<value>request</value>
|
||||
</field>
|
||||
<field>
|
||||
<name>WARC-Target-URI</name>
|
||||
<value>
|
||||
<xsl:value-of select="/archive/request/location"/>
|
||||
</value>
|
||||
</field>
|
||||
<field>
|
||||
<name>WARC-Date</name>
|
||||
<value>
|
||||
<!-- TODO: replace that by the archive date -->
|
||||
<xsl:value-of select="current-dateTime()"/>
|
||||
</value>
|
||||
</field>
|
||||
<field>
|
||||
<name>WARC-Record-ID</name>
|
||||
<!-- Record identifier: "<urn:uuid:" + @directory with its last character dropped and every '/' replaced by '-' + ">". The angle brackets are written as character references because a literal "<" in element content is not well-formed XML. -->
<!-- NOTE(review): this stylesheet is applied to #document, which the other expressions of this template address as /archive; /root/action/@directory therefore looks like it selects nothing here, leaving the identifier empty and identical for every record. Confirm and supply a unique value. -->
<value>
|
||||
<xsl:text>&lt;urn:uuid:</xsl:text>
|
||||
<xsl:value-of select="translate(substring(/root/action/@directory, 1, string-length(/root/action/@directory) - 1), '/', '-')"/>
|
||||
<xsl:text>&gt;</xsl:text>
|
||||
</value>
|
||||
</field>
|
||||
<field>
|
||||
<name>Content-Type</name>
|
||||
<value>application/http;msgtype=request</value>
|
||||
</field>
|
||||
</header>
|
||||
<block>
|
||||
<xsl:apply-templates select="/archive/request" mode="warc-http"/>
|
||||
</block>
|
||||
</record>
|
||||
</xsl:variable>
|
||||
<!-- Response -->
|
||||
<xsl:variable name="response" as="node()*">
|
||||
<record>
|
||||
<header>
|
||||
<field>
|
||||
<name>WARC-Type</name>
|
||||
<value>response</value>
|
||||
</field>
|
||||
<field>
|
||||
<name>WARC-Target-URI</name>
|
||||
<value>
|
||||
<xsl:value-of select="/archive/request/location"/>
|
||||
</value>
|
||||
</field>
|
||||
<field>
|
||||
<name>WARC-Date</name>
|
||||
<value>
|
||||
<!-- TODO: replace that by the archive date -->
|
||||
<xsl:value-of select="current-dateTime()"/>
|
||||
</value>
|
||||
</field>
|
||||
<field>
|
||||
<name>WARC-Record-ID</name>
|
||||
<!-- Record identifier: "<urn:uuid:" + @directory with its last character dropped and every '/' replaced by '-' + ">". The angle brackets are written as character references because a literal "<" in element content is not well-formed XML. -->
<!-- NOTE(review): this stylesheet is applied to #document, which the other expressions of this template address as /archive; /root/action/@directory therefore looks like it selects nothing here, leaving the identifier empty and identical for every record. Confirm and supply a unique value. -->
<value>
|
||||
<xsl:text>&lt;urn:uuid:</xsl:text>
|
||||
<xsl:value-of select="translate(substring(/root/action/@directory, 1, string-length(/root/action/@directory) - 1), '/', '-')"/>
|
||||
<xsl:text>&gt;</xsl:text>
|
||||
</value>
|
||||
</field>
|
||||
<field>
|
||||
<name>Content-Type</name>
|
||||
<value>application/http;msgtype=response</value>
|
||||
</field>
|
||||
</header>
|
||||
<block>
|
||||
<xsl:apply-templates select="/archive/response" mode="warc-http"/>
|
||||
</block>
|
||||
</record>
|
||||
</xsl:variable>
|
||||
<document xsl:version="2.0" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="xs:string" content-type="text/plain">
|
||||
<xsl:apply-templates select="$request" mode="warc"/>
|
||||
<!-- Serialize the response record. The tunnel parameter document-length is derived from the text of /archive/response/document after removing spaces, line feeds and carriage returns (presumably base64 text, hence the 3/4 ratio; confirm). -->
<xsl:apply-templates select="$response" mode="warc">
|
||||
<!-- The whitespace characters are written as character references so that attribute-value normalization cannot alter them. idiv keeps the result an xs:integer as declared, whereas div on two integers yields xs:decimal. NOTE(review): trailing '=' padding is not subtracted, so the value may exceed the decoded size by up to 2; confirm. -->
<xsl:with-param name="document-length" as="xs:integer" select="string-length(translate(/archive/response/document, ' &#10;&#13;', '')) * 3 idiv 4" tunnel="yes"/>
|
||||
</xsl:apply-templates>
|
||||
</document>
|
||||
</xsl:template>
|
||||
</xsl:stylesheet>
|
||||
</p:input>
|
||||
</p:processor>
|
||||
|
||||
<!-- Add the response document to finalize the response record -->
|
||||
<p:processor name="oxf:file-serializer">
|
||||
<p:input name="config" transform="oxf:xslt" href="#warc">
|
||||
<config xsl:version="2.0">
|
||||
<file>
|
||||
<xsl:value-of select="substring-after(/url, 'file:')"/>
|
||||
</file>
|
||||
<make-directories>false</make-directories>
|
||||
<append>true</append>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:input name="data" href="#document#xpointer(/archive/response/document)"/>
|
||||
</p:processor>
|
||||
|
||||
<p:choose href="current()">
|
||||
<p:when test="/archive/@href-rewritten">
|
||||
<!-- Read the rewritten document -->
|
||||
<p:processor name="oxf:pipeline">
|
||||
<p:input name="config" href="/data-access.xpl"/>
|
||||
<p:input name="data" transform="oxf:xslt" href="aggregate('root', #data, current())">
|
||||
<config xsl:version="2.0">
|
||||
<relpath>
|
||||
<xsl:value-of select="/root/action/@directory"/>
|
||||
<xsl:value-of select="/root/archive/@href-rewritten"/>
|
||||
</relpath>
|
||||
<operation>read</operation>
|
||||
<type>document</type>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:input name="param">
|
||||
<empty/>
|
||||
</p:input>
|
||||
<p:output name="data" id="rewritten" debug="rewritten"/>
|
||||
</p:processor>
|
||||
<!-- Store this document -->
|
||||
<p:processor name="oxf:file-serializer">
|
||||
<p:input name="config">
|
||||
<config>
|
||||
<scope>request</scope>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:input name="data" href="#rewritten#xpointer(/document/document)"/>
|
||||
<p:output name="data" id="file" debug="file"/>
|
||||
</p:processor>
|
||||
</p:when>
|
||||
<p:otherwise>
|
||||
<!-- Store a copy of the original version -->
|
||||
<p:processor name="oxf:file-serializer">
|
||||
<p:input name="config">
|
||||
<config>
|
||||
<scope>request</scope>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:input name="data" href="#document#xpointer(/archive/response/document)"/>
|
||||
<p:output name="data" id="file" debug="file"/>
|
||||
</p:processor>
|
||||
</p:otherwise>
|
||||
</p:choose>
|
||||
|
||||
|
||||
|
||||
<p:processor name="oxf:identity">
|
||||
<p:input name="data" href="aggregate('file', current(), #file)"/>
|
||||
<p:output name="data" ref="files"/>
|
||||
</p:processor>
|
||||
|
||||
|
||||
</p:for-each>
|
||||
|
||||
<p:processor name="oxf:null-serializer">
|
||||
<p:input name="data" href="#files" debug="files"/>
|
||||
</p:processor>
|
||||
|
||||
<p:processor name="oxf:zip">
|
||||
<p:input name="data" transform="oxf:unsafe-xslt" href="aggregate('root', #warc, #files)">
|
||||
<files xsl:version="2.0" file-name="archive.zip">
|
||||
<file name="archive.warc">
|
||||
<xsl:value-of select="/root/url"/>
|
||||
</file>
|
||||
<xsl:for-each select="/root/files/file[url]">
|
||||
<xsl:choose>
|
||||
<xsl:when test="position()=1">
|
||||
<!-- TODO: support non HTML documents... -->
|
||||
<file name="rewritten/index.html">
|
||||
<xsl:value-of select="url"/>
|
||||
</file>
|
||||
</xsl:when>
|
||||
<xsl:otherwise>
|
||||
<xsl:variable name="tokens" select="tokenize(archive/@url, '/')"/>
|
||||
<xsl:variable name="last-token" select="$tokens[last()]"/>
|
||||
<xsl:variable name="tokens2" select="tokenize($last-token, '\.')"/>
|
||||
<xsl:variable name="extension" select="$tokens2[last()]"/>
|
||||
<file name="rewritten/{saxon:string-to-hexBinary(substring(archive/@url, 1, string-length(archive/@url) - string-length($extension) - 1), 'utf-8')}.{$extension}">
|
||||
<xsl:value-of select="url"/>
|
||||
</file>
|
||||
</xsl:otherwise>
|
||||
</xsl:choose>
|
||||
</xsl:for-each>
|
||||
</files>
|
||||
</p:input>
|
||||
<p:output name="data" id="zip"/>
|
||||
</p:processor>
|
||||
|
||||
<p:processor name="oxf:file-serializer">
|
||||
<p:input name="config">
|
||||
<config>
|
||||
<file>/tmp/archive.zip</file>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:input name="data" href="#zip"/>
|
||||
|
||||
</p:processor>
|
||||
|
||||
|
||||
<!-- Update the queue -->
|
||||
<p:processor name="oxf:pipeline">
|
||||
<p:input name="config" href="/data-access.xpl"/>
|
||||
<p:input name="data" transform="oxf:xslt" href="#data">
|
||||
<config xsl:version="2.0">
|
||||
<relpath>queue.xml</relpath>
|
||||
<operation>write</operation>
|
||||
<type>xquery</type>
|
||||
<parameter name="uuid" type="string">
|
||||
<xsl:value-of select="/action/@uuid"/>
|
||||
</parameter>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:input name="param">
|
||||
<xquery><![CDATA[
|
||||
|
||||
for $a in /queue/action where $a/@uuid = $(uuid) return
|
||||
update
|
||||
delete $a
|
||||
|
||||
]]></xquery>
|
||||
</p:input>
|
||||
<p:output name="data" id="response4" debug="response"/>
|
||||
</p:processor>
|
||||
<p:processor name="oxf:null-serializer">
|
||||
<p:input name="data" href="#response4"/>
|
||||
</p:processor>
|
||||
|
||||
|
||||
</p:config>
|
|
@ -1,387 +0,0 @@
|
|||
|
||||
<!--
|
||||
|
||||
Package an Heritrix WARC
|
||||
|
||||
-->
|
||||
|
||||
<p:config xmlns:p="http://www.orbeon.com/oxf/pipeline" xmlns:oxf="http://www.orbeon.com/oxf/processors" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:xforms="http://www.w3.org/2002/xforms"
|
||||
xmlns:xxforms="http://orbeon.org/oxf/xml/xforms" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:exist="http://exist.sourceforge.net/NS/exist" xmlns:saxon="http://saxon.sf.net/"
|
||||
xmlns:pipeline="java:org.orbeon.oxf.processor.pipeline.PipelineFunctionLibrary" xmlns:owk="http://owark.org/orbeon/processors">
|
||||
|
||||
<p:param name="data" type="input"/>
|
||||
|
||||
|
||||
<!-- Download the WARC -->
|
||||
<p:processor name="oxf:url-generator">
|
||||
<p:input name="config" transform="oxf:xslt" href="#data">
|
||||
<config xsl:version="2.0">
|
||||
<url>
|
||||
<xsl:value-of select="/action/@warc-url"/>
|
||||
</url>
|
||||
<mode>binary</mode>
|
||||
<authentication>
|
||||
<username>
|
||||
<xsl:value-of select="doc('oxf:/config.xml')/config/heritrix/username"/>
|
||||
</username>
|
||||
<password>
|
||||
<xsl:value-of select="doc('oxf:/config.xml')/config/heritrix/password"/>
|
||||
</password>
|
||||
<preemptive>false</preemptive>
|
||||
</authentication>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:output name="data" id="warc"/>
|
||||
</p:processor>
|
||||
|
||||
<p:processor name="owk:from-warc-converter">
|
||||
<p:input name="data" href="#warc"/>
|
||||
<p:output name="data" id="warc-xml" debug="warc-xml"/>
|
||||
</p:processor>
|
||||
|
||||
<p:processor name="oxf:null-serializer">
|
||||
<p:input name="data" href="#warc-xml"/>
|
||||
</p:processor>
|
||||
|
||||
<!-- Download the log -->
|
||||
<p:processor name="oxf:url-generator">
|
||||
<p:input name="config" transform="oxf:xslt" href="#data">
|
||||
<config xsl:version="2.0">
|
||||
<url>
|
||||
<xsl:value-of select="/action/@log-url"/>
|
||||
</url>
|
||||
<mode>text</mode>
|
||||
<authentication>
|
||||
<username>
|
||||
<xsl:value-of select="doc('oxf:/config.xml')/config/heritrix/username"/>
|
||||
</username>
|
||||
<password>
|
||||
<xsl:value-of select="doc('oxf:/config.xml')/config/heritrix/password"/>
|
||||
</password>
|
||||
<preemptive>false</preemptive>
|
||||
</authentication>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:output name="data" id="log" debug="log"/>
|
||||
</p:processor>
|
||||
|
||||
<!-- Store the log in a temp file -->
|
||||
<p:processor name="oxf:file-serializer">
|
||||
<p:input name="config">
|
||||
<config>
|
||||
<scope>request</scope>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:input name="data" href="#log"/>
|
||||
<p:output name="data" id="log-location" debug="log-location"/>
|
||||
</p:processor>
|
||||
|
||||
|
||||
<p:processor name="oxf:xslt">
|
||||
<p:input name="data" href="#log"/>
|
||||
<p:input name="config" href="parse-log.xslt"/>
|
||||
<p:output name="data" id="log-xml" debug="log-xml"/>
|
||||
</p:processor>
|
||||
|
||||
<!-- Create a resource index with links and local names -->
|
||||
<p:processor name="oxf:xslt">
|
||||
<p:input name="data" href="#log-xml"/>
|
||||
<p:input name="config" href="resource-index.xslt"/>
|
||||
<p:output name="data" id="index" debug="index"/>
|
||||
</p:processor>
|
||||
|
||||
|
||||
|
||||
|
||||
<!-- Loop over the WARC file to store and transform documents -->
|
||||
<p:for-each href="#warc-xml" select="/warc/record[headers/header[@name='Content-Type'] = 'application/http; msgtype=response' and content/status/status = 200]" root="root" id="loop">
|
||||
<p:processor name="oxf:xslt">
|
||||
<p:input name="data" href="aggregate('root', current(), #index)" debug="aggregate"/>
|
||||
<p:input name="config">
|
||||
<resource xsl:version="2.0">
|
||||
<xsl:copy-of select="/root/index/resource[uri = /root/record/headers/header[@name = 'WARC-Target-URI']]/*"/>
|
||||
</resource>
|
||||
</p:input>
|
||||
<p:output name="data" id="index-entry" debug="index-entry"/>
|
||||
</p:processor>
|
||||
<p:choose href="#index-entry">
|
||||
<p:when test="/resource/embeds">
|
||||
<!-- The resource has embedded content and must be rewritten -->
|
||||
|
||||
<!-- Call the corresponding pipeline -->
|
||||
<p:processor name="oxf:url-generator">
|
||||
<p:input name="config" transform="oxf:xslt" href="#index-entry">
|
||||
<config xsl:version="2.0">
|
||||
<url>
|
||||
<xsl:text>oxf:/actions/mediatypes/warc-</xsl:text>
|
||||
<xsl:value-of select="/resource/type"/>
|
||||
<xsl:text>.xpl</xsl:text>
|
||||
</url>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:output name="data" id="pipeline"/>
|
||||
</p:processor>
|
||||
|
||||
<p:processor name="oxf:pipeline">
|
||||
<p:input name="config" href="#pipeline"/>
|
||||
<p:input name="record" href="current()"/>
|
||||
<p:input name="index" href="#index"/>
|
||||
<p:input name="index-entry" href="#index-entry"/>
|
||||
<p:output name="rewritten" id="document" debug="rewritten"/>
|
||||
</p:processor>
|
||||
|
||||
</p:when>
|
||||
<p:otherwise>
|
||||
<!-- The resource can be stored -->
|
||||
<p:processor name="oxf:identity">
|
||||
<p:input name="data" href="current()#xpointer(/record/content/document)"/>
|
||||
<p:output name="data" id="document"/>
|
||||
</p:processor>
|
||||
</p:otherwise>
|
||||
</p:choose>
|
||||
<p:processor name="oxf:file-serializer">
|
||||
<p:input name="config">
|
||||
<config>
|
||||
<scope>request</scope>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:input name="data" href="#document"/>
|
||||
<p:output name="data" id="doc-location" debug="doc-location"/>
|
||||
</p:processor>
|
||||
<p:processor name="oxf:identity">
|
||||
<p:input name="data" href="aggregate('doc', #index-entry, #doc-location)"/>
|
||||
<p:output name="data" ref="loop"/>
|
||||
</p:processor>
|
||||
</p:for-each>
|
||||
|
||||
|
||||
|
||||
<!-- Store the WARC in a temp file -->
|
||||
<p:processor name="oxf:file-serializer">
|
||||
<p:input name="config">
|
||||
<config>
|
||||
<scope>request</scope>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:input name="data" href="#warc"/>
|
||||
<p:output name="data" id="warc-location" debug="warc-location"/>
|
||||
</p:processor>
|
||||
|
||||
<p:processor name="oxf:zip">
|
||||
<p:input name="data" transform="oxf:unsafe-xslt" href="aggregate('root', #warc-location, #log-location, #loop)">
|
||||
<files xsl:version="2.0" file-name="archive.zip">
|
||||
<file name="archive/archive.warc">
|
||||
<xsl:value-of select="/root/url[1]"/>
|
||||
</file>
|
||||
<file name="archive/archive.log">
|
||||
<xsl:value-of select="/root/url[2]"/>
|
||||
</file>
|
||||
<xsl:for-each select="/root/root/doc">
|
||||
<file name="rewritten/{resource/local-name}">
|
||||
<xsl:value-of select="url"/>
|
||||
</file>
|
||||
</xsl:for-each>
|
||||
</files>
|
||||
</p:input>
|
||||
<p:output name="data" id="zip"/>
|
||||
</p:processor>
|
||||
|
||||
<p:processor name="oxf:file-serializer">
|
||||
<p:input name="config">
|
||||
<config>
|
||||
<file>/tmp/archive.zip</file>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:input name="data" href="#zip"/>
|
||||
|
||||
</p:processor>
|
||||
|
||||
<!-- <p:choose href="#heritrix-job">
|
||||
<p:when test="/job/crawlControllerState='FINISHED'">
|
||||
<!-\- The job is finished, we can get its archive... -\->
|
||||
<!-\- Scan the directory to find the name of the WARC file -\->
|
||||
<p:processor name="oxf:url-generator">
|
||||
<p:input name="config" transform="oxf:xslt" href="#heritrix-job">
|
||||
<config xsl:version="2.0">
|
||||
<url>
|
||||
<xsl:value-of select="/job/configFiles/value[key='warcWriter.storePaths[0]']/url"/>
|
||||
</url>
|
||||
<authentication>
|
||||
<username>
|
||||
<xsl:value-of select="doc('oxf:/config.xml')/config/heritrix/username"/>
|
||||
</username>
|
||||
<password>
|
||||
<xsl:value-of select="doc('oxf:/config.xml')/config/heritrix/password"/>
|
||||
</password>
|
||||
<preemptive>false</preemptive>
|
||||
</authentication>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:output name="data" id="warc-dir-list" debug="warc-dir-list"/>
|
||||
</p:processor>
|
||||
<!-\- Next action: package -\->
|
||||
<p:processor name="oxf:pipeline">
|
||||
<p:input name="config" href="/data-access.xpl"/>
|
||||
<p:input name="data" transform="oxf:xslt" href="aggregate('root', #data, #warc-dir-list)">
|
||||
<config xsl:version="2.0">
|
||||
<relpath>queue.xml</relpath>
|
||||
<operation>write</operation>
|
||||
<type>xquery</type>
|
||||
<parameter name="uuid" type="string">
|
||||
<xsl:value-of select="/root/action/@uuid"/>
|
||||
</parameter>
|
||||
<parameter name="url" type="string">
|
||||
<xsl:value-of select="/root/action/@url"/>
|
||||
</parameter>
|
||||
<parameter name="directory" type="string">
|
||||
<xsl:value-of select="/root/action/@directory"/>
|
||||
</parameter>
|
||||
<parameter name="heritrix-job-url" type="string">
|
||||
<xsl:value-of select="/root/action/@heritrix-job-url"/>
|
||||
</parameter>
|
||||
<parameter name="priority" type="string">
|
||||
<xsl:value-of select="/root/action/@priority"/>
|
||||
</parameter>
|
||||
<parameter name="warc-url" type="string">
|
||||
<xsl:value-of select="/root/html/body/a[ends-with(., '.warc')][1]/@href"/>
|
||||
</parameter>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:input name="param">
|
||||
<xquery><![CDATA[
|
||||
declare namespace util = "http://exist-db.org/xquery/util";
|
||||
|
||||
for $q in /queue return
|
||||
update
|
||||
insert <action priority=$(priority) uuid="{util:uuid()}" type="package-heritrix-warc" url=$(url) directory=$(directory) heritrix-job-url=$(heritrix-job-url) warc-url=$(warc-url)/>
|
||||
into $q,
|
||||
|
||||
for $a in /queue/action where $a/@uuid = $(uuid) return
|
||||
update
|
||||
delete $a
|
||||
|
||||
]]></xquery>
|
||||
</p:input>
|
||||
<p:output name="data" id="response" debug="response"/>
|
||||
</p:processor>
|
||||
<p:processor name="oxf:null-serializer">
|
||||
<p:input name="data" href="#response"/>
|
||||
</p:processor>
|
||||
</p:when>
|
||||
<p:otherwise>
|
||||
<!-\- The job is not finished yet, we'll check later on... -\->
|
||||
<p:processor name="oxf:pipeline">
|
||||
<p:input name="config" href="/data-access.xpl"/>
|
||||
<p:input name="data" transform="oxf:xslt" href="#data">
|
||||
<config xsl:version="2.0">
|
||||
<relpath>queue.xml</relpath>
|
||||
<operation>write</operation>
|
||||
<type>xquery</type>
|
||||
<parameter name="uuid" type="string">
|
||||
<xsl:value-of select="/action/@uuid"/>
|
||||
</parameter>
|
||||
<parameter name="next-time" type="string">
|
||||
<xsl:value-of select="current-dateTime() + xs:dayTimeDuration('PT1M')"/>
|
||||
</parameter>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:input name="param">
|
||||
<xquery><![CDATA[
|
||||
for $a in /queue/action where $a/@uuid = $(uuid) return
|
||||
update value $a/@after with $(next-time)
|
||||
|
||||
]]></xquery>
|
||||
</p:input>
|
||||
<p:output name="data" id="response" debug="response"/>
|
||||
</p:processor>
|
||||
<p:processor name="oxf:null-serializer">
|
||||
<p:input name="data" href="#response"/>
|
||||
</p:processor>
|
||||
</p:otherwise>
|
||||
</p:choose>
|
||||
|
||||
|
||||
<!-\- <p:processor name="oxf:unsafe-xslt">
|
||||
<p:input name="data" href="aggregate('root', #data, #heritrix-engine, #heritrix-unpaused)"/>
|
||||
<p:input name="config">
|
||||
<config xsl:version="2.0">
|
||||
<relpath>queue.xml</relpath>
|
||||
<operation>write</operation>
|
||||
<type>xquery</type>
|
||||
<parameter name="directory" type="string">
|
||||
<xsl:value-of select="translate(/root/action/@uuid, '-', '/')"/>
|
||||
<xsl:text>/</xsl:text>
|
||||
</parameter>
|
||||
<parameter name="uuid" type="string">
|
||||
<xsl:value-of select="/root/action/@uuid"/>
|
||||
</parameter>
|
||||
<parameter name="url" type="string">
|
||||
<xsl:value-of select="/root/action/@url"/>
|
||||
</parameter>
|
||||
<parameter name="priority-warc" type="string">
|
||||
<xsl:value-of select="/root/action/@priority + 1"/>
|
||||
</parameter>
|
||||
<parameter name="next-time" type="string">
|
||||
<xsl:value-of select="current-dateTime() + xs:dayTimeDuration('PT1M')"/>
|
||||
</parameter>
|
||||
<parameter name="heritrix-job-url" type="string">
|
||||
<xsl:value-of select="/root/engine/jobs/value[shortName=/root/action/@uuid]/url"/>
|
||||
</parameter>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:output name="data" id="data-access-data"/>
|
||||
</p:processor>
|
||||
|
||||
|
||||
<p:processor name="oxf:pipeline">
|
||||
<p:input name="config" href="/data-access.xpl"/>
|
||||
<p:input name="data" transform="oxf:xslt" href="#data-access-data">
|
||||
<config xsl:version="2.0">
|
||||
<relpath>
|
||||
<xsl:value-of select="/config/parameter[@name='directory']"/>
|
||||
<xsl:text>index.xml</xsl:text>
|
||||
</relpath>
|
||||
<operation>write</operation>
|
||||
<type>document</type>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:input name="param" transform="oxf:xslt" href="#data-access-data">
|
||||
<archive-set xsl:version="2.0" url="{/config/parameter[@name='url']}" uuid="{/config/parameter[@name='uuid']}">
|
||||
<heritrix-job url="{/config/parameter[@name='heritrix-job-url']}"/>
|
||||
</archive-set>
|
||||
</p:input>
|
||||
<p:output name="data" id="response2" debug="response2"/>
|
||||
</p:processor>
|
||||
|
||||
<p:processor name="oxf:null-serializer">
|
||||
<p:input name="data" href="#response2"/>
|
||||
</p:processor>
|
||||
|
||||
|
||||
<p:processor name="oxf:pipeline">
|
||||
<p:input name="config" href="/data-access.xpl"/>
|
||||
<p:input name="data" href="#data-access-data"/>
|
||||
<p:input name="param">
|
||||
<xquery><![CDATA[
|
||||
declare namespace util = "http://exist-db.org/xquery/util";
|
||||
|
||||
for $q in /queue return
|
||||
update
|
||||
insert <action priority=$(priority-warc) uuid="{util:uuid()}" type="get-heritrix-warc" url=$(url) directory=$(directory) heritrix-job-url=$(heritrix-job-url) after=$(next-time)/>
|
||||
into $q,
|
||||
|
||||
for $a in /queue/action where $a/@uuid = $(uuid) return
|
||||
update
|
||||
delete $a
|
||||
|
||||
]]></xquery>
|
||||
</p:input>
|
||||
<p:output name="data" id="response" debug="response"/>
|
||||
</p:processor>
|
||||
|
||||
<p:processor name="oxf:null-serializer">
|
||||
<p:input name="data" href="#response"/>
|
||||
</p:processor>-\->
|
||||
|
||||
-->
|
||||
</p:config>
|
|
@ -1,51 +0,0 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:xd="http://www.oxygenxml.com/ns/doc/xsl" exclude-result-prefixes="xs xd"
|
||||
version="2.0">
|
||||
<xd:doc scope="stylesheet">
|
||||
<xd:desc>
|
||||
<xd:p><xd:b>Created on:</xd:b> Apr 26, 2012</xd:p>
|
||||
<xd:p><xd:b>Author:</xd:b> vdv</xd:p>
|
||||
<xd:p>See https://webarchive.jira.com/wiki/display/Heritrix/Logs</xd:p>
|
||||
</xd:desc>
|
||||
</xd:doc>
|
||||
|
||||
|
||||
<xsl:template match="/document">
|
||||
<log>
|
||||
<xsl:for-each select="tokenize(., '\n')[. != '']">
|
||||
<entry>
|
||||
<date-time>
|
||||
<xsl:value-of select="substring(., 1, 24)"/>
|
||||
</date-time>
|
||||
<code>
|
||||
<xsl:value-of select="normalize-space(substring(., 26, 5))"/>
|
||||
</code>
|
||||
<size>
|
||||
<xsl:value-of select="normalize-space(substring(., 33, 10))"/>
|
||||
</size>
|
||||
<xsl:variable name="tail" select="substring(., 43)"/>
|
||||
<xsl:variable name="tokens" select="tokenize($tail, ' ')"/>
|
||||
<uri>
|
||||
<xsl:value-of select="$tokens[1]"/>
|
||||
</uri>
|
||||
<discovery-path>
|
||||
<xsl:value-of select="$tokens[2]"/>
|
||||
</discovery-path>
|
||||
<referer>
|
||||
<xsl:value-of select="$tokens[3]"/>
|
||||
</referer>
|
||||
<content-type>
|
||||
<xsl:value-of select="$tokens[4]"/>
|
||||
</content-type>
|
||||
<timestamp>
|
||||
<xsl:value-of select="$tokens[6]"/>
|
||||
</timestamp>
|
||||
<sha1-digest>
|
||||
<xsl:value-of select="$tokens[7]"/>
|
||||
</sha1-digest>
|
||||
</entry>
|
||||
</xsl:for-each>
|
||||
</log>
|
||||
</xsl:template>
|
||||
|
||||
</xsl:stylesheet>
|
|
@ -1,119 +0,0 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:owk="http://owark.org/xslt/" xmlns:xd="http://www.oxygenxml.com/ns/doc/xsl"
|
||||
exclude-result-prefixes="xs xd owk" version="2.0">
|
||||
<xd:doc scope="stylesheet">
|
||||
<xd:desc>
|
||||
<xd:p><xd:b>Created on:</xd:b> Apr 26, 2012</xd:p>
|
||||
<xd:p><xd:b>Author:</xd:b> vdv</xd:p>
|
||||
<xd:p>Create a resource index with links and local names from the Heritrix crawl log in XML format</xd:p>
|
||||
</xd:desc>
|
||||
</xd:doc>
|
||||
|
||||
<xsl:variable name="source" select="/"/>
|
||||
|
||||
<xsl:variable name="common-extensions">
|
||||
<content-type name="application/x-shockwave-flash">
|
||||
<extension>swf</extension>
|
||||
</content-type>
|
||||
<content-type name="application/xhtml+xml">
|
||||
<extension>html</extension>
|
||||
</content-type>
|
||||
</xsl:variable>
|
||||
|
||||
<xsl:key name="extension" match="content-type" use="@name"/>
|
||||
|
||||
<!-- owk:add-extension($entry, $last-token): returns the suffix to append to a local file name. -->
<!-- Returns the empty string when $last-token already contains a '.'. Otherwise returns '.' followed by, in order of preference: the entry found in $common-extensions for the content type of $entry (via the 'extension' key); the part of the content type after '/'; or the literal 'unknown' when the content type has no '/'. -->
<xsl:function name="owk:add-extension" as="xs:string">
|
||||
<xsl:param name="entry" as="element(entry)"/>
|
||||
<xsl:param name="last-token" as="xs:string"/>
|
||||
<xsl:sequence select="
|
||||
if (contains($last-token, '.'))
|
||||
then ''
|
||||
else concat('.',
|
||||
if (key('extension', $entry/content-type, $common-extensions))
|
||||
then key('extension', $entry/content-type, $common-extensions)
|
||||
else if (contains($entry/content-type, '/'))
|
||||
then substring-after($entry/content-type, '/')
|
||||
else 'unknown') "/>
|
||||
</xsl:function>
|
||||
|
||||
|
||||
<!-- owk:local-name($entry): derives a relative file name for a log entry. -->
<!-- An entry whose discovery-path is '-' is treated as the seed and always maps to 'index.html'. For any other entry the URI, cut at the first '?', is split on '/'; the result is the third token (the host for a scheme://host/path URI), a '/', the last token (or 'index' when the last token is empty), and the suffix computed by owk:add-extension. -->
<xsl:function name="owk:local-name" as="xs:string">
|
||||
<xsl:param name="entry" as="element(entry)"/>
|
||||
<xsl:variable name="is-seed" select="$entry/discovery-path='-'"/>
|
||||
<xsl:variable name="tokens" select="tokenize(if (contains($entry/uri, '?')) then substring-before($entry/uri, '?') else $entry/uri, '/')"/>
|
||||
<xsl:sequence
|
||||
select="if ($is-seed)
|
||||
then 'index.html'
|
||||
else concat(
|
||||
$tokens[3],
|
||||
'/',
|
||||
if ($tokens[last()] = '') then 'index' else $tokens[last()],
|
||||
owk:add-extension($entry, $tokens[last()]))"
|
||||
/>
|
||||
</xsl:function>
|
||||
|
||||
<!-- owk:unique-local-name($entry): disambiguates the name produced by owk:local-name. -->
<!-- When exactly one entry indexed by the 'entry-by-name' key in $source has this local name, the name is returned unchanged. Otherwise a counter is inserted before the first '.' of the part that follows the first '/': the counter is 1 plus the number of preceding sibling entries that yield the same local name. -->
<!-- NOTE(review): a name without '/' (the seed's 'index.html') would be rebuilt from empty substrings on this branch; confirm that the seed name can never be duplicated. -->
<xsl:function name="owk:unique-local-name" as="xs:string">
|
||||
<xsl:param name="entry" as="element(entry)"/>
|
||||
<xsl:variable name="local-name" select="owk:local-name($entry)"/>
|
||||
<xsl:sequence
|
||||
select="if (count(key('entry-by-name', $local-name, $source)) = 1)
|
||||
then $local-name
|
||||
else concat(
|
||||
substring-before($local-name, '/'),
|
||||
'/',
|
||||
substring-before(substring-after($local-name, '/'), '.'),
|
||||
'-',
|
||||
count($entry/preceding-sibling::entry[owk:local-name(.) = $local-name]) + 1,
|
||||
'.',
|
||||
substring-after(substring-after($local-name, '/'), '.')
|
||||
)"
|
||||
/>
|
||||
</xsl:function>
|
||||
|
||||
<xsl:key name="entry-by-name" match="entry[substring-before(uri, '://') = ('http', 'https')]" use="owk:local-name(.)"/>
|
||||
|
||||
<!-- Entry point: wraps the result in an index element and processes only the entries whose URI scheme is http or https and whose code equals 200. -->
<xsl:template match="/log">
|
||||
<index>
|
||||
<xsl:apply-templates select="entry[substring-before(uri, '://') = ('http', 'https') and code = 200]"/>
|
||||
</index>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="entry">
|
||||
<resource>
|
||||
<xsl:variable name="is-seed" select="discovery-path='-'"/>
|
||||
<uri seed="{$is-seed}">
|
||||
<xsl:value-of select="uri"/>
|
||||
</uri>
|
||||
<local-name>
|
||||
<xsl:value-of select="owk:unique-local-name(.)"/>
|
||||
</local-name>
|
||||
<type>
|
||||
<xsl:choose>
|
||||
<xsl:when test="content-type = 'text/html'">html</xsl:when>
|
||||
<xsl:when test="content-type = 'application/xhtml+xml'">html</xsl:when>
|
||||
<xsl:when test="content-type = 'text/plain'">text</xsl:when>
|
||||
<xsl:otherwise>
|
||||
<xsl:value-of select="substring-after(content-type, '/')"/>
|
||||
</xsl:otherwise>
|
||||
</xsl:choose>
|
||||
</type>
|
||||
<xsl:apply-templates select="." mode="redirect"/>
|
||||
<xsl:apply-templates select="/log/entry[referer = current()/uri and ends-with(discovery-path, 'E')]" mode="embedding"/>
|
||||
</resource>
|
||||
</xsl:template>
|
||||
|
||||
<!-- Default for mode "redirect": any element not handled by a more specific template in this mode produces no output. -->
<xsl:template match="*" mode="redirect"/>
|
||||
<!-- Mode "redirect" for entries whose discovery-path ends with 'R': emits a same-as element holding the entry's referer, then applies the same mode to every /log/entry whose uri equals that referer, so that a chain of such entries is followed back. -->
<!-- NOTE(review): the seed attribute tests discovery-path='-' on an entry that matched because its discovery-path ends with 'R', so it looks like it is always "false"; confirm the intent. -->
<xsl:template match="entry[ends-with(discovery-path, 'R')]" mode="redirect">
|
||||
<same-as seed="{discovery-path='-'}">
|
||||
<xsl:value-of select="referer"/>
|
||||
</same-as>
|
||||
<xsl:apply-templates select="/log/entry[uri = current()/referer]" mode="redirect"/>
|
||||
</xsl:template>
|
||||
|
||||
<!-- Mode "embedding": emits an embeds element containing the uri of the matched entry. -->
<xsl:template match="entry" mode="embedding">
|
||||
<embeds>
|
||||
<xsl:value-of select="uri"/>
|
||||
</embeds>
|
||||
</xsl:template>
|
||||
|
||||
</xsl:stylesheet>
|
|
@ -1,103 +0,0 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:xd="http://www.oxygenxml.com/ns/doc/xsl" exclude-result-prefixes="xs xd"
|
||||
version="2.0">
|
||||
<xd:doc scope="stylesheet">
|
||||
<xd:desc>
|
||||
<xd:p><xd:b>Created on:</xd:b> Apr 13, 2012</xd:p>
|
||||
<xd:p><xd:b>Author:</xd:b> vdv</xd:p>
|
||||
<xd:p>Template library to produce WARC documents</xd:p>
|
||||
</xd:desc>
|
||||
</xd:doc>
|
||||
|
||||
<!--
    Line terminator appended after the version line, each field and each line:
    carriage return + line feed. It is written with character references
    because a literal line break inside an attribute value is normalized to a
    space by the XML parser.
-->
<xsl:variable name="CRLF" select="'&#13;&#10;'"/>
|
||||
<xsl:variable name="version">WARC/0.18</xsl:variable>
|
||||
<xsl:template match="CRLF" name="CRLF" mode="warc">
|
||||
<xsl:value-of select="$CRLF"/>
|
||||
</xsl:template>
|
||||
<xsl:template match="version" name="version" mode="warc">
|
||||
<xsl:value-of select="$version"/>
|
||||
<xsl:value-of select="$CRLF"/>
|
||||
</xsl:template>
|
||||
<xsl:template match="field" mode="warc">
|
||||
<xsl:value-of select="name"/>
|
||||
<xsl:text>: </xsl:text>
|
||||
<xsl:value-of select="value"/>
|
||||
<xsl:value-of select="$CRLF"/>
|
||||
</xsl:template>
|
||||
<xsl:template match="line" mode="warc">
|
||||
<xsl:value-of select="."/>
|
||||
<xsl:value-of select="$CRLF"/>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="record" mode="warc">
    <!--
        Serializes one record: two CRLFs, the header, a computed Content-Length
        field, one CRLF, then the rendered block.

        document-length (tunnel parameter, default 0) is added to the length of
        the rendered block to form the Content-Length value.
        NOTE(review): string-length() counts characters, not bytes; confirm that
        the block never contains non-ASCII text if Content-Length must be octets.
    -->
    <xsl:param name="document-length" as="xs:integer" select="0" tunnel="yes"/>

    <!-- Render the block first so that its length is known before it is written. -->
    <xsl:variable name="rendered-block">
        <xsl:apply-templates select="block" mode="warc"/>
    </xsl:variable>
    <!-- Temporary <field> so the length goes through the same "field" template as other headers. -->
    <xsl:variable name="length-field">
        <field>
            <name>Content-Length</name>
            <value>
                <xsl:value-of select="$document-length + string-length($rendered-block)"/>
            </value>
        </field>
    </xsl:variable>

    <xsl:call-template name="CRLF"/>
    <xsl:call-template name="CRLF"/>
    <xsl:apply-templates select="header" mode="warc"/>
    <xsl:apply-templates select="$length-field" mode="warc"/>
    <xsl:call-template name="CRLF"/>
    <xsl:value-of select="$rendered-block"/>
</xsl:template>
|
||||
|
||||
<xsl:template match="block" mode="warc">
|
||||
<xsl:apply-templates mode="warc"/>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="header" mode="warc">
|
||||
<xsl:call-template name="version"/>
|
||||
<xsl:apply-templates select="*" mode="warc"/>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="request" mode="warc-http">
|
||||
<line>
|
||||
<xsl:value-of select="method"/>
|
||||
<xsl:text> </xsl:text>
|
||||
<xsl:value-of select="location"/>
|
||||
<xsl:text> </xsl:text>
|
||||
<!-- TODO: get the HTTP version -->
|
||||
<xsl:text>HTTP/1.0</xsl:text>
|
||||
</line>
|
||||
<xsl:apply-templates select="header" mode="warc-http"/>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="response" mode="warc-http">
|
||||
<!--<xsl:message>
|
||||
<xsl:value-of select="string-length(document)"/>
|
||||
<xsl:text> - </xsl:text>
|
||||
<xsl:value-of select="string-length(translate(document, ' 

', ''))"/>
|
||||
</xsl:message>-->
|
||||
<line>
|
||||
<!-- TODO: get the HTTP version and status-->
|
||||
<xsl:text>HTTP/1.1 </xsl:text>
|
||||
<xsl:value-of select="code"/>
|
||||
<xsl:text> OK</xsl:text>
|
||||
</line>
|
||||
<xsl:apply-templates select="header" mode="warc-http"/>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="header" mode="warc-http">
|
||||
<field>
|
||||
<name>
|
||||
<xsl:value-of select="@name"/>
|
||||
</name>
|
||||
<value>
|
||||
<xsl:value-of select="."/>
|
||||
</value>
|
||||
</field>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="text()" mode="warc warc-http"/>
|
||||
|
||||
|
||||
</xsl:stylesheet>
|
|
@ -1,15 +0,0 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<config>
|
||||
|
||||
<exist-root>http://admin@localhost:8080/orbeon/exist/rest/db/</exist-root>
|
||||
<exist-db>owark/</exist-db>
|
||||
|
||||
<user-agent>Mozilla/5.0 (compatible; owark/0.3; http://owark.org/)</user-agent>
|
||||
|
||||
<heritrix>
|
||||
<rest-api>https://localhost:8443/engine</rest-api>
|
||||
<username>admin</username>
|
||||
<password>envierse</password>
|
||||
</heritrix>
|
||||
|
||||
</config>
|
|
@ -1,6 +0,0 @@
|
|||
<processors xmlns:owk="http://owark.org/orbeon/processors">
|
||||
<processor name="owk:from-warc-converter">
|
||||
<class name="org.owark.orbeon.FromWarcConverter"/>
|
||||
</processor>
|
||||
</processors>
|
||||
|
|
@ -1,160 +0,0 @@
|
|||
<!--
|
||||
|
||||
Database access
|
||||
|
||||
-->
|
||||
<p:config xmlns:p="http://www.orbeon.com/oxf/pipeline" xmlns:oxf="http://www.orbeon.com/oxf/processors" xmlns:xforms="http://www.w3.org/2002/xforms" xmlns:xxforms="http://orbeon.org/oxf/xml/xforms" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
|
||||
xmlns:exist="http://exist.sourceforge.net/NS/exist" xmlns:saxon="http://saxon.sf.net/">
|
||||
|
||||
|
||||
<p:param name="param" type="input"/>
|
||||
<!-- XQuery request or default document to read when not found -->
|
||||
<p:param name="data" type="input"/>
|
||||
<!-- Request description :
|
||||
<config>
|
||||
<relpath>Relative path</relpath>
|
||||
<operation>read|write</operation>
|
||||
<type>xquery|document</type>
|
||||
<parameter></parameter>
|
||||
<parameter></parameter>
|
||||
</config>
|
||||
-->
|
||||
<p:param name="data" type="output"/>
|
||||
|
||||
|
||||
<p:choose href="#data">
|
||||
<p:when test="/config/type = 'document' and /config/operation='read'">
|
||||
<p:processor name="oxf:xslt">
|
||||
<p:input name="data" href="#data"/>
|
||||
<p:input name="config.xml" href="oxf:/config.xml"/>
|
||||
<p:input name="config">
|
||||
<xsl:stylesheet version="2.0">
|
||||
<xsl:template match="/">
|
||||
<xsl:variable name="config" select="doc('input:config.xml')/config"/>
|
||||
<xforms:submission method="get" replace="none" action="{$config/exist-root}{$config/exist-db}{/config/relpath}"/>
|
||||
</xsl:template>
|
||||
</xsl:stylesheet>
|
||||
</p:input>
|
||||
<p:output name="data" id="submission"/>
|
||||
</p:processor>
|
||||
|
||||
<p:processor name="oxf:xforms-submission">
|
||||
<p:input name="submission" href="#submission"/>
|
||||
<p:input name="request" href="#param"/>
|
||||
<p:output name="response" id="document"/>
|
||||
</p:processor>
|
||||
<p:processor name="oxf:exception-catcher">
|
||||
<p:input name="data" href="#document"/>
|
||||
<p:output name="data" id="document-exception"/>
|
||||
</p:processor>
|
||||
<p:choose href="#document-exception">
|
||||
<p:when test="/exceptions">
|
||||
<p:processor name="oxf:identity">
|
||||
<p:input name="data" href="#param"/>
|
||||
<p:output name="data" ref="data"/>
|
||||
</p:processor>
|
||||
</p:when>
|
||||
<p:otherwise>
|
||||
<p:processor name="oxf:identity">
|
||||
<p:input name="data" href="#document-exception"/>
|
||||
<p:output name="data" ref="data"/>
|
||||
</p:processor>
|
||||
</p:otherwise>
|
||||
</p:choose>
|
||||
|
||||
</p:when>
|
||||
<p:when test="/config/type = 'document' and /config/operation='write'">
|
||||
<p:processor name="oxf:xslt">
|
||||
<p:input name="data" href="#data"/>
|
||||
<p:input name="config.xml" href="oxf:/config.xml"/>
|
||||
<p:input name="config">
|
||||
<xsl:stylesheet version="2.0">
|
||||
<xsl:template match="/">
|
||||
<xsl:variable name="config" select="doc('input:config.xml')/config"/>
|
||||
<xforms:submission method="put" replace="none" action="{$config/exist-root}{$config/exist-db}{/config/relpath}"/>
|
||||
</xsl:template>
|
||||
</xsl:stylesheet>
|
||||
</p:input>
|
||||
<p:output name="data" id="submission"/>
|
||||
</p:processor>
|
||||
<p:processor name="oxf:xforms-submission">
|
||||
<p:input name="submission" href="#submission"/>
|
||||
<p:input name="request" href="#param"/>
|
||||
<p:output name="response" ref="data"/>
|
||||
</p:processor>
|
||||
</p:when>
|
||||
|
||||
<p:when test="/config/type = 'xquery' ">
|
||||
|
||||
<p:processor name="oxf:unsafe-xslt">
|
||||
<p:input name="data" href="#data"/>
|
||||
<p:input name="config.xml" href="oxf:/config.xml"/>
|
||||
<p:input name="param" href="#param"/>
|
||||
<p:input name="config">
|
||||
<xsl:stylesheet version="2.0">
    <!--
        Builds the xforms:submission that runs an XQuery through the REST URL
        assembled from config.xml (exist-root + exist-db) and /config/relpath.

        Inputs read here:
          context document   - the request description (/config/relpath, /config/parameter)
          input:param        - the query text; every $(name) placeholder is substituted
          input:config.xml   - supplies exist-root and exist-db

        Placeholder substitution, by the @type of the matching /config/parameter:
          string    - the escaped value wrapped in single quotes
          node-set  - the parameter's child elements serialized as XML
          any other - type('escaped value'), i.e. the type name used as a function call
        A placeholder with no matching parameter terminates the transformation.
    -->
    <xsl:output name="output" method="xml" omit-xml-declaration="yes"/>
    <xsl:template match="/">
        <xsl:variable name="query">
            <xsl:variable name="data" select="/"/>
            <xsl:analyze-string select="string(doc('input:param'))" regex="\$\((\i\c*)\)" flags="">
                <xsl:matching-substring>
                    <xsl:variable name="parameter" select="$data/config/parameter[@name = regex-group(1)]"/>
                    <!--
                        Escape the value for use inside a single-quoted XQuery string literal:
                        '&' becomes the five characters of the amp entity reference and the
                        apostrophe becomes the six characters of the apos entity reference.
                        The ampersands are double-escaped in this attribute so that the XML
                        parser hands the literal entity text to XPath.
                    -->
                    <xsl:variable name="sanitized" select="if ($parameter/@type = 'node-set') then saxon:serialize($parameter/*, 'output') else replace(replace($parameter, '&amp;', '&amp;amp;'), '''', '&amp;apos;')"/>
                    <xsl:choose>
                        <xsl:when test="not($parameter)">
                            <xsl:message terminate="yes">Parameter <xsl:value-of select="regex-group(1)"/> not found in query <xsl:value-of select="doc('input:param')"
                            /></xsl:message>
                        </xsl:when>
                        <xsl:when test="$parameter/@type='string'">
                            <xsl:text>'</xsl:text>
                            <xsl:value-of select="$sanitized"/>
                            <xsl:text>'</xsl:text>
                        </xsl:when>
                        <xsl:when test="$parameter/@type='node-set'">
                            <xsl:copy-of select="$sanitized"/>
                        </xsl:when>
                        <xsl:otherwise>
                            <xsl:value-of select="$parameter/@type"/>
                            <xsl:text>('</xsl:text>
                            <xsl:value-of select="$sanitized"/>
                            <xsl:text>')</xsl:text>
                        </xsl:otherwise>
                    </xsl:choose>
                </xsl:matching-substring>
                <xsl:non-matching-substring>
                    <xsl:value-of select="."/>
                </xsl:non-matching-substring>
            </xsl:analyze-string>
        </xsl:variable>
        <!-- Trace the expanded query. -->
        <xsl:message>
            <xsl:value-of select="$query"/>
        </xsl:message>
        <xsl:variable name="config" select="doc('input:config.xml')/config"/>
        <!-- The query-string separator must be written as an entity reference inside an attribute value. -->
        <xforms:submission method="get" replace="none"
            action="{$config/exist-root}{$config/exist-db}{/config/relpath}?_howmany=10000&amp;_query={encode-for-uri(normalize-space($query))}"/>
    </xsl:template>
</xsl:stylesheet>
|
||||
</p:input>
|
||||
<p:output name="data" id="submission"/>
|
||||
</p:processor>
|
||||
<p:processor name="oxf:xforms-submission">
|
||||
<p:input name="submission" href="#submission"/>
|
||||
<p:input name="request" href="#param"/>
|
||||
<p:output name="response" ref="data"/>
|
||||
</p:processor>
|
||||
|
||||
</p:when>
|
||||
|
||||
<p:otherwise>
|
||||
<p:processor name="oxf:identity">
|
||||
<p:input name="data">
|
||||
<not-implemented/>
|
||||
</p:input>
|
||||
<p:output name="data" ref="data"/>
|
||||
</p:processor>
|
||||
</p:otherwise>
|
||||
</p:choose>
|
||||
|
||||
|
||||
|
||||
</p:config>
|
|
@ -1,37 +0,0 @@
|
|||
<!--
|
||||
Copyright (C) 2004 Orbeon, Inc.
|
||||
|
||||
This program is free software; you can redistribute it and/or modify it under the terms of the
|
||||
GNU Lesser General Public License as published by the Free Software Foundation; either version
|
||||
2.1 of the License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
|
||||
without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
See the GNU Lesser General Public License for more details.
|
||||
|
||||
The full text of the license is available at http://www.gnu.org/copyleft/lesser.html
|
||||
-->
|
||||
<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/" xmlns="http://jakarta.apache.org/log4j/">
|
||||
|
||||
<!-- This is the standard log appender to the console (System.err) -->
|
||||
<appender name="ConsoleAppender" class="org.apache.log4j.ConsoleAppender">
|
||||
<param name="Target" value="System.err"/>
|
||||
<layout class="org.apache.log4j.PatternLayout">
|
||||
<param name="ConversionPattern" value="%d{ISO8601} %-5p %c %x - %m%n"/>
|
||||
</layout>
|
||||
<filter class="org.apache.log4j.varia.LevelRangeFilter">
|
||||
<param name="LevelMin" value="INFO"/>
|
||||
</filter>
|
||||
</appender>
|
||||
<!-- XForms engine activity -->
|
||||
<category name="org.orbeon.oxf.xforms.processor.XFormsServer">
|
||||
<priority value="debug"/>
|
||||
</category>
|
||||
|
||||
<!-- This is the root logger -->
|
||||
<root>
|
||||
<priority value="debug"/>
|
||||
<appender-ref ref="ConsoleAppender"/>
|
||||
</root>
|
||||
|
||||
</log4j:configuration>
|
|
@ -1,65 +0,0 @@
|
|||
|
||||
<!--
|
||||
|
||||
Database creation
|
||||
|
||||
-->
|
||||
|
||||
<p:config xmlns:p="http://www.orbeon.com/oxf/pipeline" xmlns:oxf="http://www.orbeon.com/oxf/processors" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:xforms="http://www.w3.org/2002/xforms"
|
||||
xmlns:xxforms="http://orbeon.org/oxf/xml/xforms" xmlns:exist="http://exist.sourceforge.net/NS/exist" xmlns:pipeline="java:org.orbeon.oxf.processor.pipeline.PipelineFunctionLibrary">
|
||||
|
||||
<p:processor name="oxf:pipeline">
|
||||
<p:input name="config" href="data-access.xpl"/>
|
||||
<p:input name="data">
|
||||
<config>
|
||||
<relpath>index.xhtml</relpath>
|
||||
<operation>write</operation>
|
||||
<type>document</type>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:input name="param">
|
||||
<html xml:lang="fr" >
|
||||
<head>
|
||||
<title>Owark DB</title>
|
||||
</head>
|
||||
<body>
|
||||
<p>Owark db</p>
|
||||
</body>
|
||||
</html>
|
||||
</p:input>
|
||||
<p:output name="data" id="response" debug="response"/>
|
||||
</p:processor>
|
||||
|
||||
<p:processor name="oxf:null-serializer">
|
||||
<p:input name="data" href="#response"/>
|
||||
</p:processor>
|
||||
|
||||
|
||||
<p:processor name="oxf:pipeline">
|
||||
<p:input name="config" href="data-access.xpl"/>
|
||||
<p:input name="data">
|
||||
<config>
|
||||
<relpath>queue.xml</relpath>
|
||||
<operation>write</operation>
|
||||
<type>document</type>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:input name="param">
|
||||
<queue/>
|
||||
</p:input>
|
||||
<p:output name="data" id="response2" debug="response2"/>
|
||||
</p:processor>
|
||||
|
||||
<p:processor name="oxf:null-serializer">
|
||||
<p:input name="data" href="#response2"/>
|
||||
</p:processor>
|
||||
|
||||
|
||||
<!-- Indexes -->
|
||||
<!--<p:processor name="oxf:pipeline">
|
||||
<p:input name="config" href="create-indexes.xpl"/>
|
||||
</p:processor>
|
||||
-->
|
||||
|
||||
|
||||
</p:config>
|
|
@ -1,39 +0,0 @@
|
|||
|
||||
<!--
|
||||
|
||||
Post an archive request
|
||||
|
||||
-->
|
||||
|
||||
<p:config xmlns:p="http://www.orbeon.com/oxf/pipeline" xmlns:oxf="http://www.orbeon.com/oxf/processors" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:xforms="http://www.w3.org/2002/xforms"
|
||||
xmlns:xxforms="http://orbeon.org/oxf/xml/xforms" xmlns:exist="http://exist.sourceforge.net/NS/exist" xmlns:pipeline="java:org.orbeon.oxf.processor.pipeline.PipelineFunctionLibrary">
|
||||
|
||||
<p:processor name="oxf:pipeline">
|
||||
<p:input name="config" href="data-access.xpl"/>
|
||||
<p:input name="data">
|
||||
<config>
|
||||
<relpath>queue.xml</relpath>
|
||||
<operation>write</operation>
|
||||
<type>xquery</type>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:input name="param">
|
||||
<xquery><![CDATA[
|
||||
declare namespace util = "http://exist-db.org/xquery/util";
|
||||
|
||||
for $q in /queue return
|
||||
update
|
||||
insert <action priority="0" uuid="{util:uuid()}" type="heritrix-archive-set" url="http://xmlfr.org/"/>
|
||||
into $q
|
||||
|
||||
]]></xquery>
|
||||
</p:input>
|
||||
<p:output name="data" id="response" debug="response"/>
|
||||
</p:processor>
|
||||
|
||||
<p:processor name="oxf:null-serializer">
|
||||
<p:input name="data" href="#response"/>
|
||||
</p:processor>
|
||||
|
||||
|
||||
</p:config>
|
|
@ -1,27 +0,0 @@
|
|||
<!--
|
||||
Copyright (C) 2004 Orbeon, Inc.
|
||||
|
||||
This program is free software; you can redistribute it and/or modify it under the terms of the
|
||||
GNU Lesser General Public License as published by the Free Software Foundation; either version
|
||||
2.1 of the License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
|
||||
without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
See the GNU Lesser General Public License for more details.
|
||||
|
||||
The full text of the license is available at http://www.gnu.org/copyleft/lesser.html
|
||||
-->
|
||||
<properties xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:oxf="http://www.orbeon.com/oxf/processors">
|
||||
|
||||
<property as="xs:anyURI" name="oxf.log4j-config" value="oxf:/default-log4j.xml"/>
|
||||
<property as="xs:integer" name="oxf.cache.size" value="200"/>
|
||||
|
||||
|
||||
<property as="xs:anyURI" name="oxf.http.ssl.keystore.uri" value="file:/var/local/heritrix-3.1.0/heritrix.keystore"/>
|
||||
<property as="xs:string" name="oxf.http.ssl.keystore.password" value="heritrix"/>
|
||||
<property as="xs:string" name="oxf.http.ssl.hostname-verifier" value="allow-all"/>
|
||||
|
||||
<!--<property as="xs:NMTOKENS" name="oxf.xforms.logging.debug"
|
||||
value="document model submission control event action analysis server server-body html resolver utils
|
||||
submission-details submission-body"/>-->
|
||||
</properties>
|
|
@ -1,21 +0,0 @@
|
|||
|
||||
<!--
|
||||
|
||||
Reinstall the database
|
||||
|
||||
-->
|
||||
|
||||
<p:config xmlns:p="http://www.orbeon.com/oxf/pipeline" xmlns:oxf="http://www.orbeon.com/oxf/processors" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:xforms="http://www.w3.org/2002/xforms"
|
||||
xmlns:xxforms="http://orbeon.org/oxf/xml/xforms" xmlns:exist="http://exist.sourceforge.net/NS/exist" xmlns:pipeline="java:org.orbeon.oxf.processor.pipeline.PipelineFunctionLibrary">
|
||||
|
||||
|
||||
<p:processor name="oxf:pipeline">
|
||||
<p:input name="config" href="uninstall-db.xpl" />
|
||||
</p:processor>
|
||||
|
||||
<p:processor name="oxf:pipeline">
|
||||
<p:input name="config" href="install-db.xpl" />
|
||||
</p:processor>
|
||||
|
||||
|
||||
</p:config>
|
|
@ -1,54 +0,0 @@
|
|||
|
||||
<!--
|
||||
|
||||
Scheduler
|
||||
|
||||
-->
|
||||
|
||||
<p:config xmlns:p="http://www.orbeon.com/oxf/pipeline" xmlns:oxf="http://www.orbeon.com/oxf/processors" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:xforms="http://www.w3.org/2002/xforms"
|
||||
xmlns:xxforms="http://orbeon.org/oxf/xml/xforms" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:exist="http://exist.sourceforge.net/NS/exist" xmlns:pipeline="java:org.orbeon.oxf.processor.pipeline.PipelineFunctionLibrary">
|
||||
|
||||
<p:processor name="oxf:pipeline">
|
||||
<p:input name="config" href="data-access.xpl"/>
|
||||
<p:input name="data">
|
||||
<config>
|
||||
<relpath>queue.xml</relpath>
|
||||
<operation>read</operation>
|
||||
<type>xquery</type>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:input name="param">
|
||||
<xquery><![CDATA[
|
||||
|
||||
/queue/action[not(@after) or xs:dateTime(@after) < current-dateTime()][@priority=max(/queue/action[not(@after) or xs:dateTime(@after) < current-dateTime()]/@priority)]
|
||||
|
||||
]]></xquery>
|
||||
</p:input>
|
||||
<p:output name="data" id="actions" debug="actions"/>
|
||||
</p:processor>
|
||||
|
||||
<p:for-each href="#actions" select="/*/action">
|
||||
|
||||
<p:processor name="oxf:url-generator">
|
||||
<p:input name="config" transform="oxf:xslt" href="current()">
|
||||
<config xsl:version="2.0">
|
||||
<url>
|
||||
<xsl:text>oxf:/actions/</xsl:text>
|
||||
<!-- Remove / and \ for security reasons -->
|
||||
<xsl:value-of select="translate(/action/@type, '/\', '')"/>
|
||||
<xsl:text>.xpl</xsl:text>
|
||||
</url>
|
||||
</config>
|
||||
</p:input>
|
||||
<p:output name="data" id="pipeline"/>
|
||||
</p:processor>
|
||||
|
||||
<p:processor name="oxf:pipeline">
|
||||
<p:input name="config" href="#pipeline"/>
|
||||
<p:input name="data" href="current()"/>
|
||||
</p:processor>
|
||||
|
||||
</p:for-each>
|
||||
|
||||
|
||||
</p:config>
|
|
@ -1,104 +0,0 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<xsl:stylesheet version="2.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:exsl="http://exslt.org/common" extension-element-prefixes="exsl" xmlns:xsltu="http://xsltunit.org/0/"
|
||||
xmlns:owk="http://owark.org/xslt/" exclude-result-prefixes="exsl">
|
||||
<xsl:import href="../actions/mediatypes/common-rewrite.xsl"/>
|
||||
<xsl:import href="xsltunit.xsl"/>
|
||||
<xsl:output method="xml" version="1.0" encoding="UTF-8" indent="yes"/>
|
||||
<xsl:variable name="index" select="doc('local-names.xml')/index"/>
|
||||
<xsl:variable name="resource" select="$index/resource[uri='http://gmpg.org/xfn/11']"/>
|
||||
<xsl:key name="log-by-uri" match="/log/entry" use="uri"/>
|
||||
<xsl:template match="/">
|
||||
<xsltu:tests>
|
||||
<xsltu:test id="is-relative1">
|
||||
<xsl:call-template name="xsltu:assertEqual">
|
||||
<xsl:with-param name="id" select="'is-relative'"/>
|
||||
<xsl:with-param name="nodes1">
|
||||
<is-relative>true</is-relative>
|
||||
</xsl:with-param>
|
||||
<xsl:with-param name="nodes2">
|
||||
<is-relative>
|
||||
<xsl:value-of select="owk:is-relative('/foo')"/>
|
||||
</is-relative>
|
||||
</xsl:with-param>
|
||||
</xsl:call-template>
|
||||
</xsltu:test>
|
||||
<xsltu:test id="is-relative2">
|
||||
<xsl:call-template name="xsltu:assertEqual">
|
||||
<xsl:with-param name="id" select="'is-relative'"/>
|
||||
<xsl:with-param name="nodes1">
|
||||
<is-relative>false</is-relative>
|
||||
</xsl:with-param>
|
||||
<xsl:with-param name="nodes2">
|
||||
<is-relative>
|
||||
<xsl:value-of select="owk:is-relative('http://example.com/foo')"/>
|
||||
</is-relative>
|
||||
</xsl:with-param>
|
||||
</xsl:call-template>
|
||||
</xsltu:test>
|
||||
<xsltu:test id="safer-resolve-uri1">
    <!-- A root-relative reference is expected to be resolved against the given base URI. -->
    <xsl:call-template name="xsltu:assertEqual">
        <!-- The assertion id matches the test id so a failure is reported under the right name. -->
        <xsl:with-param name="id" select="'safer-resolve-uri1'"/>
        <xsl:with-param name="nodes1">
            <uri>http://example.com/foo</uri>
        </xsl:with-param>
        <xsl:with-param name="nodes2">
            <uri>
                <xsl:value-of select="owk:safer-resolve-uri('/foo', 'http://example.com/')"/>
            </uri>
        </xsl:with-param>
    </xsl:call-template>
</xsltu:test>
|
||||
<xsltu:test id="safer-resolve-uri2">
    <!-- An absolute URI is expected to come back unchanged, whatever the base URI is. -->
    <xsl:call-template name="xsltu:assertEqual">
        <!-- The assertion id matches the test id so a failure is reported under the right name. -->
        <xsl:with-param name="id" select="'safer-resolve-uri2'"/>
        <xsl:with-param name="nodes1">
            <uri>http://owark.org/foo</uri>
        </xsl:with-param>
        <xsl:with-param name="nodes2">
            <uri>
                <xsl:value-of select="owk:safer-resolve-uri('http://owark.org/foo', 'http://example.com/')"/>
            </uri>
        </xsl:with-param>
    </xsl:call-template>
</xsltu:test>
|
||||
<xsltu:test id="safer-resolve-uri3">
    <!-- An absolute URI containing curly braces is expected to come back unchanged. -->
    <xsl:call-template name="xsltu:assertEqual">
        <!-- The assertion id matches the test id so a failure is reported under the right name. -->
        <xsl:with-param name="id" select="'safer-resolve-uri3'"/>
        <xsl:with-param name="nodes1">
            <uri>http://owark.org/foo{{{{}}}}</uri>
        </xsl:with-param>
        <xsl:with-param name="nodes2">
            <uri>
                <xsl:value-of select="owk:safer-resolve-uri('http://owark.org/foo{{{{}}}}', 'http://example.com/')"/>
            </uri>
        </xsl:with-param>
    </xsl:call-template>
</xsltu:test>
|
||||
<xsltu:test id="url-rewrite">
|
||||
<xsl:call-template name="xsltu:assertEqual">
|
||||
<xsl:with-param name="id" select="'rewrite1'"/>
|
||||
<xsl:with-param name="nodes1">
|
||||
<uri>http://gmpg.org/foo</uri>
|
||||
</xsl:with-param>
|
||||
<xsl:with-param name="nodes2">
|
||||
<uri>
|
||||
<xsl:value-of select="owk:url-rewrite('/foo')"/>
|
||||
</uri>
|
||||
</xsl:with-param>
|
||||
</xsl:call-template>
|
||||
<xsl:call-template name="xsltu:assertEqual">
|
||||
<xsl:with-param name="id" select="'rewrite2'"/>
|
||||
<xsl:with-param name="nodes1">
|
||||
<uri>../gmpg.org/11-1.html</uri>
|
||||
</xsl:with-param>
|
||||
<xsl:with-param name="nodes2">
|
||||
<uri>
|
||||
<xsl:value-of select="owk:url-rewrite('/xfn/11')"/>
|
||||
</uri>
|
||||
</xsl:with-param>
|
||||
</xsl:call-template>
|
||||
</xsltu:test>
|
||||
|
||||
</xsltu:tests>
|
||||
</xsl:template>
|
||||
</xsl:stylesheet>
|
File diff suppressed because it is too large
Load Diff
|
@ -1,28 +0,0 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<xsl:stylesheet version="2.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:exsl="http://exslt.org/common" extension-element-prefixes="exsl" xmlns:xsltu="http://xsltunit.org/0/"
|
||||
xmlns:owk="http://owark.org/xslt/" exclude-result-prefixes="exsl">
|
||||
<xsl:import href="../actions/resource-index.xslt"/>
|
||||
<xsl:import href="xsltunit.xsl"/>
|
||||
<xsl:output method="xml" version="1.0" encoding="UTF-8" indent="yes"/>
|
||||
<xsl:variable name="local-names" select="doc('local-names.xml')/index"/>
|
||||
<xsl:key name="log-by-uri" match="/log/entry" use="uri"/>
|
||||
<xsl:template match="/">
|
||||
<xsltu:tests>
|
||||
<xsl:for-each select="$local-names/resource">
|
||||
<xsltu:test id="{uri}">
|
||||
<xsl:call-template name="xsltu:assertEqual">
|
||||
<xsl:with-param name="id" select="uri"/>
|
||||
<xsl:with-param name="nodes1">
|
||||
<local-name>
|
||||
<xsl:value-of select="owk:unique-local-name(key('log-by-uri', current()/uri, $source ))"/>
|
||||
</local-name>
|
||||
</xsl:with-param>
|
||||
<xsl:with-param name="nodes2">
|
||||
<xsl:copy-of select="local-name"/>
|
||||
</xsl:with-param>
|
||||
</xsl:call-template>
|
||||
</xsltu:test>
|
||||
</xsl:for-each>
|
||||
</xsltu:tests>
|
||||
</xsl:template>
|
||||
</xsl:stylesheet>
|
File diff suppressed because it is too large
Load Diff
|
@ -1,158 +0,0 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<xsl:stylesheet version="1.0"
|
||||
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
|
||||
xmlns:exsl="http://exslt.org/common"
|
||||
extension-element-prefixes="exsl"
|
||||
xmlns:xsltu="http://xsltunit.org/0/"
|
||||
exclude-result-prefixes="exsl">
|
||||
|
||||
<xsl:template name="xsltu:assertEqual">
|
||||
<xsl:param name="id"/>
|
||||
<xsl:param name="nodes1"/>
|
||||
<xsl:param name="nodes2"/>
|
||||
<xsl:variable name="result">
|
||||
<xsl:call-template name="xsltu:diff">
|
||||
<xsl:with-param name="nodes1" select="exsl:node-set($nodes1)"/>
|
||||
<xsl:with-param name="nodes2" select="exsl:node-set($nodes2)"/>
|
||||
</xsl:call-template>
|
||||
</xsl:variable>
|
||||
<xsl:call-template name="xsltu:assert">
|
||||
<xsl:with-param name="id" select="$id"/>
|
||||
<xsl:with-param name="test" select="not(exsl:node-set($result)//xsltu:no-match)"/>
|
||||
<xsl:with-param name="message" select="exsl:node-set($result)"/>
|
||||
</xsl:call-template>
|
||||
</xsl:template>
|
||||
<xsl:template name="xsltu:assertNotEqual">
|
||||
<xsl:param name="id"/>
|
||||
<xsl:param name="nodes1"/>
|
||||
<xsl:param name="nodes2"/>
|
||||
<xsl:variable name="result">
|
||||
<xsl:call-template name="xsltu:diff">
|
||||
<xsl:with-param name="nodes1" select="exsl:node-set($nodes1)"/>
|
||||
<xsl:with-param name="nodes2" select="exsl:node-set($nodes2)"/>
|
||||
</xsl:call-template>
|
||||
</xsl:variable>
|
||||
<xsl:call-template name="xsltu:assert">
|
||||
<xsl:with-param name="id" select="$id"/>
|
||||
<xsl:with-param name="test" select="exsl:node-set($result)//xsltu:no-match"/>
|
||||
<xsl:with-param name="message">Should have been different!</xsl:with-param>
|
||||
</xsl:call-template>
|
||||
</xsl:template>
|
||||
<xsl:template name="xsltu:assert">
|
||||
<xsl:param name="id"/>
|
||||
<xsl:param name="test"/>
|
||||
<xsl:param name="message"/>
|
||||
<xsltu:assert id="{$id}">
|
||||
<xsl:choose>
|
||||
<xsl:when test="$test">
|
||||
<xsl:attribute name="outcome">passed</xsl:attribute>
|
||||
</xsl:when>
|
||||
<xsl:otherwise>
|
||||
<xsl:attribute name="outcome">failed</xsl:attribute>
|
||||
<xsltu:message>
|
||||
<xsl:copy-of select="$message"/>
|
||||
</xsltu:message>
|
||||
</xsl:otherwise>
|
||||
</xsl:choose>
|
||||
</xsltu:assert>
|
||||
</xsl:template>
|
||||
<xsl:template name="xsltu:diff">
|
||||
<xsl:param name="nodes1"/>
|
||||
<xsl:param name="nodes2"/>
|
||||
<xsltu:diff name="{name($nodes1)}">
|
||||
<xsl:choose>
|
||||
<xsl:when test="self::* and (local-name($nodes1) != local-name($nodes2) or namespace-uri($nodes1) != namespace-uri($nodes2))">
|
||||
<xsltu:no-match diff="names">
|
||||
<xsltu:node>
|
||||
<xsl:copy-of select="$nodes1"/>
|
||||
</xsltu:node>
|
||||
<xsltu:node>
|
||||
<xsl:copy-of select="$nodes2"/>
|
||||
</xsltu:node>
|
||||
</xsltu:no-match>
|
||||
</xsl:when>
|
||||
<xsl:when test="count($nodes1/@*) != count($nodes2/@*)">
|
||||
<xsltu:no-match diff="number of children attributes ({count($nodes1/@*)} versus {count($nodes2/@*)} )">
|
||||
<xsltu:node>
|
||||
<xsl:copy-of select="$nodes1"/>
|
||||
</xsltu:node>
|
||||
<xsltu:node>
|
||||
<xsl:copy-of select="$nodes2"/>
|
||||
</xsltu:node>
|
||||
</xsltu:no-match>
|
||||
</xsl:when>
|
||||
<xsl:when test="count($nodes1/*) != count($nodes2/*)">
|
||||
<xsltu:no-match diff="number of children elements ({count($nodes1/*)} versus {count($nodes2/*)} )">
|
||||
<xsltu:node>
|
||||
<xsl:copy-of select="$nodes1"/>
|
||||
</xsltu:node>
|
||||
<xsltu:node>
|
||||
<xsl:copy-of select="$nodes2"/>
|
||||
</xsltu:node>
|
||||
</xsltu:no-match>
|
||||
</xsl:when>
|
||||
<xsl:when test="count($nodes1/text()) != count($nodes2/text())">
|
||||
<xsltu:no-match diff="number of children text nodes ({count($nodes1/text())} versus {count($nodes2/text())} )">
|
||||
<xsltu:node>
|
||||
<xsl:copy-of select="$nodes1"/>
|
||||
</xsltu:node>
|
||||
<xsltu:node>
|
||||
<xsl:copy-of select="$nodes2"/>
|
||||
</xsltu:node>
|
||||
</xsltu:no-match>
|
||||
</xsl:when>
|
||||
<xsl:otherwise>
|
||||
<xsl:apply-templates select="$nodes1/@*" mode="xsltu:diff">
|
||||
<xsl:with-param name="nodes2" select="$nodes2"/>
|
||||
</xsl:apply-templates>
|
||||
<xsl:apply-templates select="$nodes1/*" mode="xsltu:diff">
|
||||
<xsl:with-param name="nodes2" select="$nodes2"/>
|
||||
</xsl:apply-templates>
|
||||
<xsl:apply-templates select="$nodes1/text()" mode="xsltu:diff">
|
||||
<xsl:with-param name="nodes2" select="$nodes2"/>
|
||||
</xsl:apply-templates>
|
||||
</xsl:otherwise>
|
||||
</xsl:choose>
|
||||
</xsltu:diff>
|
||||
</xsl:template>
|
||||
<xsl:template match="*" mode="xsltu:diff">
|
||||
<xsl:param name="pos" select="position()"/>
|
||||
<xsl:param name="nodes2"/>
|
||||
<xsl:param name="node2" select="$nodes2/*[position()=$pos]"/>
|
||||
<xsl:call-template name="xsltu:diff">
|
||||
<xsl:with-param name="nodes1" select="."/>
|
||||
<xsl:with-param name="nodes2" select="$node2"/>
|
||||
</xsl:call-template>
|
||||
</xsl:template>
|
||||
<xsl:template match="text()" mode="xsltu:diff">
|
||||
<xsl:param name="current" select="."/>
|
||||
<xsl:param name="pos" select="position()"/>
|
||||
<xsl:param name="nodes2"/>
|
||||
<xsl:param name="node2" select="$nodes2/text()[position()=$pos]"/>
|
||||
<xsl:if test="not(. = $node2)">
|
||||
<xsltu:no-match>
|
||||
<xsltu:node>
|
||||
<xsl:copy-of select="."/>
|
||||
</xsltu:node>
|
||||
<xsltu:node>
|
||||
<xsl:copy-of select="$node2"/>
|
||||
</xsltu:node>
|
||||
</xsltu:no-match>
|
||||
</xsl:if>
|
||||
</xsl:template>
|
||||
<xsl:template match="@*" mode="xsltu:diff">
|
||||
<xsl:param name="current" select="."/>
|
||||
<xsl:param name="nodes2"/>
|
||||
<xsl:param name="node2" select="$nodes2/@*[local-name() = local-name(current()) and namespace-uri() = namespace-uri(current())]"/>
|
||||
<xsl:if test="not(. = $node2)">
|
||||
<xsltu:no-match>
|
||||
<xsltu:node>
|
||||
<xsl:copy-of select="."/>
|
||||
</xsltu:node>
|
||||
<xsltu:node>
|
||||
<xsl:copy-of select="$node2"/>
|
||||
</xsltu:node>
|
||||
</xsltu:no-match>
|
||||
</xsl:if>
|
||||
</xsl:template>
|
||||
</xsl:stylesheet>
|
|
@ -1,33 +0,0 @@
|
|||
|
||||
<!--
|
||||
|
||||
Remove the database
|
||||
|
||||
-->
|
||||
|
||||
<p:config xmlns:p="http://www.orbeon.com/oxf/pipeline"
|
||||
xmlns:oxf="http://www.orbeon.com/oxf/processors"
|
||||
xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:xforms="http://www.w3.org/2002/xforms"
|
||||
xmlns:xxforms="http://orbeon.org/oxf/xml/xforms"
|
||||
xmlns:exist="http://exist.sourceforge.net/NS/exist"
|
||||
xmlns:pipeline="java:org.orbeon.oxf.processor.pipeline.PipelineFunctionLibrary">
|
||||
|
||||
<p:processor name="oxf:xforms-submission">
|
||||
<p:input name="submission" href="oxf:/config.xml"
|
||||
transform="oxf:xslt">
|
||||
<xforms:submission xsl:version="2.0" method="delete"
|
||||
action="{/config/exist-root}{/config/exist-db}"
|
||||
/>
|
||||
</p:input>
|
||||
<p:input name="request">
|
||||
<empty/>
|
||||
</p:input>
|
||||
<p:output name="response" id="response1"/>
|
||||
</p:processor>
|
||||
|
||||
<p:processor name="oxf:null-serializer">
|
||||
<p:input name="data" href="#response1"/>
|
||||
</p:processor>
|
||||
|
||||
|
||||
</p:config>
|
Loading…
Reference in New Issue