/** * Copyright (C) 2012 Eric van der Vlist. * * This program is free software; you can redistribute it and/or modify it under the terms of the * GNU Lesser General Public License as published by the Free Software Foundation; either version * 2.1 of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * See the GNU Lesser General Public License for more details. * * The full text of the license is available at http://www.gnu.org/copyleft/lesser.html */ package org.owark.warc; import org.junit.Assert; import org.junit.Test; import java.io.*; /** * Test cases for WarcParser */ public class WarcParserTest { private static WarcParser warcParser; @Test public void testDyomedea() throws IOException, WarcParser.WarcException { // WARC File file = new File("/home/vdv/projects/owark/archiver/java/test/org/owark/warc/dyomedea.warc"); WarcParser warcParser = new WarcParser(new FileInputStream(file)); Assert.assertEquals(true, warcParser.hasNext()); // RECORD (warcinfo) WarcRecord record = warcParser.next(); Assert.assertEquals("WARC/1.0", warcParser.getMagic()); Assert.assertNotNull(record); Assert.assertEquals("WARC/1.0", record.getMagic()); // HEADER WarcRecordHeader header = record.getHeader(); Assert.assertNotNull(header); Assert.assertNull(header.getType()); Assert.assertEquals(true, header.hasNext()); WarcField headerItem = header.next(); Assert.assertNotNull(headerItem); Assert.assertEquals(WarcRecordHeader.WARC_TYPE, headerItem.getKey()); Assert.assertEquals("warcinfo", headerItem.getValue()); Assert.assertEquals("warcinfo", header.getType()); Assert.assertEquals("warcinfo", record.getType()); Assert.assertEquals(true, header.hasNext()); headerItem = header.next(); Assert.assertNotNull(headerItem); Assert.assertEquals("WARC-Date", headerItem.getKey()); Assert.assertEquals("2012-04-23T10:05:24Z", headerItem.getValue()); headerItem = header.next(); headerItem = header.next(); headerItem = header.next(); headerItem = header.next(); Assert.assertNotNull(headerItem); Assert.assertEquals("Content-Length", headerItem.getKey()); Assert.assertEquals("369", headerItem.getValue()); Assert.assertEquals(false, header.hasNext()); headerItem = header.next(); Assert.assertNull(headerItem); Assert.assertEquals("application/warc-fields", record.getContentType()); Assert.assertEquals(369, record.getContentLength()); // Content WarcRecordContent content = record.getContent(); Assert.assertNotNull(content); Assert.assertEquals(true, content.hasFields()); Assert.assertEquals(false, content.isHTTP()); Assert.assertEquals(false, content.hasStatusLine()); Assert.assertEquals(false, content.hasRequestLine()); Assert.assertEquals(true, content.hasNext()); WarcField field = content.next(); Assert.assertEquals(false, content.endOfContent()); Assert.assertNotNull(field); Assert.assertEquals("software", field.getKey()); Assert.assertEquals("Heritrix/3.1.0 http://crawler.archive.org", field.getValue()); field = content.next(); field = content.next(); field = content.next(); field = content.next(); field = content.next(); field = content.next(); field = content.next(); field = content.next(); Assert.assertNotNull(field); Assert.assertEquals("http-header-user-agent", field.getKey()); Assert.assertEquals("Mozilla/5.0 (compatible; heritrix/3.1.0 +http://owark.org)", field.getValue()); Assert.assertEquals(false, content.hasNext()); Assert.assertNull(content.getPayloadContentType()); Assert.assertNull(content.getPayloadContentHeader()); Assert.assertNull(content.getPayloadEncoding()); Assert.assertEquals(true, content.endOfContent()); // Next record (DNS response) Assert.assertEquals(true, warcParser.hasNext()); record = warcParser.next(); Assert.assertNotNull(record); // Header header = record.getHeader(); Assert.assertNotNull(header); Assert.assertNull(header.getType()); Assert.assertEquals(true, header.hasNext()); headerItem = header.next(); Assert.assertNotNull(headerItem); Assert.assertEquals(WarcRecordHeader.WARC_TYPE, headerItem.getKey()); Assert.assertEquals("response", headerItem.getValue()); header.skipToEnd(); // Content content = record.getContent(); Assert.assertNotNull(content); Assert.assertEquals(false, content.hasFields()); Assert.assertEquals(false, content.isHTTP()); Assert.assertEquals(false, content.hasStatusLine()); Assert.assertEquals(false, content.hasRequestLine()); Assert.assertEquals(false, content.endOfContent()); BufferedReader reader = new BufferedReader(new InputStreamReader(content, "UTF-8")); String line = reader.readLine(); Assert.assertEquals("20120423100524", line); line = reader.readLine(); Assert.assertEquals("dyomedea.com.\t\t1800\tIN\tA\t95.142.167.137", line); line = reader.readLine(); Assert.assertEquals(true, content.endOfContent()); Assert.assertEquals("text/dns", content.getPayloadContentType()); Assert.assertEquals("text/dns", content.getPayloadContentHeader()); Assert.assertNull(content.getPayloadEncoding()); Assert.assertNull(line); // Next record (HTTP response) Assert.assertEquals(true, warcParser.hasNext()); record = warcParser.next(); Assert.assertNotNull(record); // Header header = record.getHeader(); Assert.assertNotNull(header); Assert.assertNull(header.getType()); Assert.assertEquals(true, header.hasNext()); headerItem = header.next(); Assert.assertNotNull(headerItem); Assert.assertEquals(WarcRecordHeader.WARC_TYPE, headerItem.getKey()); Assert.assertEquals("response", headerItem.getValue()); header.skipToEnd(); // Content content = record.getContent(); Assert.assertNotNull(content); Assert.assertEquals(true, content.hasFields()); Assert.assertEquals(true, content.isHTTP()); Assert.assertEquals(false, content.isRequest()); Assert.assertEquals(true, content.hasStatusLine()); Assert.assertEquals(false, content.hasRequestLine()); WarcRecordContent.HttpStatusLine status = content.getStatusLine(); Assert.assertNotNull(status); Assert.assertEquals("HTTP/1.1 404 Introuvable", status.getLine()); Assert.assertEquals("HTTP/1.1", status.getVersion()); Assert.assertEquals("404", status.getStatus()); Assert.assertEquals("Introuvable", status.getReason()); field = content.next(); Assert.assertNotNull(field); Assert.assertEquals("Date", field.getKey()); Assert.assertEquals("Mon, 23 Apr 2012 10:05:27 GMT", field.getValue()); field = content.next(); field = content.next(); field = content.next(); field = content.next(); field = content.next(); field = content.next(); Assert.assertNotNull(field); Assert.assertEquals("Connection", field.getKey()); Assert.assertEquals("close", field.getValue()); Assert.assertEquals(false, content.hasNext()); Assert.assertEquals(false, content.endOfContent()); reader = new BufferedReader(new InputStreamReader(content, "UTF-8")); line = reader.readLine(); Assert.assertEquals("Apache Tomcat/6.0.24 - Rapport d'erreur", line.substring(0, line.indexOf("