From 1033614814087bedcfdf4c9c526e30751892eb22 Mon Sep 17 00:00:00 2001 From: Eric van der Vlist Date: Sat, 4 Jun 2011 20:06:00 +0200 Subject: [PATCH] #4 detection of the encoding used in the archives. --- wordpress/plugins/owark/owark.php | 62 ++++++++++++++++++++++++------- 1 file changed, 48 insertions(+), 14 deletions(-) diff --git a/wordpress/plugins/owark/owark.php b/wordpress/plugins/owark/owark.php index 90d9591..e08e463 100644 --- a/wordpress/plugins/owark/owark.php +++ b/wordpress/plugins/owark/owark.php @@ -32,7 +32,7 @@ if (!class_exists("Owark")) { private $broken_links = array(); private $post_id = -1; private $post_type = ""; - private $version = '0.1'; + private $version = '0.2'; private $notices = ""; /** @@ -97,6 +97,7 @@ if (!class_exists("Owark")) { status varchar(20) NOT NULL DEFAULT 'to-archive', arc_date datetime, arc_location text, + encoding varchar(10), PRIMARY KEY(`id`), KEY `url` (`url`(150)) )"; require_once(ABSPATH . 'wp-admin/includes/upgrade.php'); @@ -425,7 +426,9 @@ if (!class_exists("Owark")) { from {$wpdb->prefix}owark AS owark where id = {$id}"; $link = $wpdb->get_row($query); + $wpdb->flush(); + // Find the file to read $blog_title = get_bloginfo('name'); $home_url = home_url(); @@ -434,17 +437,6 @@ if (!class_exists("Owark")) { $loc = '/wp-content/plugins/owark' . substr($link->arc_location, $pos); $arc_loc = home_url() . $loc; - echo ' -'; - - echo ""; - echo '
'; - echo '
'; - echo "This is an Open Web Archive archive of url}\">{$link->url}."; - echo "
This snapshot has been taken on {$link->arc_date} for the website {$blog_title} which contains a link to this page and has saved a copy to be displayed in the page ever disappears."; - echo '
'; - $file_location = '.'. $loc .'/index.html'; if (!file_exists($file_location)) { // If index.html doesn't exist, find another html file! @@ -458,9 +450,51 @@ if (!class_exists("Owark")) { closedir($dir); } + // Read the file + + $f = fopen($file_location, "r"); + $content = fread($f, filesize($file_location)); + fclose($f); + + // Which encoding? + $encoding = $link->encoding; + + if ($encoding == NULL) { + // We need to guess the encoding! + + $matches = NULL; + // + if (preg_match('/]*charset\s*=\s*([^"\'>]+)\s*["\']/si', + $content, &$matches) > 0) { + $encoding = $matches[1]; + } else { + $encoding = mb_detect_encoding($content); + } + + if ($encoding) { + $wpdb->update( + "{$wpdb->prefix}owark", + array('encoding' => $encoding), + array('id' => $id)); + } + } + + header("Content-Type: text/html; charset=$encoding"); + + echo ' +'; + + echo ""; + echo '
'; + echo '
'; + echo "This is an Open Web Archive archive of url}\">{$link->url}."; + echo "
This snapshot has been taken on {$link->arc_date} for the website {$blog_title} which contains a link to this page and has saved a copy to be displayed in the page ever disappears."; + echo '
'; + + $f = fopen($file_location, "r"); - echo fread($f, filesize($file_location)); - fclose($f); + echo $content; echo '
'; }