
<?php
/*  Copyright 2011-2020 Eric van der Vlist (vdv@dyomedea.com)

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License, version 2, as
    published by the Free Software Foundation.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/

/*
Plugin Name: owark
Plugin URI: http://owark.org
Description: Tired of broken links? Archive yours with owark, the Open Web Archive!
Version: 0.2
Author: Eric van der Vlist
Author URI: http://eric.van-der-vlist.com
License: GPL2
*/

if (!class_exists("Owark")) {
class Owark {
private $broken_links = array();
private $post_id = -1;
private $post_type = "";
private $version = '0.2';
private $notices = "";
/**
 * Class constructor: registers every WordPress hook the plugin relies on.
 *
 * Declared as __construct() because a method merely named after its class
 * is no longer treated as a constructor by PHP 8; `new Owark()` keeps working.
 *
 * @package owark
 * @since 0.1
 */
function __construct() {
    if (is_admin()) {
        add_action('admin_menu', array($this, 'owark_admin_menu'));
        add_action('plugins_loaded', array($this, 'sanity_checks'));
    }

    // See http://stackoverflow.com/questions/2210826/need-help-with-wp-rewrite-in-a-wordpress-plugin
    // Using a filter instead of an action to create the rewrite rules.
    // Write rules -> Add query vars -> Recalculate rewrite rules
    add_filter('rewrite_rules_array', array($this, 'create_rewrite_rules'));
    add_filter('query_vars', array($this, 'add_query_vars'));

    // Recalculates rewrite rules during admin init to save resources.
    // Could probably run it once as long as it isn't going to change or check the
    // $wp_rewrite rules to see if it's active.
    add_filter('admin_init', array($this, 'flush_rewrite_rules'));

    add_action('template_redirect', array($this, 'template_redirect_intercept'));

    add_filter('the_content', array($this, 'content_filter'));
    add_filter('comment_text', array($this, 'comment_filter'));
    add_filter('get_comment_author_link', array($this, 'comment_filter'));

    // The archiving job is a static callback; make sure the hourly event exists.
    add_action('owark_schedule_event', array('Owark', 'schedule'));
    if (!wp_next_scheduled('owark_schedule_event', array('occurrences' => 30))) {
        wp_schedule_event(time(), 'hourly', 'owark_schedule_event', array('occurrences' => 30));
    }
}

/**
 * Check we have everything we need...
 *
 * Installs or upgrades the owark table, then verifies that the Broken Link
 * Checker plugin is active, that the archives folder exists, that exec() is
 * usable and that GNU wget >= 1.11 is available. Every problem found is
 * appended to $this->notices, which is shown through the admin_notices hook.
 *
 * @package owark
 * @since 0.1
 */
function sanity_checks(){
    // Install or upgrade tables if needed
    $installed_ver = get_option( "owark_db_version" );
    if ($installed_ver != $this->version) {
        global $wpdb;
        $table = $wpdb->prefix."owark";
        // dbDelta() wants exactly two spaces between PRIMARY KEY and its column list.
        $sql = "CREATE TABLE $table (
            id int(10) unsigned NOT NULL AUTO_INCREMENT,
            url text NOT NULL,
            status varchar(20) NOT NULL DEFAULT 'to-archive',
            arc_date datetime,
            arc_location text,
            encoding varchar(10),
            PRIMARY KEY  (id),
            KEY `url` (`url`(150)) )";
        require_once(ABSPATH . 'wp-admin/includes/upgrade.php');
        // Actually create/alter the table; without this call the SQL above is never run.
        dbDelta($sql);
        update_option( "owark_db_version", $this->version );
        $this->notices = "<div class=\"updated fade\"><p><strong>The owark table has been installed or upgraded to version {$this->version}</strong></p></div>";
    }

    // Check that the broken link checker is installed
    if (!function_exists('get_plugins')) {
        require_once (ABSPATH."wp-admin/includes/plugin.php");
    }
    $blc = 'not-found';
    foreach (get_plugins() as $plugin_file => $plugin_data) {
        if ($plugin_data['Title'] == 'Broken Link Checker') {
            if (is_plugin_active($plugin_file)) {
                $blc = 'active';
            } else {
                $blc = 'inactive';
            }
        }
    }
    if ($blc == 'inactive') {
        $this->notices = $this->notices . "<div class=\"updated fade\"><p><strong>Please activate the Broken Link Checker so that the Open Web Archive can be fully functional.</strong></p></div>";
    } else if ($blc == 'not-found') {
        $this->notices = $this->notices . "<div class=\"error fade\"><p><strong>The Open Web Archive relies on the <a href=\"http://w-shadow.com/blog/2007/08/05/broken-link-checker-for-wordpress/\">Broken Link Checker</a>. Please install this plugin!</strong></p></div>";
    }

    // Check if we have an archive subdirectory
    if (!is_dir(dirname(__FILE__) . '/archives')) {
        @mkdir(dirname(__FILE__) . '/archives');
        if (!is_dir(dirname(__FILE__) . '/archives')) {
            $this->notices = $this->notices . "<div class=\"error fade\"><p><strong>The Open Web Archive has not been able to create the folder /archives in its installation directory. Please create it by hand and make it writable for the web server.</strong></p></div>";
        }
    }

    // Check that we can execute commands. Compare exact function names so that
    // shell_exec, curl_exec, ... in disable_functions do not trigger a false alarm.
    $disabled = array_map('trim', explode(',', strtolower((string) ini_get('disable_functions'))));
    $exec_allowed = function_exists('exec') && !in_array('exec', $disabled, true);

    if (!$exec_allowed) {
        $this->notices = $this->notices . "<div class=\"error fade\"><p><strong>The Open Web Archives requires that exec() is allowed to run wget and retrieve the pages to archive.</strong></p></div>";
    } else {
        // Check that wget is installed (only attempted when exec() is callable)
        $output = array();
        exec('/usr/bin/wget -V', $output);

        if ( empty($output) ) {
            $this->notices = $this->notices .
                "<div class=\"error fade\"><p><strong>The Open Web Archives is not able to run GNU wget and retrieve the pages to archive. Please check that wget is installed and on the default path.</strong></p></div>";
        } else {
            // We need at least version 1.11 or higher.
            // Index 1 is the captured version number; index 0 is the whole match.
            $wget_version = array();
            $matched = preg_match('/GNU Wget ([0-9\.]+) /', $output[0], $wget_version) === 1;
            if ( !$matched || version_compare($wget_version[1], '1.11', '<') ) {
                $version_read = $matched ? $wget_version[1] : esc_html($output[0]);
                $this->notices = $this->notices . "<div class=\"error fade\"><p><strong>The Open Web Archives needs GNU wget version 1.11 or higher.</strong><br />Version read: {$version_read}</p></div>";
            }
        }
    }

    if ($this->notices != '') {
        add_action('admin_notices', array($this, 'admin_notices'));
    }
}

/**
 * Show admin notices
 *
 * Prints the HTML accumulated in $this->notices by sanity_checks().
 *
 * @package owark
 * @since 0.1
 */
function admin_notices(){
    echo $this->notices;
}

/**
 * Admin menus
 *
 * Adds the "Web Archive" entry to the Tools menu; the page is rendered by
 * management_page() and requires the edit_others_posts capability.
 *
 * @package owark
 * @since 0.1
 */
function owark_admin_menu() {
    add_management_page(__('The Open Web Archive', 'owark'), __('Web Archive', 'owark'), 'edit_others_posts', 'owark', array($this, 'management_page'));
}

/**
 * URL of an archive page
 *
 * @param int|string $archive_id Identifier of the row in the owark table.
 * @return string The site home URL followed by /owark/<id>.
 *
 * @package owark
 * @since 0.1
 */
function get_archive_url($archive_id) {
    return home_url().'/owark/'.$archive_id;
}

/**
 * Display the admin/tools page.
 *
 * Lists every link flagged as broken by the Broken Link Checker for which an
 * archive exists, next to a link to the archived copy.
 *
 * @package owark
 * @since 0.1
 */
function management_page() {
    //must check that the user has the required capability
    if (!current_user_can('edit_others_posts')) {
        wp_die( __('You do not have sufficient permissions to access this page.') );
    }

    global $wpdb;

    echo '<div class="wrap">';
    echo '<h2>Owark - The Open Web Archive</h2>';
    echo '<p><em>Tired of broken links? Archive yours with the Open Web Archive!</em></p>';
    echo "</div>";

    echo '<p>List of broken links with archived pages:</p>';

    $query = "SELECT owark.id, owark.url, owark.status, owark.arc_date, owark.arc_location, blc_links.status_text
        FROM {$wpdb->prefix}owark AS owark, {$wpdb->prefix}blc_links as blc_links
        WHERE owark.url = blc_links.final_url COLLATE latin1_swedish_ci and blc_links.broken = 1
        ORDER BY owark.url";
    $results = $wpdb->get_results($query);

    echo '<table class="widefat">';
    echo '<thead>';
    echo '<tr>';
    echo '<th>URL</th>';
    echo '<th>Archive</th>';
    echo '</tr>';
    echo '</thead>';
    echo '<tbody>';

    // get_results() may yield null on failure; the cast keeps the loop safe.
    foreach ((array) $results as $link) {
        $archive_url = $this->get_archive_url($link->id);
        // URLs were harvested from third-party content: escape them for HTML output.
        $href        = esc_url($link->url);
        $label       = esc_html($link->url);
        $archive     = esc_url($archive_url);
        $archived_on = esc_html($link->arc_date);
        echo "<tr>
            <td><a href=\"{$href}\" target='_blank'>{$label}</a></td>
            <td><a href=\"{$archive}\" target='_blank'>{$archived_on}</a></td>
        </tr>";
    }

    echo '</tbody>';
    echo '</table>';
}

/**
 * Add a rewrite rule to display archive pages
 *
 * @param array $rules Existing rewrite rules (pattern => query string).
 * @return array The rules with the owark/(.+) rule placed first.
 *
 * @package owark
 * @since 0.1
 */
function create_rewrite_rules($rules) {
    global $wp_rewrite;
    $newRule = array('owark/(.+)' => 'index.php?owark='.$wp_rewrite->preg_index(1));
    // Array union keeps the left operand's entries, so the owark rule comes first.
    $newRules = $newRule + $rules;
    return $newRules;
}

/**
 * Add a query variable used to display archive pages
 *
 * @param array $qvars Query variables already registered.
 * @return array The same list with 'owark' appended.
 *
 * @package owark
 * @since 0.1
 */
function add_query_vars($qvars) {
    $qvars[] = 'owark';
    return $qvars;
}

/**
 * Title says it all ;) ...
 *
 * Asks WordPress to regenerate its rewrite rules so that the owark rule
 * added by create_rewrite_rules() is taken into account.
 *
 * @package owark
 * @since 0.1
 */
function flush_rewrite_rules() {
    global $wp_rewrite;
    $wp_rewrite->flush_rules();
}

/**
 * Intercepts archive pages.
 *
 * When the request carries the 'owark' query variable, the archive is
 * rendered and the request ends there instead of loading a theme template.
 *
 * @package owark
 * @since 0.1
 */
function template_redirect_intercept() {
    global $wp_query;
    if ($wp_query->get('owark')) {
        $this->display_archive($wp_query->get('owark'));
        exit;
    }
}

/**
 * Filter to replace broken links in post content.
 *
 * @param string $content The post content being filtered.
 * @return string The content with broken links pointing to their archive.
 *
 * @package owark
 * @since 0.1
 */
function content_filter($content) {
    global $post;
    // the_content can be applied outside of the loop, where no post is set.
    if (!is_object($post)) {
        return $content;
    }
    return $this->link_filter($content, $post->ID, $post->post_type);
}

/**
 * Filter to replace broken links in comments.
 *
 * Hooked on both comment_text and get_comment_author_link.
 *
 * @param string $content The comment text or author link being filtered.
 * @return string The content with broken links pointing to their archive.
 *
 * @package owark
 * @since 0.1
 */
function comment_filter($content) {
    return $this->link_filter($content, get_comment_ID(), 'comment');
}

/**
 * Generic filter to replace broken links in content.
 *
 * @param string     $content   HTML to filter.
 * @param int|string $post_id   Identifier of the container (post or comment).
 * @param string     $post_type Container type as recorded by the Broken Link Checker.
 * @return string The content with archived broken links rewritten.
 *
 * @package owark
 * @since 0.1
 */
function link_filter($content, $post_id, $post_type) {
    global $wpdb;

    // See if we haven't already loaded the broken links for this post...
    // which happens for comments (they have links to check in 2 different filters)
    if ($this->post_id != $post_id || $this->post_type != $post_type) {
        $this->post_id = $post_id;
        $this->post_type = $post_type;

        //Retrieve info about all occurrences of broken links in the current post
        $q = "
            SELECT instances.raw_url, owark.id
            FROM {$wpdb->prefix}blc_instances AS instances,
                 {$wpdb->prefix}blc_links AS links,
                 {$wpdb->prefix}owark AS owark
            WHERE
                instances.link_id = links.link_id
                AND owark.url = links.final_url COLLATE latin1_swedish_ci
                AND instances.container_id = %s
                AND instances.container_type = %s
                AND links.broken = 1
        ";
        $q = $wpdb->prepare($q, $this->post_id, $this->post_type);
        $results = $wpdb->get_results($q);

        // Map each raw URL found in the content to the id of its archive.
        $this->broken_links = array();
        foreach ((array) $results as $link) {
            $this->broken_links[$link->raw_url] = $link->id;
        }
    }

    if (empty($this->broken_links)) {
        return $content;
    }

    // Regexp : see http://stackoverflow.com/questions/2609095/hooking-into-comment-text-to-add-surrounding-tag
    return preg_replace_callback('/(<a.*?href\s*=\s*["\'])([^"\'>]+)(["\'][^>]*>.*?<\/a>)/si', array( $this, 'replace_a_link'), $content);
}

/**
 * Replace a link.
 *
 * Callback for preg_replace_callback() in link_filter().
 *
 * @param array $matches [0] whole anchor, [1] text up to and including the
 *                       opening quote of href, [2] the href value,
 *                       [3] the remainder of the anchor.
 * @return string The anchor, with its href swapped for the archive URL when
 *                the href is a known broken link.
 *
 * @package owark
 * @since 0.1
 */
function replace_a_link($matches) {
    if (array_key_exists($matches[2], $this->broken_links)) {
        return $matches[1].$this->get_archive_url($this->broken_links[$matches[2]]).$matches[3];
    } else {
        return $matches[0];
    }
}

/**
 * Display an archive page
 *
 * Looks up the archive row, locates the saved HTML file, works out its
 * character encoding (caching the guess in the table) and prints the file
 * below a banner describing where the snapshot comes from.
 *
 * @param int|string $parameter Archive identifier taken from the request;
 *                              it is cast to an integer before being used.
 * @return void
 *
 * @package owark
 * @since 0.1
 */
function display_archive($parameter) {
    global $wpdb;

    $id = intval($parameter);
    $query = "SELECT *
        from {$wpdb->prefix}owark AS owark
        where id = {$id}";
    $link = $wpdb->get_row($query);

    // Unknown identifier: nothing to display.
    if (!$link) {
        status_header(404);
        echo 'Archive not found';
        return;
    }

    $blog_title = get_bloginfo('name');
    $home_url = home_url();

    // Find the file to read
    $loc = "";
    $archive_dir = "";
    if (($pos = strpos((string) $link->arc_location, '/archives')) !== FALSE) {
        $relative = substr($link->arc_location, $pos);
        // NOTE(review): the public URL assumes the plugin lives in wp-content/plugins/owark.
        $loc = '/wp-content/plugins/owark' . $relative;
        // schedule() stores archives below this file's directory, so read them from there.
        $archive_dir = dirname(__FILE__) . $relative;
    }
    $arc_loc = home_url() . $loc;

    $file_location = "";
    if ($archive_dir !== "") {
        // The file name is either index.html or guessed from the archived URL
        $archived_url = (string) $link->url;
        if (substr($archived_url, -1) == '/') {
            $file_location = $archive_dir . '/index.html';
        } else {
            $parts = explode('/', $archived_url);
            $file_location = $archive_dir . '/' . $parts[count($parts) - 1] . '.html';
        }

        if (!is_file($file_location)) {
            // If the guessed file doesn't exist, find another html file!
            $dir = is_dir($archive_dir) ? opendir($archive_dir) : false;
            if ($dir) {
                while (false !== ($file = readdir($dir))) {
                    if ('.html' === substr($file, -5)) {
                        $file_location = $archive_dir . '/' . $file;
                    }
                }
                closedir($dir);
            }
        }
    }

    // Read the file
    $content = false;
    if ($file_location !== "" && is_file($file_location)) {
        $content = file_get_contents($file_location);
    }
    if ($content === false) {
        $content = 'Archive not found';
    }

    // Which encoding?
    $encoding = $link->encoding;
    if ($encoding == NULL) {
        // We need to guess the encoding!
        $matches = NULL;
        // <meta http-equiv="Content-Type" content="text/xml; charset=iso-8859-1"/>
        if (preg_match('/<meta\s*http-equiv\s*=\s*["\']Content-Type["\']\s+content\s*=\s*["\'][^"\'>]*charset\s*=\s*([^"\'>]+)\s*["\']/si',
                $content, $matches) > 0) {
            $encoding = $matches[1];
        } else {
            $encoding = mb_detect_encoding($content);
        }
        // The value comes from the archived (untrusted) page and ends up in an
        // HTTP header: keep only characters that are legal in a charset name.
        $encoding = preg_replace('/[^A-Za-z0-9._:+-]/', '', (string) $encoding);
        if ($encoding) {
            $wpdb->update(
                "{$wpdb->prefix}owark",
                array('encoding' => $encoding),
                array('id' => $id));
        }
    } else {
        $encoding = preg_replace('/[^A-Za-z0-9._:+-]/', '', (string) $encoding);
    }
    if (!$encoding) {
        // Detection failed: fall back to a sane default rather than an empty charset.
        $encoding = 'UTF-8';
    }

    // Everything below is emitted into HTML: escape the values we do not control.
    $base_href = esc_attr($arc_loc);
    $link_href = esc_url($link->url);
    $link_text = esc_html($link->url);
    $arc_date  = esc_html($link->arc_date);
    $home_href = esc_url($home_url);
    $blog_name = esc_html($blog_title);

    header("Content-Type: text/html; charset=$encoding");
    echo '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<meta http-equiv="Content-Type" content="text/html; charset='.$encoding.'">';
    echo "<base href=\"{$base_href}/\">";
    echo '<div style="background:#fff;border:1px solid #999;margin:-1px -1px 0;padding:0;">';
    echo '<div style="background:#ddd;border:1px solid #999;color:#000;font:13px arial,sans-serif;font-weight:normal;margin:12px;padding:8px;text-align:left">';
    echo "This is an <a href='http://owark.org'>Open Web Archive</a> archive of <a href=\"{$link_href}\">{$link_text}</a>.";
    echo "<br />This snapshot has been taken on {$arc_date} for the website <a href=\"{$home_href}\">{$blog_name}</a> which contains a link to this page and has saved a copy to be displayed in the page ever disappears.";
    echo '</div></div><div style="position:relative">';

    // The archived page itself is output verbatim.
    echo $content;

    echo '</div>';
}

/**
 * Check if we've got something to archive
 *
 * Picks one working link known to the Broken Link Checker that has no
 * archive yet, downloads it with wget and records the result in the owark
 * table. The 'owark_archiving' option is used as a flag so that two runs
 * do not overlap.
 *
 * @param int $occurrences Number of follow-up runs that may still be scheduled.
 * @return void
 *
 * @package owark
 * @since 0.1
 */
public static function schedule($occurrences) {
    $archiving = get_option( 'owark_archiving', false);
    if (! $archiving) {
        update_option('owark_archiving', true);
    } else {
        // Another run is in progress.
        return;
    }

    global $wpdb;

    $query = "SELECT DISTINCT final_url from {$wpdb->prefix}blc_links
        WHERE final_url NOT IN (SELECT url COLLATE latin1_swedish_ci FROM {$wpdb->prefix}owark)
        AND broken=0
        AND final_url!=''";
    $url = $wpdb->get_row($query);

    if ($url != NULL) {
        $date = date('c');
        $relpath = '/archives/'. str_replace('%2F', '/', urlencode(preg_replace('/https?:\/\//', '', $url->final_url))) . '/' . $date;
        $path = dirname(__FILE__).$relpath;
        //mkdir($path, $recursive=true);

        $output = array();
        $status = 0;
        // The URL originates from post/comment content: both it and the target
        // path must be quoted before they reach the shell.
        $command = 'wget -t3 -E -H -k -K -p -nd -nv --timeout=60'
            . ' --user-agent="Mozilla/5.0 (compatible; owark/0.1; http://owark.org/)"'
            . ' -P ' . escapeshellarg($path)
            . ' ' . escapeshellarg($url->final_url);
        exec($command, $output, $status);

        $q = $wpdb->insert("{$wpdb->prefix}owark", array(
            'url' => $url->final_url,
            'status' => $status,
            'arc_date' => $date,
            'arc_location' => $relpath));
    }

    // Release the flag, otherwise every later run would return immediately.
    update_option('owark_archiving', false);

    // Chain another run while links were found and the budget is not exhausted.
    if ($url != NULL && $occurrences > 0) {
        wp_schedule_single_event(time() + 90, 'owark_schedule_event', array('occurrences' => $occurrences - 1));
    }
}

} // end of class Owark
} // end of if (!class_exists("Owark"))

// Instantiate the plugin so that its hooks get registered.
if (class_exists("Owark")) {
    $owark = new Owark();
}