owark/owark.php

709 lines
20 KiB
PHP

<?php
/* Copyright 2011-2020 Eric van der Vlist (vdv@dyomedea.com)
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2, as
published by the Free Software Foundation.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
Plugin Name: owark
Plugin URI: http://owark.org
Description: Tired of broken links? Archive yours with owark, the Open Web Archive!
Version: 0.2
Author: Eric van der Vlist
Author URI: http://eric.van-der-vlist.com
License: GPL2
*/
/*
Logging helpers
*/
if ( ! function_exists('print_r_log')) {
    /**
     * Write a value to the PHP error log, prefixed with a backtrace location.
     *
     * Arrays and objects are dumped with print_r() in a second log entry;
     * any other value is appended to the location prefix on a single line.
     *
     * The location is read from backtrace frame 1. When that frame does not
     * exist (call made from global scope) frame 0 is used instead, and when a
     * frame carries no 'file' or 'line' key the placeholder '<undefined>' is
     * logged, as log_function_call() does.
     *
     * @param mixed $log Value to log.
     */
    function print_r_log ( $log ) {
        $trace = debug_backtrace();
        // Frame 0 always exists inside a function; frame 1 only when there is an enclosing call.
        $caller_strace = isset($trace[1]) ? $trace[1] : $trace[0];
        $file = isset($caller_strace['file']) ? $caller_strace['file'] : '<undefined>';
        $line = isset($caller_strace['line']) ? $caller_strace['line'] : '<undefined>';
        $prefix = $file . '/#' . $line . ':';
        if ( is_array( $log ) || is_object( $log ) ) {
            error_log( $prefix );
            error_log( print_r( $log, true ) );
        } else {
            error_log( $prefix . ' ' . $log );
        }
    }
}
if ( ! function_exists('log_function_call')) {
    /**
     * Log the function that called this helper, followed by one log entry
     * per argument it received (type and print_r() dump).
     *
     * Everything is read from backtrace frame 1. Missing pieces (no such
     * frame, or a frame without 'file', 'line', 'function' or 'args') are
     * logged as '<undefined>' or skipped instead of raising warnings.
     */
    function log_function_call () {
        $trace = debug_backtrace();
        // No frame 1 when the helper is called from global scope.
        $caller_strace = isset($trace[1]) ? $trace[1] : array();
        $file = isset($caller_strace['file']) ? $caller_strace['file'] : '<undefined>';
        $line = isset($caller_strace['line']) ? $caller_strace['line'] : '<undefined>';
        $function = isset($caller_strace['function']) ? $caller_strace['function'] : '<undefined>';
        error_log($file . '/#' . $line . ' function: ' . $function . '(');
        $args = (isset($caller_strace['args']) && is_array($caller_strace['args'])) ? $caller_strace['args'] : array();
        foreach ($args as $arg) {
            error_log(' * ' . gettype($arg) . ': ' . print_r( $arg, true ));
        }
    }
}
if ( ! function_exists('archives_dir')) {
    /**
     * Filesystem path of the folder that holds the archived pages.
     *
     * @return string WP_CONTENT_DIR/OWARK_ARCHIVES_REL_PATH when that constant
     *                is defined, WP_PLUGIN_DIR/archives otherwise.
     */
    function archives_dir() {
        return defined('OWARK_ARCHIVES_REL_PATH')
            ? WP_CONTENT_DIR . '/' . OWARK_ARCHIVES_REL_PATH
            : WP_PLUGIN_DIR . '/archives';
    }
}
if ( ! function_exists('archives_url')) {
    /**
     * Public URL of the folder that holds the archived pages.
     *
     * @return string WP_CONTENT_URL/OWARK_ARCHIVES_REL_PATH when that constant
     *                is defined, WP_PLUGIN_URL/archives otherwise.
     */
    function archives_url() {
        return defined('OWARK_ARCHIVES_REL_PATH')
            ? WP_CONTENT_URL . '/' . OWARK_ARCHIVES_REL_PATH
            : WP_PLUGIN_URL . '/archives';
    }
}
if (!class_exists("Owark")) {
class Owark {
private $broken_links = array();
private $post_id = -1;
private $post_type = "";
private $version = '0.2';
private $db_version = '0.2';
private $notices = "";
/**
 * Class constructor: registers every hook the plugin relies on and makes
 * sure the hourly archiving event is scheduled.
 *
 * @package owark
 * @since 0.1
 */
public function __construct() {
    // Admin-only hooks: tools menu entry and environment checks.
    if (is_admin()) {
        add_action('admin_menu', array($this, 'owark_admin_menu'));
        add_action('plugins_loaded', array($this, 'sanity_checks'));
    }

    // Rewrite handling, see
    // http://stackoverflow.com/questions/2210826/need-help-with-wp-rewrite-in-a-wordpress-plugin
    // The rules are created through a filter rather than an action:
    // write rules -> add query vars -> recalculate rewrite rules.
    add_filter('rewrite_rules_array', array($this, 'create_rewrite_rules'));
    add_filter('query_vars', array($this, 'add_query_vars'));
    // The rules are recalculated on admin_init only, to save resources.
    // Running this once, or checking $wp_rewrite first, would probably be enough.
    add_filter('admin_init', array($this, 'flush_rewrite_rules'));

    // Front-end hooks: archive display and broken link replacement.
    add_action('template_redirect', array($this, 'template_redirect_intercept'));
    add_filter('the_content', array($this, 'content_filter'), 20);
    add_filter('comment_text', array($this, 'comment_filter'), 20, 2);
    add_filter('get_comment_author_url', array($this, 'comment_author_url_filter'), 20, 1);

    // Archiving job, run through WP-Cron.
    add_action('owark_schedule_event', array('Owark', 'schedule'), 10, 2);
    $cron_args = array('occurrences' => 30, 'version' => $this->version);
    if (!wp_next_scheduled('owark_schedule_event', $cron_args)) {
        wp_schedule_event(time(), 'hourly', 'owark_schedule_event', $cron_args);
    }
}
/**
 * PHP4-style constructor, kept for backward compatibility.
 */
function Owark()
{
// PHP4-style constructor.
// This will NOT be invoked, unless a sub-class that extends `Owark` calls it.
// In that case, call the new-style constructor to keep compatibility.
self::__construct();
}
/**
 * Filesystem path of the archives folder as seen by the class.
 *
 * @return string The OWARK_ARCHIVES_DIR constant when it is defined,
 *                otherwise the "archives" folder next to this file.
 */
function archives_dir() {
    return defined('OWARK_ARCHIVES_DIR')
        ? OWARK_ARCHIVES_DIR
        : __DIR__ . '/archives';
}
/**
 * Check we have everything we need and collect admin notices for whatever
 * is missing.
 *
 * Steps, in order:
 *  - upgrade the data written by db version 0.1, then create/upgrade the
 *    owark table with dbDelta() when the stored db version differs;
 *  - look for the Broken Link Checker plugin;
 *  - make sure the archives folder exists and is writable;
 *  - make sure exec() is available, that /usr/bin/wget runs and that its
 *    version is at least 1.11.
 *
 * The notices are accumulated in $this->notices and shown through the
 * admin_notices hook when there is at least one.
 *
 * @package owark
 * @since 0.1
 */
function sanity_checks(){
    global $wpdb;

    // Install or upgrade tables if needed
    $installed_ver = get_option( "owark_db_version" );
    $table = $wpdb->prefix."owark";
    if ($installed_ver == '0.1') {
        // In version 0.1 final URLs were used, but the broken link checker updates these URLs when a link is detected broken
        $table_links = $wpdb->prefix."blc_links";
        $sql = "update $table as owark join $table_links as links on owark.url = links.final_url COLLATE latin1_swedish_ci set owark.url = links.url COLLATE latin1_swedish_ci";
        $wpdb->query($sql);
        $installed_ver = '0.2';
        update_option( "owark_db_version", $installed_ver );
    }
    if ($installed_ver != $this->db_version) {
        // Log the database version we upgrade to (not the plugin version).
        print_r_log("Database upgrade from $installed_ver to {$this->db_version}");
        $sql = "CREATE TABLE $table (
id int(10) unsigned NOT NULL AUTO_INCREMENT,
url text NOT NULL,
status varchar(20) NOT NULL DEFAULT 'to-archive',
arc_date datetime,
arc_location text,
encoding varchar(10),
PRIMARY KEY(`id`),
KEY `url` (`url`(150)) )";
        require_once(ABSPATH . 'wp-admin/includes/upgrade.php');
        dbDelta($sql);
        update_option( "owark_db_version", $this->db_version );
        $this->notices = "<div class=\"updated fade\"><p><strong>The owark table has been installed or upgraded to version {$this->db_version}</strong></p></div>";
    }

    // Check that the broken link checker is installed
    if (!function_exists('get_plugins'))
        require_once (ABSPATH."wp-admin/includes/plugin.php");
    $blc = 'not-found';
    foreach(get_plugins() as $plugin_file => $plugin_data) {
        if ($plugin_data['Title'] == 'Broken Link Checker') {
            if (is_plugin_active($plugin_file)) {
                $blc = 'active';
            } else {
                $blc = 'inactive';
            }
        }
    }
    if ($blc == 'inactive') {
        $this->notices = $this->notices . "<div class=\"updated fade\"><p><strong>Please activate the Broken Link Checker so that the Open Web Archive can be fully functional.</strong></p></div>";
    } else if ($blc == 'not-found') {
        $this->notices = $this->notices . "<div class=\"error fade\"><p><strong>The Open Web Archive relies on the <a href=\"http://w-shadow.com/blog/2007/08/05/broken-link-checker-for-wordpress/\">Broken Link Checker</a>. Please install this plugin!</strong></p></div>";
    }

    // Check if we have an archive subdirectory
    $archives_dir = archives_dir();
    print_r_log($archives_dir);
    if (!is_dir($archives_dir)) {
        @mkdir($archives_dir);
        if (!is_dir($archives_dir)) {
            $this->notices = $this->notices . "<div class=\"error fade\"><p><strong>The Open Web Archive has not been able to create the folder /archives in its installation directory. Please create it by hand and make it writable for the web server.</strong></p></div>";
        }
    } elseif (! is_writable($archives_dir)) {
        $this->notices = $this->notices . "<div class=\"error fade\"><p><strong>The Open Web Archive needs a writable folder /archives in its installation directory. Please make it writable for the web server.</strong></p></div>";
    }

    // Check that we can execute commands. The disable_functions list is
    // split into names so that only "exec" itself counts: a substring search
    // would also match shell_exec, pcntl_exec, curl_exec...
    $exec_disabled = !function_exists('exec');
    $not_allowed = ini_get('disable_functions');
    if ( $not_allowed ) {
        $disabled_names = array_map('trim', explode(',', strtolower($not_allowed)));
        if ( in_array('exec', $disabled_names, true) ) {
            $exec_disabled = true;
        }
    }

    if ( $exec_disabled ) {
        $this->notices = $this->notices . "<div class=\"error fade\"><p><strong>The Open Web Archives requires that exec() is allowed to run wget and retrieve the pages to archive.</strong></p></div>";
    } else {
        // Check that wget is installed (only attempted when exec() is usable)
        $output = array();
        exec('/usr/bin/wget -V', $output);
        if ( empty($output) ) {
            $this->notices = $this->notices .
                "<div class=\"error fade\"><p><strong>The Open Web Archives is not able to run GNU wget and retrieve the pages to archive. Please check that wget is installed and on the default path.</strong></p></div>";
        } else {
            // We need at least version 1.11. The version number is capture
            // group 1 (index 0 is the whole "GNU Wget x.y" match) and is
            // compared with version_compare() so that 1.9 < 1.11.
            $wget_version = array();
            $parsed = preg_match('/GNU Wget ([0-9\.]+)/', $output[0], $wget_version);
            $version_read = $parsed ? $wget_version[1] : 'unknown';
            if ( !$parsed || version_compare($version_read, '1.11', '<') ) {
                $this->notices = $this->notices . "<div class=\"error fade\"><p><strong>The Open Web Archives needs GNU wget version 1.11 or higher.</strong><br />Version read: {$version_read}</p></div>";
            }
        }
    }

    if ($this->notices != '') {
        add_action('admin_notices', array($this, 'admin_notices'));
    }
}
/**
 * Print the notices collected by the sanity checks.
 *
 * The notices are stored as ready-made HTML and written out verbatim.
 *
 * @package owark
 * @since 0.1
 */
function admin_notices(){
    print $this->notices;
}
/**
 * Register the plugin page in the admin "Tools" menu.
 *
 * The page is available to users with the edit_others_posts capability and
 * is rendered by management_page().
 *
 * @package owark
 * @since 0.1
 */
function owark_admin_menu() {
    $page_title = __('The Open Web Archive', 'owark');
    $menu_title = __('Web Archive', 'owark');
    $renderer = array($this, 'management_page');
    add_management_page($page_title, $menu_title, 'edit_others_posts', 'owark', $renderer);
}
/**
 * Build the public URL of an archive page.
 *
 * @package owark
 * @since 0.1
 *
 * @param mixed $archive_id Identifier appended after "/owark/".
 * @return string The site home URL followed by "/owark/" and the identifier.
 */
function get_archive_url($archive_id) {
    $site_root = home_url();
    return $site_root . '/owark/' . $archive_id;
}
/**
 * Display the admin/tools page: a table of the broken links for which an
 * archived copy exists, each with a link to its archive page.
 *
 * Every value read from the database is escaped before being written to
 * the page, since the URLs originate from post and comment content.
 *
 * @package owark
 * @since 0.1
 */
function management_page() {
    //must check that the user has the required capability
    if (!current_user_can('edit_others_posts')) {
        wp_die( __('You do not have sufficient permissions to access this page.') );
    }
    global $wpdb;

    // screen_icon() is no longer called: WordPress deprecated it in 3.8.
    echo '<div class="wrap">';
    echo '<h2>Owark - The Open Web Archive</h2>';
    echo '<p><em>Tired of broken links? Archive yours with the Open Web Archive!</em></p>';
    echo "</div>";
    echo '<p>List of broken links with archived pages:</p>';

    // NOTE(review): the 0.1 -> 0.2 upgrade rewrites owark.url from
    // blc_links.final_url to blc_links.url; confirm that joining on
    // final_url is still what is wanted here.
    $query = "SELECT owark.id, owark.url, owark.status, owark.arc_date, owark.arc_location, blc_links.status_text
FROM {$wpdb->prefix}owark AS owark, {$wpdb->prefix}blc_links as blc_links
WHERE owark.url = blc_links.final_url COLLATE latin1_swedish_ci and blc_links.broken = 1
ORDER BY owark.url";
    $results = $wpdb->get_results($query);

    echo '<table class="widefat">';
    echo '<thead>';
    echo '<tr>';
    echo '<th>URL</th>';
    echo '<th>Archive</th>';
    echo '</tr>';
    echo '</thead>';
    echo '<tbody>';
    // get_results() may return null when the query fails.
    if (is_array($results)) {
        foreach ($results as $link) {
            $link_href = esc_url($link->url);
            $link_text = esc_html($link->url);
            $archive_href = esc_url($this->get_archive_url($link->id));
            $archive_date = esc_html($link->arc_date);
            echo "<tr>
<td><a href=\"{$link_href}\" target='_blank'>{$link_text}</a></td>
<td><a href=\"{$archive_href}\" target='_blank'>{$archive_date}</a></td>
</tr>";
        }
    }
    echo '</tbody>';
    echo '</table>';
}
/**
 * Prepend the rewrite rule that maps "owark/<something>" to the "owark"
 * query variable used to display archive pages.
 *
 * @package owark
 * @since 0.1
 *
 * @param array $rules Existing rewrite rules.
 * @return array The owark rule followed by the existing rules.
 */
function create_rewrite_rules($rules) {
    global $wp_rewrite;
    $target = 'index.php?owark=' . $wp_rewrite->preg_index(1);
    // Array union: the owark rule comes first and an existing rule with the
    // same pattern does not replace it.
    return array('owark/(.+)' => $target) + $rules;
}
/**
 * Declare the "owark" query variable used to display archive pages.
 *
 * @package owark
 * @since 0.1
 *
 * @param array $qvars Query variable names received from the filter.
 * @return array The same list with 'owark' appended.
 */
function add_query_vars($qvars) {
    $qvars[] = 'owark'; // appended last, existing entries untouched
    return $qvars;
}
/**
 * Ask the global $wp_rewrite object to flush its rewrite rules.
 *
 * @package owark
 * @since 0.1
 */
function flush_rewrite_rules() {
    $GLOBALS['wp_rewrite']->flush_rules();
}
/**
 * Intercept requests for archive pages.
 *
 * When the "owark" query variable is set, the archive is displayed and the
 * request ends there; otherwise nothing happens.
 *
 * @package owark
 * @since 0.1
 */
function template_redirect_intercept() {
    global $wp_query;
    if (!$wp_query->get('owark')) {
        return;
    }
    $this->display_archive($wp_query->get('owark'));
    exit;
}
/**
 * Filter to replace broken links in post content.
 *
 * The content is returned untouched when there is no global $post to tell
 * which post it belongs to.
 *
 * @package owark
 * @since 0.1
 *
 * @param string $content Post content.
 * @return string Content with broken links pointing to their archive.
 */
function content_filter($content) {
    log_function_call();
    global $post;
    // Without a current post there is nothing to look the links up against.
    if (!is_object($post) || !isset($post->ID)) {
        return $content;
    }
    return $this->link_filter($content, $post->ID, $post->post_type);
}
/**
 * Filter to replace broken links in comments.
 *
 * The text is returned untouched when no comment object is supplied.
 *
 * @package owark
 * @since 0.1
 *
 * @param string      $content Comment text.
 * @param object|null $comment Comment the text belongs to; its comment_ID is used for the lookup.
 * @return string Comment text with broken links pointing to their archive.
 */
function comment_filter($content, $comment = null) {
    log_function_call();
    // No comment object, no way to look up its broken links.
    if (!is_object($comment) || !isset($comment->comment_ID)) {
        return $content;
    }
    return $this->link_filter($content, $comment->comment_ID, 'comment');
}
/**
 * Filter to replace a broken comment author URL by its archive URL.
 *
 * The URL is looked up among the archived URLs that belong to a link
 * flagged as broken; the first match wins.
 *
 * @package owark
 * @since 0.2
 *
 * @param string $url Author URL.
 * @return string Archive URL when a matching row is found, $url otherwise.
 */
function comment_author_url_filter($url) {
    log_function_call();
    global $wpdb;
    $sql = "
SELECT owark.id
FROM {$wpdb->prefix}blc_instances AS instances,
{$wpdb->prefix}blc_links AS links,
{$wpdb->prefix}owark AS owark
WHERE
instances.link_id = links.link_id
AND owark.url = %s
AND owark.url = instances.raw_url
AND links.broken = 1
";
    print_r_log($sql);
    $rows = $wpdb->get_results($wpdb->prepare($sql, $url));
    print_r_log($rows);
    if (!empty($rows)) {
        return $this->get_archive_url($rows[0]->id);
    }
    return $url;
}
/**
 * Generic filter to replace broken links in content.
 *
 * The broken links of a container (post or comment) are loaded once and
 * kept in $this->broken_links until a different container is filtered.
 * The original content is returned when nothing is broken or when the
 * regular expression replacement fails.
 *
 * @package owark
 * @since 0.1
 *
 * @param string $content   HTML to filter.
 * @param mixed  $post_id   Container id, matched against blc_instances.container_id.
 * @param string $post_type Container type, matched against blc_instances.container_type.
 * @return string Filtered content.
 */
function link_filter($content, $post_id, $post_type) {
    global $wpdb;
    log_function_call();
    // Reload the broken links only when the container changes; the same
    // container can be filtered more than once.
    if ($this->post_id != $post_id || $this->post_type != $post_type) {
        $this->post_id = $post_id;
        $this->post_type = $post_type;
        //Retrieve info about all occurrences of broken links in the current container
        $q = "
SELECT instances.raw_url, owark.id
FROM {$wpdb->prefix}blc_instances AS instances,
{$wpdb->prefix}blc_links AS links,
{$wpdb->prefix}owark AS owark
WHERE
instances.link_id = links.link_id
AND owark.url = instances.raw_url
AND instances.container_id = %s
AND instances.container_type = %s
AND links.broken = 1
";
        print_r_log($q);
        $q = $wpdb->prepare($q, $this->post_id, $this->post_type);
        $results = $wpdb->get_results($q);
        print_r_log($results);
        $this->broken_links = array();
        // get_results() may return null when the query fails.
        if (is_array($results)) {
            foreach ($results as $link) {
                $this->broken_links[$link->raw_url] = $link->id;
            }
        }
    }
    if (empty($this->broken_links)) {
        return $content;
    }
    // Regexp : see http://stackoverflow.com/questions/2609095/hooking-into-comment-text-to-add-surrounding-tag
    $replaced = preg_replace_callback('/(<a.*?href\s*=\s*["\'])([^"\'>]+)(["\'][^>]*>.*?<\/a>)/si', array( $this, 'replace_a_link'), $content);
    if ($replaced === null) {
        // preg_replace_callback() returns NULL on a PCRE error; returning it
        // would blank the content, so fall back to the unfiltered text.
        print_r_log('link_filter: preg_replace_callback failed, PCRE error code ' . preg_last_error());
        return $content;
    }
    print_r_log("replaced: $replaced");
    return $replaced;
}
/**
 * Callback used by link_filter() for each matched anchor.
 *
 * @package owark
 * @since 0.1
 *
 * @param array $matches Regex groups: [0] whole anchor, [1] text up to the
 *                       opening quote of href, [2] the href value, [3] the rest.
 * @return string The anchor with its href swapped for the archive URL when
 *                the href is a known broken link, the anchor as matched otherwise.
 */
function replace_a_link($matches) {
    log_function_call();
    $href = $matches[2];
    if (!array_key_exists($href, $this->broken_links)) {
        return $matches[0];
    }
    $archive_url = $this->get_archive_url($this->broken_links[$href]);
    return $matches[1] . $archive_url . $matches[3];
}
/**
 * Display an archive page.
 *
 * Loads the owark row for the given id, locates the archived HTML file,
 * works out its character encoding (stored value, <meta> declaration or
 * mb_detect_encoding(), in that order; a guessed value is saved back to the
 * row) and prints the file below a banner describing the snapshot.
 *
 * A 404 status with "Archive not found" is sent when no row has this id.
 *
 * NOTE(review): the archived HTML is echoed as-is, scripts included;
 * confirm that serving it from the site's own origin is acceptable.
 *
 * @package owark
 * @since 0.1
 *
 * @param mixed $parameter Archive id taken from the request; cast with intval().
 */
function display_archive($parameter) {
    print_r_log("display_archive($parameter)");
    global $wpdb;
    $id = intval($parameter);
    $blog_title = get_bloginfo('name');
    $home_url = home_url();
    $query = "SELECT *
from {$wpdb->prefix}owark AS owark
where id = {$id}";
    $link = $wpdb->get_row($query);
    $wpdb->flush();

    // Unknown id: nothing to display.
    if (!$link) {
        status_header(404);
        echo 'Archive not found';
        return;
    }

    $loc = archives_dir() . '/' . $link->arc_location;
    $arc_base = archives_url() . '/' . $link->arc_location;

    // The file name is either index.html or guessed from the URL
    $url = (string) $link->url;
    if (substr($url, -1) === '/') {
        $file_location = $loc .'/index.html';
    } else {
        // Last path segment of the URL, with the .html suffix wget -E adds.
        $parts = explode('/', $url);
        $file_location = $loc . '/' . $parts[count($parts) - 1] . '.html';
    }
    if (!file_exists($file_location) && is_dir($loc)) {
        // If the guessed file doesn't exist, find another html file!
        $dir = opendir($loc);
        if ($dir) {
            while (false !== ($file = readdir($dir))) {
                if ('.html' === substr($file, -5)) {
                    $file_location = $loc.'/' . $file;
                    break;
                }
            }
            closedir($dir);
        }
    }
    print_r_log("file_location: $file_location");

    // Read the file
    $content = is_file($file_location) ? file_get_contents($file_location) : false;
    if ($content === false) {
        $content = 'Archive not found';
    }

    // Which encoding?
    $stored_encoding = $link->encoding;
    $encoding = $stored_encoding;
    if ($encoding == NULL) {
        // We need to guess the encoding!
        $matches = NULL;
        // <meta http-equiv="Content-Type" content="text/xml; charset=iso-8859-1"/>
        if (preg_match('/<meta\s*http-equiv\s*=\s*["\']Content-Type["\']\s+content\s*=\s*["\'][^"\'>]*charset\s*=\s*([^"\'>]+)\s*["\']/si',
            $content, $matches) > 0) {
            $encoding = $matches[1];
        } else {
            $encoding = mb_detect_encoding($content);
        }
    }
    // The charset ends up in an HTTP header and in markup: keep only the
    // characters a charset name can contain.
    $encoding = preg_replace('/[^A-Za-z0-9._:\-]/', '', (string) $encoding);
    if ($encoding === '') {
        // Nothing usable was stored or detected.
        $encoding = 'UTF-8';
    } elseif ($stored_encoding == NULL) {
        // Remember the guess for the next request.
        $wpdb->update(
            "{$wpdb->prefix}owark",
            array('encoding' => $encoding),
            array('id' => $id));
    }

    // Values coming from the database or the site settings are escaped.
    $page_href = esc_url($link->url);
    $page_text = esc_html($link->url);
    $snapshot_date = esc_html($link->arc_date);
    $home_href = esc_url($home_url);
    $blog_text = esc_html($blog_title);
    $base_href = esc_url($arc_base . '/');

    header("Content-Type: text/html; charset=$encoding");
    echo '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<meta http-equiv="Content-Type" content="text/html; charset='.$encoding.'">';
    echo "<base href=\"{$base_href}\">";
    echo '<div style="background:#fff;border:1px solid #999;margin:-1px -1px 0;padding:0;">';
    echo '<div style="background:#ddd;border:1px solid #999;color:#000;font:13px arial,sans-serif;font-weight:normal;margin:12px;padding:8px;text-align:left">';
    echo "This is an <a href='http://owark.org'>Open Web Archive</a> archive of <a href=\"{$page_href}\">{$page_text}</a>.";
    echo "<br />This snapshot has been taken on {$snapshot_date} for the website <a href=\"{$home_href}\">{$blog_text}</a> which contains a link to this page and has saved a copy to be displayed if the page ever disappears.";
    echo '</div></div><div style="position:relative">';
    echo $content;
    echo '</div>';
}
/**
 * Check if we've got something to archive and, if so, archive one URL.
 *
 * One run picks a single checked, non-broken link that has no row in the
 * owark table yet, downloads it with wget into a dated folder below the
 * archives directory and records the result. While $occurrences is
 * positive, a follow-up run is scheduled 90 seconds later with the counter
 * decremented.
 *
 * The 'owark_archiving' option acts as a lock. It holds the time the lock
 * was taken and is ignored once older than an hour, so a run that died
 * before releasing it does not block archiving for ever.
 *
 * @package owark
 * @since 0.1
 *
 * @param int    $occurrences Number of follow-up runs still allowed.
 * @param string $version     Plugin version, sent in the wget user agent.
 */
public static function schedule($occurrences, $version) {
    log_function_call();

    // Lock handling. A value written by a previous version (boolean true)
    // casts to 1 and is therefore treated as stale.
    $lock_ttl = 3600;
    $archiving = get_option( 'owark_archiving', false);
    if ($archiving && (time() - intval($archiving)) < $lock_ttl) {
        return;
    }
    update_option('owark_archiving', time());

    global $wpdb;
    // NOTE(review): "url" in the WHERE clause is not the select alias and
    // presumably resolves to blc_links.url, while the value stored below is
    // instances.raw_url; confirm a link whose two values differ is not
    // archived again on every run.
    $query = "
SELECT DISTINCT instances.raw_url as url
from {$wpdb->prefix}blc_links as links,
{$wpdb->prefix}blc_instances AS instances
WHERE url NOT IN (SELECT url FROM {$wpdb->prefix}owark)
AND broken=0
AND last_check is not null
AND instances.link_id = links.link_id";
    print_r_log("query: $query");
    $url = $wpdb->get_row($query);
    print_r_log($url);
    $wpdb->flush();

    if ($url != NULL) {
        $date = date('c');

        // Folder derived from the URL. "." and ".." segments are dropped so
        // that a crafted URL cannot move the download outside the archives
        // directory.
        $encoded = str_replace('%2F', '/', urlencode(preg_replace('/https?:\/\//', '', $url->url)));
        $segments = array();
        foreach (explode('/', $encoded) as $segment) {
            if ($segment !== '.' && $segment !== '..') {
                $segments[] = $segment;
            }
        }
        $relpath = implode('/', $segments) . '/' . $date;
        $path = archives_dir()."/$relpath";

        // The URL comes from post and comment content: every variable part
        // of the command line is quoted with escapeshellarg(), and "--"
        // stops a URL starting with "-" from being read as an option.
        // NOTE(review): escapeshellarg() may drop non-ASCII bytes depending
        // on the locale; verify with internationalised URLs.
        $user_agent = "Mozilla/5.0 (compatible; owark/$version; http://owark.org/)";
        $command = 'wget -t3 -E -H -k -K -p -nd -nv --timeout=60'
            . ' --user-agent=' . escapeshellarg($user_agent)
            . ' -P ' . escapeshellarg($path)
            . ' -- ' . escapeshellarg($url->url)
            . ' 2>&1';
        $output = array();
        $status = 0;
        exec($command, $output, $status);
        print_r_log("wget status: $status");
        if ($status != 0) {
            print_r_log("wget status: $status, output:");
            print_r_log($output);
        }

        // The row is written whatever the wget status, which keeps the URL
        // from being selected again by the query above.
        $wpdb->insert("{$wpdb->prefix}owark", array(
            'url' => $url->url,
            'status' => $status,
            'arc_date' => $date,
            'arc_location' => $relpath));

        if ($occurrences > 0) {
            wp_schedule_single_event(time() + 90, 'owark_schedule_event', array('occurrences' => $occurrences - 1, 'version' => $version));
        }
    }
    delete_option('owark_archiving');
}
}
}
// Create the plugin instance; its constructor registers all the hooks.
if (class_exists('Owark')) {
    $owark = new Owark;
}
?>