<?php

/**
 * Seymour - A feed reading library for PHP
 *
 * <b>feed_reader.php</b>
 *
 * Defines the feedReader class.
 *
 * <b>LICENSE:</b>
 *
 * This file is part of Seymour.
 *
 * Seymour is free software; you can redistribute it and/or modify it under the
 * terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 2.1 of the License, or (at your option)
 * any later version.
 *
 * Seymour is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
 * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
 * details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Seymour; if not, write to the Free Software Foundation, Inc., 51
 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * Included packages such as those from the PEAR library and Simpletest are
 * copyright their respective authors and are subject to their own licenses as
 * specified in their source files.
 *
 * @package		Seymour
 * @subpackage	FeedReader
 * @author		Tom Walter <evilpuppet@users.sourceforge.net>
 * @copyright	Copyright (C) 2005 Tom Walter
 * @license		http://www.gnu.org/licenses/lgpl.txt GNU Lesser General Public
 * License
 * @version		CVS: $Id: feed_reader.php,v 1.1 2005/11/06 12:26:29 evilpuppet Exp $
 * @link		http://seymour.sourceforge.net/
 */


/**
 * Directory containing required PEAR modules.
 *
 * The included pear packages are modified to use this constant as a root path
 * for all includes, since the standard versions expect them to be available in
 * the include_path, but they may not be. Also this avoids conflicts with the
 * standard versions of the packages if they ~do~ exist in the PEAR include
 * path.
 *
 * The constant is only defined here when it does not exist yet, so a value
 * defined earlier by the including application is kept without raising an
 * "already defined" notice.
 */
if (!defined('PEAR_ROOT')) {
	define('PEAR_ROOT', dirname(__FILE__).'/pear/');
}

/**#@+
 * Required PEAR package
 */

/**
 * @see	HTTP_Request
 */
require_once (PEAR_ROOT.'HTTP/Request.php');

/**
 * @see	HTTP_Header
 */
require_once (PEAR_ROOT.'HTTP/Header.php');

/**
 * @see	XML_Unserializer
 */
require_once (PEAR_ROOT.'XML/Unserializer.php');
/**#@-*/

/**
 * Object definitions for XML_Unserializer to map feeds to.
 *
 * Loaded from this file's own directory rather than through the include_path,
 * so an unrelated feed_mapping.php elsewhere on the include_path cannot be
 * picked up by mistake.
 */
require_once (dirname(__FILE__).'/feed_mapping.php');


/**
 * Feed Reader class
 *
 * Fetches a feed from a URL and parses it into an object structure. This is the
 * main object you'll need to do most basic feed reading operations.
 *
 * BASIC USAGE:
 * <code>
 * require_once('seymour/feed_reader.php');
 * $feedReader = new feedReader();
 * $feed = $feedReader->getFeed('http://seymour.sourceforge.net/?feed=rss2');
 * echo($feed->getTitle());
 * echo($feed->getDescription()); // etc
 * </code>
 *
 * Errors are not thrown or triggered; after a call that returned null, check
 * {@link $error}, {@link $errorCode} and {@link $errorMsg}, and
 * {@link $modified} to tell "not modified" apart from a failure.
 */
class feedReader {

	/**
	 * URL of feed to fetch
	 *
	 * @access	private
	 * @var		string
	 */
	var $url;

	/**
	 * Instance of feed object
	 *
	 * @access	private
	 * @var		object Feed
	 */
	var $feed;

	/**
	 * Raw contents of feed as plain text
	 *
	 * @access	public
	 * @var		string
	 */
	var $rawFeed;

	/**
	 * Feed has been modified since it was last accessed. (Based on the
	 * $ifModifiedSince param passed to the getFeed function).
	 *
	 * @access	public
	 * @var		bool
	 * @see		getFeed()
	 */
	var $modified;

	/**
	 * Last modified date returned by feed's webserver.
	 *
	 * @access	public
	 * @var		string
	 */
	var $lastModified;

	/**
	 * There was an error fetching or parsing the feed.
	 *
	 * @access	public
	 * @var		bool
	 */
	var $error;

	/**
	 * Error code
	 *
	 * @access	public
	 * @var		string
	 */
	var $errorCode;

	/**
	 * Error description
	 *
	 * @access	public
	 * @var		string
	 */
	var $errorMsg;

	/**#@+
	 * Instances of PEAR package objects
	 */

	/**
	 * @access	private
	 * @var		object HTTP_Request
	 */
	var $httpRequest;

	/**
	 * @access	private
	 * @var		object HTTP_Header
	 */
	var $httpHeader;

	/**
	 * @access	private
	 * @var		object XML_Unserializer
	 */
	var $parser;

	/**#@-*/

	/**
	 * Constructor
	 *
	 * Creates a feedReader with the default options: HTTP 1.1, 3 second
	 * connection timeout, 10 second read timeout and at most 3 redirects.
	 *
	 * @access	public
	 * @return	void
	 */
	function __construct() {

		// see HTTP_Request docs for more info on these settings
		$this->httpRequest = new HTTP_Request('');
		$this->httpRequest->setHttpVer(HTTP_REQUEST_HTTP_VER_1_1); // typepad for some reason 404s if you use HTTP 1.0, so don't
		$this->setConnTimeout(3);
		$this->setReadTimeout(10);
		$this->setMaxRedirects(3);

		$this->httpHeader = new HTTP_Header();

		// see XML_Unserializer docs for more info on these settings
		$this->parser = new XML_Unserializer();
		$this->parser->setOption('complexType', 'object');
		$this->parser->setOption('tagAsClass', true);
		$this->parser->setOption('parseAttributes', true);
		// these tags are always returned as arrays, even when only one is present
		$this->parser->setOption('forceEnum', array ('entry', 'item', 'link', 'author', 'managingEditor', 'webMaster', 'contributor', 'dc_creator', 'dc_contributor', 'category', 'dc_subject'));
		//		$this->parser->setOption('targetEncoding', 'UTF-8');
		$this->parser->setOption('skipElWithAttr', array ('type' => array ('xhtml', 'application/xhtml+xml'), 'mode' => array ('xml')));

		$this->clear();
	}

	/**
	 * PHP 4 style constructor.
	 *
	 * Kept so that PHP 4 (where this is the constructor) and existing code
	 * that calls feedReader() explicitly keep working. From PHP 8.0 a method
	 * named after its class is no longer used as the constructor, so the
	 * real initialisation lives in {@link __construct()}.
	 *
	 * @access	public
	 * @return	void
	 */
	function feedReader() {
		$this->__construct();
	}

	/**
	 * Clear details of any feed that has already been fetched.
	 *
	 * Resets the feed, status and error properties to null and resets the
	 * redirect counter of the HTTP request object.
	 *
	 * @access	private
	 * @return	void
	 */
	function clear() {
		$this->url = null;
		$this->feed = null;
		$this->rawFeed = null;
		$this->modified = null;
		$this->lastModified = null;
		$this->error = null;
		$this->errorCode = null;
		$this->errorMsg = null;
		$this->httpRequest->_redirects = 0;
	}

	/**
	 * Set connection timeout.
	 *
	 * How long to wait for web server to respond. Default = 3 seconds.
	 *
	 * @access	public
	 * @param	int $timeout Time out in seconds, set to null (or any value
	 * <= 0) for infinite.
	 * @return	void
	 */
	function setConnTimeout($timeout) {
		// null compares as <= 0 too, so null stays null
		if ($timeout <= 0) {
			$timeout = null;
		}
		$this->httpRequest->_timeout = $timeout;
	}

	/**
	 * Set read timeout.
	 *
	 * How long to wait for a feed to download. Default = 10 seconds.
	 *
	 * @access	public
	 * @param	int $timeout Time out in seconds, set to null (or any value
	 * <= 0) for infinite.
	 * @return	void
	 */
	function setReadTimeout($timeout) {
		// "<= 0" (rather than "< 0") so that the documented null value is
		// stored as null instead of array(null, 0), and so that this method
		// treats 0 the same way setConnTimeout() does.
		if ($timeout <= 0) {
			$timeout = null;
		} else {
			// stored as a (seconds, microseconds) pair
			$timeout = array ($timeout, 0);
		}
		$this->httpRequest->_readTimeout = $timeout;
	}

	/**
	 * Set maximum redirections.
	 *
	 * This is the number of times that we will follow redirects from the web
	 * server if a feed has been moved (HTTP 3xx codes). Default = 3.
	 *
	 * @access	public
	 * @param	int $redirects Number of redirects; 0 or less disables
	 * following redirects.
	 * @return	void
	 */
	function setMaxRedirects($redirects) {
		$this->httpRequest->_allowRedirects = ($redirects > 0);
		$this->httpRequest->_maxRedirects = $redirects;
	}

	/**
	 * Set proxy server details.
	 *
	 * @access	public
	 * @param	string	$host	Host name/IP
	 * @param	int 	$port	Port number
	 * @param	string	$user	User name
	 * @param	string	$pass	Password
	 * @return	void
	 */
	function setProxy($host, $port = 8080, $user = null, $pass = null) {
		$this->httpRequest->setProxy($host, $port, $user, $pass);
	}

	/**
	 * Get a feed and parse it.
	 *
	 * In most cases this is the only function you need.
	 *
	 * @access	public
	 * @param	string	$url				Absolute URL of feed
	 * @param	string	$ifModifiedSince	Only return feed if modified since
	 * this date. Should be in
	 * {@link http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.3.1 RFC
	 * 1123 format}, but it's best to use the exact string this web server has
	 * returned previously if that's available. Default = null (always returns
	 * feed regardless of date).
	 * @return	mixed	Feed object, or null if error or unmodified
	 *
	 * @see	Feed
	 */
	function getFeed($url, $ifModifiedSince = null) {
		if (is_null($this->fetchFeed($url, $ifModifiedSince))) {
			return null;
		}
		return $this->parseFeed();
	}

	/**
	 * Fetch a RAW feed.
	 *
	 * Use this only if you want to get the plain text contents of the feed,
	 * without parsing into the object structure.
	 *
	 * Any state left over from a previous fetch is cleared first.
	 *
	 * @access	public
	 * @param	string	$url				Absolute URL of feed
	 * @param	string	$ifModifiedSince	Only return feed if modified since
	 * this date. Should be in
	 * {@link http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.3.1 RFC
	 * 1123 format}, but it's best to use the exact string this web server has
	 * returned previously if that's available. Default = null (always returns
	 * feed regardless of date).
	 * @return	mixed	Feed as string, or null if error or unmodified
	 */
	function fetchFeed($url, $ifModifiedSince = null) {

		$this->clear();

		$this->url = trim($url);

		if (empty ($this->url)) {
			$this->setError('BLANKURL', 'No URL given.');
			return null;
		}
		elseif (!$this->isHttpUrl($this->url)) {
			$this->setError('BADURL', 'Invalid URL.');
			return null;
		}

		$this->httpRequest->setURL($this->url);

		if ($ifModifiedSince) {
			$this->httpRequest->addHeader('If-Modified-Since', $ifModifiedSince);
		} else {
			// The same request object is reused for every fetch, so a header
			// added by an earlier conditional fetch must be dropped here or
			// this unconditional fetch would still be sent as a conditional one.
			$this->httpRequest->removeHeader('If-Modified-Since');
		}

		$connResult = $this->httpRequest->sendRequest();

		if (!PEAR :: isError($connResult)) {
			$status = $this->httpRequest->getResponseCode();

			if ($this->httpHeader->isError($status)) {

				$this->setError("HTTP $status", "HTTP Error $status: ".$this->httpHeader->getStatusText($status));
				return null;

			}
			elseif ($this->isNotModified($status)) {

				// not an error: the caller can tell this case apart by $modified === false
				$this->modified = false;
				return null;

			} else {

				// we successfully fetched and this is new info, so do something with it

				$this->modified = true;
				$this->lastModified = $this->httpRequest->getResponseHeader('Last-Modified');
				$this->rawFeed = $this->httpRequest->getResponseBody();
				return $this->rawFeed;

			}

		} else {

			// we do a bit of munging to make usable error messages out of those given by HTTP_Request
			// keep an eye on these coz it might change with newer versions of the pear modules
			// also I'm not sure that these are the only cases in which these errors are thrown!
			if (($connResult->getCode() == '') && ($connResult->getMessage() == 'Malformed response.')) {
				// this actually occurs when the connection or read has timed out
				$this->setError('TIMEOUT', 'Connection/read timeout.');
			}
			elseif (($connResult->getCode() == 0) && ($connResult->getMessage() == "The operation completed successfully.\r\n")) {
				// this occurs when the domain cannot be found
				$this->setError('BADDOMAIN', 'Domain not found.');
			}
			elseif (($connResult->getCode() == 0) && ($connResult->getMessage() == 'Too many redirects')) {
				// this occurs when the redirect limit (see setMaxRedirects) has been exceeded
				$this->setError('MAXREDIRECT', 'Too many redirects.');
			} else {
				$this->setError('CONN '.$connResult->getCode(), 'Connection Error: '.$connResult->getMessage());
			}

			return null;

		}

	}

	/**
	 * Parses a plain text version of a feed.
	 *
	 * Parses into the object structure specified in feed_mapping.php.
	 *
	 * @access	public
	 * @param	string	$rawFeed Defaults to feed previously returned by
	 * fetchFeed() if that exists
	 * @return	mixed	Feed object, or null if error
	 *
	 * @see	feed_mapping.php
	 */
	function parseFeed($rawFeed = null) {

		if (is_null($rawFeed)) {
			$rawFeed = $this->rawFeed;
		}

		// empty() is also true for null, so no separate null check is needed
		if (empty ($rawFeed)) {
			$this->setError('EMPTYFEED', 'Feed was empty.');
			return null;
		}

		$result = $this->parser->unserialize($rawFeed);

		if (PEAR :: isError($result)) {
			$this->setError('PARSE '.$result->getCode(), 'Parse Error: '.$result->getMessage());
			return null;
		}

		$this->feed = $this->parser->getUnserializedData();
		return $this->feed;

	}

	/**
	 * Register an error
	 *
	 * Only records the error on this object; nothing is triggered or thrown.
	 *
	 * @access	private
	 * @param	string	$code	Error code
	 * @param	string	$msg	Error description
	 * @return	void
	 */
	function setError($code, $msg) {
		$this->error = true;
		$this->errorCode = $code;
		$this->errorMsg = $msg;
		//trigger_error($msg, E_USER_NOTICE); // this freaks out the simpletest script, any suggestions on how to stop that welcomed...
	}

	/**
	 * Determine if a response code is an HTTP not modified
	 *
	 * @access	private
	 * @param	int $code	HTTP response code
	 * @return	bool 		True if not modified
	 */
	function isNotModified($code) {
		// loose comparison on purpose: the code may arrive as int or numeric string
		return $code == 304;
	}

	/**
	 * Check for valid HTTP Url
	 *
	 * Accepts http and https URLs only.
	 *
	 * Known weaknesses:
	 * - allows invalid ips (ones with less than 4 sets of numbers, and with
	 * numbers over 255)
	 *
	 * @access	private
	 * @param	string	$url	URL to test
	 * @return	bool			True if a valid HTTP URL
	 */
	function isHttpUrl($url) {
		$url = (string) $url;
		$reURL = "/^(http|https):\/\/(\w+(:\w+)?@)?[\w\-]+(\.[\w\-]+)*(\.[a-zA-Z]{2,6})?(:\d+)?(\/([^\s\\:\*\?\"\<\>\|]*)?(\?([\w\.\+%]+=[\w\.\+%]+)(&[\w\.\+%]+=[\w\.\+%]+)*)?)?$/";
		// preg_match() returns 1, 0 or false; normalise to the documented bool
		return preg_match($reURL, $url) === 1;
	}

}
?>