<?php
/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */
/*
 * Copyright 2004-2006 Project Guarana Development Team
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 * @package ficus.ext.wikipedia
 */
/**
 * @file WikipediaArticle.php
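 * @brief Bean for a Wikipedia article's wiki markup, with rendering, link extraction and summarizing helpers.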
 * @author <a href="mailto:kent@guarana.cc">ISHITOYA Kentaro</a>
 * @version $Id: PDODatabase.php 26 2007-08-01 13:56:06Z ishitoya $
 * 
 *
 */

require_once "Text/Wiki.php";
require_once "ficus/beans/Bean.php";

/**
 * @class Ficus_WikipediaArticle
 *
 * Bean that holds the title and raw wiki markup of one article and offers
 * helpers to render the markup through Text_Wiki (XHTML / plain), to list
 * the wiki links it contains and to build shortened copies of itself.
 *
 * All multibyte string handling in this class assumes the markup is UTF-8.
 */
class Ficus_WikipediaArticle extends Ficus_Bean{
    const FORMAT_XHTML = "Xhtml";
    const FORMAT_PLAIN = "Plain";
    const PARSER = "Mediawiki";
    const JA_WIKI = "http://ja.wikipedia.org/wiki/%s";

    protected $title = null;
    protected $text = null;
    protected $baseUrl = self::JA_WIKI;
    protected $filters = array();

    /**
     * get text as Xhtml
     *
     * Renders $this->text with the shared Text_Wiki instance and then runs
     * the result through the registered filters.
     *
     * NOTE(review): 'translate' is given the string 'HTML_SPECIALCHARS',
     * not the PHP constant of the same name - confirm this is what the
     * Text_Wiki renderer expects.
     *
     * @return string xhtml text
     */
    public function asXhtml(){
        $textWiki = Text_Wiki::singleton(self::PARSER);
        $textWiki->setFormatConf(self::FORMAT_XHTML,
                                 'translate', 'HTML_SPECIALCHARS');
        $textWiki->setRenderConf(self::FORMAT_XHTML, "Wikilink",
                                 'view_url', $this->baseUrl);
        // every wiki link is rendered as an existing page (see pageExists)
        $textWiki->setRenderConf(self::FORMAT_XHTML, "Wikilink",
                                 'exists_callback',
                                 array($this, "pageExists"));
        $text = $textWiki->transform($this->text, self::FORMAT_XHTML);
        return $this->filter($text);
    }

    /**
     * get text as plain
     *
     * Renders $this->text with the shared Text_Wiki instance using the
     * plain format and then runs the result through the registered filters.
     *
     * @return string plain text
     */
    public function asPlain(){
        $textWiki = Text_Wiki::singleton(self::PARSER);
        $textWiki->setFormatConf(self::FORMAT_PLAIN,
                                 'translate', 'HTML_SPECIALCHARS');
        $textWiki->setParseConf("Wikilink",
                                array('view_url' => $this->baseUrl));
        $text = $textWiki->transform($this->text, self::FORMAT_PLAIN);
        return $this->filter($text);
    }

    /**
     * summarize
     *
     * Cuts the raw markup with mb_strcut(), so $length is a size in BYTES
     * (the cut never splits a UTF-8 character). No markup clean-up is done;
     * see smartSummarize() for that.
     *
     * @param $length integer maximum size of the summary in bytes
     * @return Ficus_WikipediaArticle summarized article
     */
    public function summarize($length = 1000){
        $text = mb_strcut($this->text, 0, $length, "UTF-8");
        $article = $this->createClone();
        return $article->setText($text);
    }

    /**
     * get links
     *
     * Parses the markup and collects one bean (title, link) per "Wikilink"
     * token. "Url" tokens are requested from the parser as well but are
     * not added to the result.
     *
     * NOTE(review): the title is taken from the token's "page" option while
     * the URL is built from its "text" option - confirm this is intended
     * for piped links.
     *
     * @return Ficus_ConcreteBeanAggregator aggregator of link beans
     */
    public function getLinks(){
        $tokens = $this->getTokens(array("Wikilink", 'Url'));
        $bean = new Ficus_ConcreteBean(array("title" => null,
                                             "link" => null));
        $aggregator = new Ficus_ConcreteBeanAggregator();
        foreach($tokens as $token){
            // $token[0] is the rule name, $token[1] the rule's options
            $tokenBean = $bean->createClone();
            if($token[0] == "Wikilink"){
                $tokenBean->setTitle($token[1]["page"]);
                $url = sprintf($this->baseUrl, urlencode($token[1]["text"]));
                $tokenBean->setlink($url);
                $aggregator->addBeans($tokenBean);
            }
        }
        return $aggregator;
    }

    /**
     * get tokens
     *
     * Parses $this->text with the shared Text_Wiki instance and returns the
     * parser tokens.
     *
     * @param $rule mixed rule name(s) handed to Text_Wiki::getTokens(),
     *              or null to fetch every token
     * @return array tokens as returned by Text_Wiki::getTokens()
     */
    public function getTokens($rule = null){
        $textWiki = Text_Wiki::singleton(self::PARSER);
        $textWiki->parse($this->text);
        if(is_null($rule)){
            return $textWiki->getTokens();
        }else{
            return $textWiki->getTokens($rule);
        }
    }

    /**
     * remove thumbnail tag e.g. "[[...|thumb|...]]"
     *
     * Every "[[ ... ]]" link whose body contains the word "thumb" is
     * removed, including links nested inside it (captions usually contain
     * further "[[...]]" links). An occurrence of "thumb" that is not inside
     * an open link, or whose link is never closed, is left untouched.
     *
     * @param $text string wiki markup
     * @return string markup without thumbnail links
     */
    private function removeThumbTag($text){
        $encoding = "UTF-8";
        $keyword = "thumb";
        $keywordLength = mb_strlen($keyword, $encoding);
        $offset = 0;
        while(true){
            $thumbPos = mb_strpos($text, $keyword, $offset, $encoding);
            if($thumbPos === false){
                return $text;
            }
            $front = mb_substr($text, 0, $thumbPos, $encoding);
            $openPos = $this->findUnclosedLinkStart($front);
            $closeEnd = false;
            if($openPos !== false){
                $closeEnd = $this->findLinkEnd($text,
                                               $thumbPos + $keywordLength);
            }
            if($closeEnd === false){
                // plain word or unterminated link: keep it, look further on
                $offset = $thumbPos + $keywordLength;
                continue;
            }
            $length = mb_strlen($text, $encoding);
            $text = mb_substr($text, 0, $openPos, $encoding)
                . mb_substr($text, $closeEnd, $length - $closeEnd, $encoding);
            // resume where the removed link started
            $offset = $openPos;
        }
    }

    /**
     * find the innermost "[[" in $front that has no matching "]]" yet
     *
     * @param $front string text preceding the position of interest
     * @return mixed character offset of that "[[", or false when every
     *               link opened in $front is also closed in it
     */
    private function findUnclosedLinkStart($front){
        $encoding = "UTF-8";
        $openStack = array();
        $cursor = 0;
        while(true){
            $open = mb_strpos($front, "[[", $cursor, $encoding);
            $close = mb_strpos($front, "]]", $cursor, $encoding);
            if($open === false && $close === false){
                break;
            }
            if($close === false || ($open !== false && $open < $close)){
                $openStack[] = $open;
                $cursor = $open + 2;
            }else{
                // a closer without opener simply pops nothing
                array_pop($openStack);
                $cursor = $close + 2;
            }
        }
        $depth = count($openStack);
        return $depth > 0 ? $openStack[$depth - 1] : false;
    }

    /**
     * find the end of the link that is open at offset $from
     *
     * @param $text string wiki markup
     * @param $from integer character offset inside an open "[[" link
     * @return mixed offset just behind the matching "]]", or false when the
     *               link is not closed
     */
    private function findLinkEnd($text, $from){
        $encoding = "UTF-8";
        $depth = 1;
        $cursor = $from;
        while($depth > 0){
            $close = mb_strpos($text, "]]", $cursor, $encoding);
            if($close === false){
                return false;
            }
            $open = mb_strpos($text, "[[", $cursor, $encoding);
            if($open !== false && $open < $close){
                $depth++;
                $cursor = $open + 2;
            }else{
                $depth--;
                $cursor = $close + 2;
            }
        }
        return $cursor;
    }

    /**
     * remove Paren Tag e.g. "{{ hoge }}"
     *
     * Removes every span enclosed by $lparen ... $rparen, honouring
     * nesting. Scanning stops at the last $rparen: whatever follows it is
     * returned unchanged, so an opener that is never closed stays in the
     * text. A closer that appears while no tag is open is kept as literal
     * text.
     *
     * @param $text string wiki markup
     * @param $lparen string opening delimiter
     * @param $rparen string closing delimiter
     * @param $count integer nesting depth $text starts in (normally 0)
     * @return string markup without the enclosed spans
     */
    private function removeParenTag($text,$lparen,$rparen,$count){
        $encoding = "UTF-8";
        $lparenLength = mb_strlen($lparen, $encoding);
        $rparenLength = mb_strlen($rparen, $encoding);
        if($lparenLength === 0 || $rparenLength === 0){
            // nothing sensible to match against
            return $text;
        }
        $depth = max(0, (int)$count);
        $kept = "";
        $cursor = 0;
        while(true){
            $endPos = mb_strpos($text, $rparen, $cursor, $encoding);
            if($endPos === false){
                break;
            }
            $beginPos = mb_strpos($text, $lparen, $cursor, $encoding);
            if($beginPos !== false && $beginPos < $endPos){
                if($depth === 0){
                    // text in front of an outermost opener is kept
                    $kept .= mb_substr($text, $cursor,
                                       $beginPos - $cursor, $encoding);
                }
                $depth++;
                $cursor = $beginPos + $lparenLength;
            }else{
                $next = $endPos + $rparenLength;
                if($depth === 0){
                    // stray closer: keep it together with the text before
                    $kept .= mb_substr($text, $cursor,
                                       $next - $cursor, $encoding);
                }else{
                    $depth--;
                }
                $cursor = $next;
            }
        }
        $length = mb_strlen($text, $encoding);
        return $kept . mb_substr($text, $cursor, $length - $cursor, $encoding);
    }

    /**
     * remove extra Paren
     *
     * When the last $lparen in $text is not followed by a $rparen (e.g. a
     * link cut in half by truncation), the text is cut right before that
     * $lparen.
     *
     * @param $text string wiki markup
     * @param $lparen string opening delimiter
     * @param $rparen string closing delimiter
     * @return string markup without a dangling trailing opener
     */
    private function removeExtraParen($text,$lparen,$rparen){
        $encoding = "UTF-8";
        $lparenPos = mb_strrpos($text, $lparen, 0, $encoding);
        if($lparenPos === false){
            return $text;
        }
        $rparenPos = mb_strrpos($text, $rparen, 0, $encoding);
        if($rparenPos === false || $lparenPos > $rparenPos){
            return mb_substr($text, 0, $lparenPos, $encoding);
        }
        return $text;
    }

    /**
     * remove last (wrong) Abstruct Tag
     *
     * Counts the heading markers ("==" or "===") in $text. An odd count
     * means the last marker lost its partner, so the text is cut one
     * character before the last "==" (never before offset 0).
     *
     * @param $text string wiki markup
     * @return string markup without a dangling heading marker
     */
    private function removeWrongAbstTag($text){
        $matches = array();
        $count = preg_match_all('/===?/', $text, $matches);
        if($count === false || $count % 2 === 0){
            return $text;
        }
        $markerPos = mb_strrpos($text, "==", 0, "UTF-8");
        if($markerPos === false){
            return $text;
        }
        return mb_substr($text, 0, max(0, $markerPos - 1), "UTF-8");
    }

    /**
     * removeWrongLastCharacter
     *
     * Strips every trailing character that cannot sensibly end a summary:
     * opening brackets and quotes (ASCII and Japanese full-width forms),
     * "|", "=", ASCII and ideographic spaces and newlines.
     *
     * @param $text string wiki markup
     * @return string markup without the unwanted trailing characters
     */
    private function removeWrongLastChar($text){
        // U+300C, U+201C, U+2018, U+3008, U+FF08, U+300A, U+3014, U+300E
        // and U+3010 are opening brackets/quotes, U+FF5C is the full-width
        // bar and U+3000 the ideographic space.
        $pattern = '/[\x{300C}\x{201C}\x{2018}(\x{3008}\x{FF08}\x{300A}'
            . '\x{3014}\x{300E}\x{3010}{\[\'"\x{FF5C}=\x{3000}| \n]+\z/u';
        $stripped = preg_replace($pattern, '', $text);
        // preg_replace() yields null on failure (e.g. invalid UTF-8)
        return $stripped === null ? $text : $stripped;
    }

    /**
     * smartSummarize
     *
     * Builds a summary of at most $length characters: thumbnail links and
     * brace-enclosed markup are removed, the summary starts at the
     * "== 概要 ==" heading or, failing that, at the first "'''" (or at the
     * beginning when neither exists), and markup left dangling by the cut
     * is trimmed from the end.
     *
     * @param $length integer maximum number of characters taken from text
     * @return Ficus_WikipediaArticle summarize article
     * @throw Ficus_IllegalArgumentException text is null
     */
    public function smartSummarize($length = 500){
        if($this->text === null){
            throw new Ficus_IllegalArgumentException("text is null");
        }
        $encoding = "UTF-8";
        $text = $this->removeThumbTag($this->text);
        $text = $this->removeParenTag($text,"{{","}}",0);
        $text = $this->removeParenTag($text,"{","}",0);
        $begin = mb_strpos($text, "== 概要 ==", 0, $encoding);
        if($begin === false){
            $begin = mb_strpos($text, "'''", 0, $encoding);
        }
        if($begin === false){
            $begin = 0;
        }
        $text = mb_substr($text, $begin, $length, $encoding);
        $text = $this->removeExtraParen($text,"[[","]]");
        $text = $this->removeWrongAbstTag($text);
        $text = $this->removeWrongLastChar($text);
        // NOTE(review): "</ br>" is not a well-formed tag; left unchanged
        // because consumers of the summary may rely on this exact suffix.
        $text .= "</ br>";
        $article = $this->createClone();
        return $article->setText($text);
    }

    /**
     * exists callback
     *
     * Registered as the Wikilink 'exists_callback' in asXhtml().
     *
     * @param $page string page name
     * @return boolean always true
     */
    public function pageExists($page){
        return true; //always page exist
    }

    /**
     * apply filter
     *
     * Passes the text through each entry of $this->filters in order,
     * calling its filter() method.
     *
     * @param $text string target text
     * @return string filtered result
     */
    protected function filter($text){
        foreach($this->filters as $filter){
            $text = $filter->filter($text);
        }
        return $text;
    }
}
?>