<?php
/**
 * Search indexing helpers - a first step for search indexing, nothing to
 * see yet.
 *
 * Provenance: inc/indexer.php as created by commit
 * b4ce25e9a449e7a6a78476bf94bca31cbc4259ce
 * (darcs-hash:20050807203322-7ad00-6db6733f8fcd861366856635ba3d205fd3bb54da.gz),
 * Sun, 7 Aug 2005 22:33:22 +0200.
 *
 * @author Andreas Gohr
 */

  if(!defined('DOKU_INC')) define('DOKU_INC',realpath(dirname(__FILE__).'/../').'/');
  // NOTE(review): DOKU_CONF is not defined in this file - presumably set up by
  // the including script before this file is loaded; confirm.
  require_once(DOKU_CONF.'dokuwiki.php');
  require_once(DOKU_INC.'inc/io.php');
  require_once(DOKU_INC.'inc/utf8.php');
  require_once(DOKU_INC.'inc/parserutils.php');

/**
 * Build a word => occurrence count map for a page
 *
 * The text returned by rawWiki($id) is normalised with utf8_stripspecials()
 * and utf8_strtolower(), split on single spaces and tallied. Empty tokens
 * are dropped, as are non-numeric tokens shorter than 3 bytes. Numeric
 * tokens are kept regardless of their length.
 *
 * Keys of the result appear in the order produced by sort() on the word
 * list. PHP stores integer-like keys such as "42" as integers, so callers
 * iterating the result may see int keys for such words.
 *
 * based upon class.search_indexer_phpcms.php::index_entry
 *
 * @param  string $id  page id, handed to rawWiki() unchanged
 * @return array       map of word => number of occurrences
 */
function idx_getPageWords($id){
    $body  = rawWiki($id);
    // presumably replaces special characters by a space; the third argument
    // is passed through verbatim - confirm its meaning against inc/utf8.php
    $body  = utf8_stripspecials($body,' ','._\-:');
    $body  = utf8_strtolower($body);
    $body  = trim($body);
    $words = explode(' ',$body);

    // sorting only determines the key order of the result; the tally below
    // does not rely on equal words being adjacent
    sort($words);

    $index = array(); //resulting index

    //compact wordlist FIXME check for stopwords

    foreach($words as $word){
        // consecutive spaces yield empty tokens
        if(strlen($word) == 0) continue;

        // checking minimum word-size (excepting numbers)
        // NOTE(review): strlen() counts bytes, so a single multibyte
        // character may already satisfy the limit - confirm this is intended
        if(!is_numeric($word) && strlen($word) < 3) { #FIXME add config option for min wordsize
            continue;
        }

        //FIXME add stopword check

        // Tally by exact key. The previous loose comparison against the
        // preceding word treated numeric lookalikes ("7" vs "007") as the
        // same word and incremented a counter that had never been set.
        if(isset($index[$word])){
            $index[$word]++;
        }else{
            $index[$word] = 1;
        }
    }

    return $index;
}



//Setup VIM: ex: et ts=4 enc=utf-8 :