diff options
author | Andreas Gohr <andi@splitbrain.org> | 2005-08-07 22:33:22 +0200 |
---|---|---|
committer | Andreas Gohr <andi@splitbrain.org> | 2005-08-07 22:33:22 +0200 |
commit | b4ce25e9a449e7a6a78476bf94bca31cbc4259ce (patch) | |
tree | 4c76c49de6039340312c7e7e87ac2b1e8e256726 /inc | |
parent | 1caeb00a5a0b9894a582514ef385b71cab195092 (diff) | |
download | rpg-b4ce25e9a449e7a6a78476bf94bca31cbc4259ce.tar.gz rpg-b4ce25e9a449e7a6a78476bf94bca31cbc4259ce.tar.bz2 |
a first step for search indexing - nothing to see yet
darcs-hash:20050807203322-7ad00-6db6733f8fcd861366856635ba3d205fd3bb54da.gz
Diffstat (limited to 'inc')
-rw-r--r-- | inc/indexer.php | 70 | ||||
-rw-r--r-- | inc/utf8.php | 9 |
2 files changed, 75 insertions, 4 deletions
```diff
diff --git a/inc/indexer.php b/inc/indexer.php
new file mode 100644
index 000000000..3fb710338
--- /dev/null
+++ b/inc/indexer.php
@@ -0,0 +1,70 @@
+<?php
+/**
+ * Common DokuWiki functions
+ *
+ * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
+ * @author Andreas Gohr <andi@splitbrain.org>
+ */
+
+  if(!defined('DOKU_INC')) define('DOKU_INC',realpath(dirname(__FILE__).'/../').'/');
+  require_once(DOKU_CONF.'dokuwiki.php');
+  require_once(DOKU_INC.'inc/io.php');
+  require_once(DOKU_INC.'inc/utf8.php');
+  require_once(DOKU_INC.'inc/parserutils.php');
+
+/**
+ * based upon class.search_indexer_phpcms.php::index_entry
+ */
+function idx_getPageWords($id){
+    $body = rawWiki($id);
+    $body = utf8_stripspecials($body,' ','._\-:');
+    $body = utf8_strtolower($body);
+    $body = trim($body);
+    $words = explode(' ',$body);
+    sort($words);
+
+    $index = array(); //resulting index
+    $old = '';
+    $doit = true;
+    $pos = 0;
+
+    //compact wordlist FIXME check for stopwords
+
+    foreach($words as $word){
+        if(strlen($word) == 0) continue;
+
+        // it's the same word
+        if($word == $old){
+            if($doit == false) {
+                // we didn't wanted it last time
+                continue;
+            }
+            // just increase the counter
+            $index[$word]++;
+            continue;
+        }
+
+        // rememember old word
+        $old = $word;
+        $doit = true;
+
+        // checking minimum word-size (excepting numbers)
+        if(!is_numeric($word)) {
+            if(strlen($word) < 3) { #FIXME add config option for max wordsize
+                $doit = false;
+                continue;
+            }
+        }
+
+        //FIXME add stopword check
+
+        // add to index
+        $index[$word] = 1;
+    }
+
+    return $index;
+}
+
+
+
+//Setup VIM: ex: et ts=4 enc=utf-8 :
diff --git a/inc/utf8.php b/inc/utf8.php
index 3eb06865f..176b9f813 100644
--- a/inc/utf8.php
+++ b/inc/utf8.php
@@ -294,10 +294,11 @@ function utf8_deaccent($string,$case=0){
  * stripped chars (they are not included in $UTF8_SPECIAL_CHARS)
  *
  * @author Andreas Gohr <andi@splitbrain.org>
- * @param string $string The UTF8 string to strip of special chars
- * @param string $repl Replace special with this string
+ * @param string $string The UTF8 string to strip of special chars
+ * @param string $repl Replace special with this string
+ * @param string $additional Additional chars to strip (used in regexp char class)
  */
-function utf8_stripspecials($string,$repl=''){
+function utf8_stripspecials($string,$repl='',$additional=''){
   global $UTF8_SPECIAL_CHARS;
 
   static $specials = null;
@@ -305,7 +306,7 @@ function utf8_stripspecials($string,$repl=''){
     $specials = preg_quote(unicode_to_utf8($UTF8_SPECIAL_CHARS), '/');
   }
 
-  return preg_replace('/[\x00-\x19'.$specials.']/u',$repl,$string);
+  return preg_replace('/['.$additional.'\x00-\x19'.$specials.']/u',$repl,$string);
 }
 
 /**
```