summaryrefslogtreecommitdiff
path: root/inc/indexer.php
blob: 3fb710338326a06b197ed25fb7954b167cfbdb6a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
<?php
/**
 * Functions for indexing the words of DokuWiki pages
 *
 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 */

  // Define DOKU_INC as the resolved parent directory of this file's directory
  // (with a trailing slash), unless it has already been defined.
  if(!defined('DOKU_INC')) define('DOKU_INC',realpath(dirname(__FILE__).'/../').'/');
  // NOTE(review): DOKU_CONF is used here but is not defined anywhere in this
  // file - presumably the including script defines it first; confirm.
  require_once(DOKU_CONF.'dokuwiki.php');
  // The remaining includes are resolved relative to DOKU_INC.
  require_once(DOKU_INC.'inc/io.php');
  require_once(DOKU_INC.'inc/utf8.php');
  require_once(DOKU_INC.'inc/parserutils.php');

/**
 * Build a word => occurrence-count list for a single wiki page
 *
 * The raw page text is fetched with rawWiki(), passed through
 * utf8_stripspecials() (replacement ' ', additional characters '._\-:'),
 * lowercased with utf8_strtolower(), trimmed and split on single spaces.
 * Empty tokens are dropped, as are non-numeric tokens shorter than 3 bytes.
 *
 * Tokens are only ever compared as strings (SORT_STRING ordering, isset()
 * lookups on the result map), so numeric-looking tokens such as "007" and
 * "7" are counted separately and never trigger undefined-index notices.
 *
 * based upon class.search_indexer_phpcms.php::index_entry
 *
 * @param  string $id  page id handed to rawWiki()
 * @return array  word => number of occurrences, inserted in byte-wise
 *                ascending word order (purely decimal words end up as
 *                integer keys through PHP's normal array key conversion)
 */
function idx_getPageWords($id){
    $body  = rawWiki($id);
    $body  = utf8_stripspecials($body,' ','._\-:');
    $body  = utf8_strtolower($body);
    $body  = trim($body);
    $words = explode(' ',$body);

    // Plain string ordering: the default sort flags compare numeric-looking
    // strings as numbers, which yields an inconsistent order when numeric
    // and non-numeric words are mixed.
    sort($words, SORT_STRING);

    $index = array(); //resulting index

    //compact wordlist FIXME check for stopwords

    foreach($words as $word){
        // consecutive spaces (or an empty page) produce empty tokens
        if($word === '') continue;

        // word was accepted before: just increase the counter
        if(isset($index[$word])){
            $index[$word]++;
            continue;
        }

        // checking minimum word-size (excepting numbers)
        // NOTE: strlen() counts bytes, not UTF-8 characters
        // FIXME add config option for minimum word size
        if(!is_numeric($word) && strlen($word) < 3) continue;

        //FIXME add stopword check

        // first occurrence: add to index
        $index[$word] = 1;
    }

    return $index;
}



//Setup VIM: ex: et ts=4 enc=utf-8 :