diff options
author | chris <chris@jalakai.co.uk> | 2006-11-05 20:54:53 +0100 |
---|---|---|
committer | chris <chris@jalakai.co.uk> | 2006-11-05 20:54:53 +0100 |
commit | 6b06b65228c9fbd6e8e45658458b14a0e8c2cdfc (patch) | |
tree | 119ec17eb7cb4a20b0c893730780eb9522713191 /inc/indexer.php | |
parent | 67cf9a098ec0c4e56817fb5b01e77544a472b78a (diff) | |
download | rpg-6b06b65228c9fbd6e8e45658458b14a0e8c2cdfc.tar.gz rpg-6b06b65228c9fbd6e8e45658458b14a0e8c2cdfc.tar.bz2 |
backlinks fixes (bugs #795 & #937)
- add deaccented and romanised page names to index word list
- remove stop words from tokens used in backlink search
darcs-hash:20061105195453-9b6ab-6c4989eb75782af60a3de3bddbc99a83de2b4c80.gz
Diffstat (limited to 'inc/indexer.php')
-rw-r--r-- | inc/indexer.php | 15 |
1 files changed, 15 insertions, 0 deletions
diff --git a/inc/indexer.php b/inc/indexer.php index a2b7a0637..e6550c2e4 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -48,6 +48,21 @@ function idx_getPageWords($page){ $tokens = explode(' ', $body); $tokens = array_count_values($tokens); // count the frequency of each token +// ensure the deaccented or romanised page names of internal links are added to the token array +// (this is necessary for the backlink function -- there may be a better way!) + if ($conf['deaccent']) { + $links = p_get_metadata($page,'relation references'); + + $tmp = join(' ',array_keys($links)); // make a single string + $tmp = strtr($tmp, ':', ' '); // replace namespace separator with a space + $link_tokens = array_unique(explode(' ', $tmp)); // break into tokens + + foreach ($link_tokens as $link_token) { + if (isset($tokens[$link_token])) continue; + $tokens[$link_token] = 1; + } + } + $words = array(); foreach ($tokens as $word => $count) { // simple filter to restrict use of utf8_stripspecials |