diff options
author | Andreas Gohr <andi@splitbrain.org> | 2005-08-14 20:10:35 +0200 |
---|---|---|
committer | Andreas Gohr <andi@splitbrain.org> | 2005-08-14 20:10:35 +0200 |
commit | 7367b36877bca568d785e01be802652b6a719884 (patch) | |
tree | a1e27b5806ba835d43f69373cf83247953cd1ee1 | |
parent | 48665d389b9bb386283c08172b24f3af26628bce (diff) | |
download | rpg-7367b36877bca568d785e01be802652b6a719884.tar.gz rpg-7367b36877bca568d785e01be802652b6a719884.tar.bz2 |
added stopword support to the indexer, added indexer webbug
darcs-hash:20050814181035-7ad00-ed5d879d29fcee7f925f806456675605b058966a.gz
-rw-r--r-- | inc/indexer.php | 14 | ||||
-rw-r--r-- | inc/init.php | 19 | ||||
-rw-r--r-- | inc/lang/cs/stopwords.txt | 115 | ||||
-rw-r--r-- | inc/lang/da/stopwords.txt | 88 | ||||
-rw-r--r-- | inc/lang/de/stopwords.txt | 122 | ||||
-rw-r--r-- | inc/lang/en/stopwords.txt | 28 | ||||
-rw-r--r-- | inc/lang/es/stopwords.txt | 171 | ||||
-rw-r--r-- | inc/lang/fr/stopwords.txt | 111 | ||||
-rw-r--r-- | inc/lang/hu/stopwords.txt | 28 | ||||
-rw-r--r-- | inc/lang/it/stopwords.txt | 119 | ||||
-rw-r--r-- | inc/lang/nl/stopwords.txt | 37 | ||||
-rw-r--r-- | inc/lang/no/stopwords.txt | 108 | ||||
-rw-r--r-- | inc/lang/pl/stopwords.txt | 75 | ||||
-rw-r--r-- | inc/lang/pt-br/stopwords.txt | 141 | ||||
-rw-r--r-- | inc/lang/pt/stopwords.txt | 141 | ||||
-rw-r--r-- | inc/template.php | 18 | ||||
-rw-r--r-- | lib/tpl/default/main.php | 2 |
17 files changed, 1334 insertions, 3 deletions
diff --git a/inc/indexer.php b/inc/indexer.php index 173b7aa3c..45eca2d8b 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -24,6 +24,12 @@ function idx_getPageWords($page){ global $conf; $word_idx = file($conf['cachedir'].'/word.idx'); + $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt'; + if(@file_exists($swfile)){ + $stopwords = file($swfile); + }else{ + $stopwords = array(); + } // split page into words $body = rawWiki($page); @@ -60,13 +66,17 @@ function idx_getPageWords($page){ // checking minimum word-size (excepting numbers) if(!is_numeric($word)) { - if(strlen($word) < 3) { #FIXME add config option for max wordsize + if(strlen($word) < 3) { $doit = false; continue; } } - //FIXME add stopword check + // stopword check + if(is_int(array_search("$word\n",$stopwords))){ + $doit = false; + continue; + } // get word ID $wid = array_search("$word\n",$word_idx); diff --git a/inc/init.php b/inc/init.php index 4e59cdbe1..d3afe1174 100644 --- a/inc/init.php +++ b/inc/init.php @@ -71,6 +71,7 @@ // make real paths and check them init_paths(); + init_files(); // automatic upgrade to script versions of certain files scriptify(DOKU_CONF.'users.auth'); @@ -92,7 +93,6 @@ function init_paths(){ 'changelog' => 'changes.log'); foreach($paths as $c => $p){ - if(!$conf[$c]) $conf[$c] = $conf['savedir'].'/'.$p; $conf[$c] = init_path($conf[$c]); if(!$conf[$c]) die("$c does not exist or isn't writable. Check config!"); @@ -100,6 +100,23 @@ function init_paths(){ } /** + * Checks the existance of certain files and creates them if missing + */ +function init_files(){ + global $conf; + $files = array( $conf['cachedir'].'/word.idx', + $conf['cachedir'].'/page.idx', + $conf['cachedir'].'/index.idx', ); + + foreach($files as $file){ + if(!@file_exists($file)){ + $fh = fopen($file,'a'); + fclose($fh); + } + } +} + +/** * returns absolute path * * This tries the given path first, then checks in DOKU_INC diff --git a/inc/lang/cs/stopwords.txt b/inc/lang/cs/stopwords.txt new file mode 100644 index 000000000..08aee2a4c --- /dev/null +++ b/inc/lang/cs/stopwords.txt @@ -0,0 +1,115 @@ +# This is a list of words the indexer ignores, one word per line +# When you edit this file be sure to use UNIX line endings (single newline) +# No need to include words shorter than 3 chars - these are ignored anyway +# This list is based upon the ones found at http://www.ranks.nl/stopwords/ +dnes +timto +budes +budem +byli +jses +muj +svym +tomto +tohle +tuto +tyto +jej +zda +proc +mate +tato +kam +tohoto +kdo +kteri +nam +tom +tomuto +mit +nic +proto +kterou +byla +toho +protoze +asi +nasi +napiste +coz +tim +takze +svych +jeji +svymi +jste +tedy +teto +bylo +kde +prave +nad +nejsou +pod +tema +mezi +pres +pak +vam +ani +kdyz +vsak +jsem +tento +clanku +clanky +aby +jsme +pred +pta +jejich +byl +jeste +bez +take +pouze +prvni +vase +ktera +nas +novy +tipy +pokud +muze +design +strana +jeho +sve +jine +zpravy +nove +neni +vas +jen +podle +zde +clanek +email +byt +vice +bude +jiz +nez +ktery +ktere +nebo +ten +tak +pri +jsou +jak +dalsi +ale +jako +zpet +pro diff --git a/inc/lang/da/stopwords.txt b/inc/lang/da/stopwords.txt new file mode 100644 index 000000000..f4b58b6d1 --- /dev/null +++ b/inc/lang/da/stopwords.txt @@ -0,0 +1,88 @@ +# This is a list of words the indexer ignores, one word per line +# When you edit this file be sure to use UNIX line endings (single newline) +# No need to include words shorter than 3 chars - these are ignored anyway +# This list is based upon the ones found at http://www.ranks.nl/stopwords/ +alle +andet +andre +begge +den +denne +der +deres +det +dette +dig +din +dog +eller +end +ene +eneste +enhver +fem +fire +flere +fleste +for +fordi +forrige +fra +få +før +god +han +hans +har +hendes +her +hun +hvad +hvem +hver +hvilken +hvis +hvor +hvordan +hvorfor +hvornår +ikke +ind +ingen +intet +jeg +jeres +kan +kom +kommer +lav +lidt +lille +man +mand +mange +med +meget +men +mens +mere +mig +ned +nogen +noget +nyt +nær +næste +næsten +otte +over +på +seks +ses +som +stor +store +syv +til +tre +var diff --git a/inc/lang/de/stopwords.txt b/inc/lang/de/stopwords.txt new file mode 100644 index 000000000..78261b7c9 --- /dev/null +++ b/inc/lang/de/stopwords.txt @@ -0,0 +1,122 @@ +# This is a list of words the indexer ignores, one word per line +# When you edit this file be sure to use UNIX line endings (single newline) +# No need to include words shorter than 3 chars - these are ignored anyway +# This list is based upon the ones found at http://www.ranks.nl/stopwords/ +aber +als +auch +auf +aus +bei +bin +bis +bist +dadurch +daher +darum +das +daß +dass +dein +deine +dem +den +der +des +dessen +deshalb +die +dies +dieser +dieses +doch +dort +durch +ein +eine +einem +einen +einer +eines +euer +eure +für +hatte +hatten +hattest +hattet +hier +hinter +ich +ihr +ihre +ist +jede +jedem +jeden +jeder +jedes +jener +jenes +jetzt +kann +kannst +können +könnt +machen +mein +meine +mit +muß +mußt +musst +müssen +müßt +nach +nachdem +nein +nicht +nun +oder +seid +sein +seine +sich +sie +sind +soll +sollen +sollst +sollt +sonst +soweit +sowie +und +unser +unsere +unter +vom +von +vor +wann +warum +was +weiter +weitere +wenn +wer +werde +werden +werdet +weshalb +wie +wieder +wieso +wir +wird +wirst +woher +wohin +zum +zur +über diff --git a/inc/lang/en/stopwords.txt b/inc/lang/en/stopwords.txt new file mode 100644 index 000000000..478fb33ef --- /dev/null +++ b/inc/lang/en/stopwords.txt @@ -0,0 +1,28 @@ +# This is a list of words the indexer ignores, one word per line +# When you edit this file be sure to use UNIX line endings (single newline) +# No need to include words shorter than 3 chars - these are ignored anyway +# This list is based upon the ones found at http://www.ranks.nl/stopwords/ +about +are +and +you +your +them +their +com +for +from +how +that +the +this +was +what +when +where +who +will +with +und +the +www diff --git a/inc/lang/es/stopwords.txt b/inc/lang/es/stopwords.txt new file mode 100644 index 000000000..1e7e2881d --- /dev/null +++ b/inc/lang/es/stopwords.txt @@ -0,0 +1,171 @@ +# This is a list of words the indexer ignores, one word per line +# When you edit this file be sure to use UNIX line endings (single newline) +# No need to include words shorter than 3 chars - these are ignored anyway +# This list is based upon the ones found at http://www.ranks.nl/stopwords/ +una +unas +unos +uno +sobre +todo +también +tras +otro +algún +alguno +alguna +algunos +algunas +ser +soy +eres +somos +sois +estoy +esta +estamos +estais +estan +como +para +atras +porque +por +qué +estado +estaba +ante +antes +siendo +ambos +pero +poder +puede +puedo +podemos +podeis +pueden +fui +fue +fuimos +fueron +hacer +hago +hace +hacemos +haceis +hacen +cada +fin +incluso +primero +desde +conseguir +consigo +consigue +consigues +conseguimos +consiguen +voy +va +vamos +vais +van +vaya +gueno +tener +tengo +tiene +tenemos +teneis +tienen +las +los +aqui +mio +tuyo +ellos +ellas +nos +nosotros +vosotros +vosotras +dentro +solo +solamente +saber +sabes +sabe +sabemos +sabeis +saben +ultimo +largo +bastante +haces +muchos +aquellos +aquellas +sus +entonces +tiempo +verdad +verdadero +verdadera +cierto +ciertos +cierta +ciertas +intentar +intento +intenta +intentas +intentamos +intentais +intentan +dos +bajo +arriba +encima +usar +uso +usas +usa +usamos +usais +usan +emplear +empleo +empleas +emplean +ampleamos +empleais +valor +muy +era +eras +eramos +eran +modo +bien +cual +cuando +donde +mientras +quien +con +entre +sin +trabajo +trabajar +trabajas +trabaja +trabajamos +trabajais +trabajan +podria +podrias +podriamos +podrian +podriais +aquel diff --git a/inc/lang/fr/stopwords.txt b/inc/lang/fr/stopwords.txt new file mode 100644 index 000000000..7d673aa42 --- /dev/null +++ b/inc/lang/fr/stopwords.txt @@ -0,0 +1,111 @@ +# This is a list of words the indexer ignores, one word per line +# When you edit this file be sure to use UNIX line endings (single newline) +# No need to include words shorter than 3 chars - these are ignored anyway +# This list is based upon the ones found at http://www.ranks.nl/stopwords/ +alors +aucuns +aussi +autre +avant +avec +avoir +bon +car +cela +ces +ceux +chaque +comme +comment +dans +des +dedans +dehors +depuis +deux +devrait +doit +donc +dos +droite +début +elle +elles +encore +essai +est +fait +faites +fois +font +force +haut +hors +ici +ils +juste +les +leur +là +maintenant +mais +mes +mine +moins +mon +mot +même +nommés +notre +nous +nouveaux +où +par +parce +parole +pas +personnes +peut +peu +pièce +plupart +pour +pourquoi +quand +que +quel +quelle +quelles +quels +qui +sans +ses +seulement +sien +son +sont +sous +soyez +sujet +sur +tandis +tellement +tels +tes +ton +tous +tout +trop +très +valeur +voie +voient +vont +votre +vous +ça +étaient +état +étions +été +être diff --git a/inc/lang/hu/stopwords.txt b/inc/lang/hu/stopwords.txt new file mode 100644 index 000000000..80282410f --- /dev/null +++ b/inc/lang/hu/stopwords.txt @@ -0,0 +1,28 @@ +# This is a list of words the indexer ignores, one word per line +# When you edit this file be sure to use UNIX line endings (single newline) +# No need to include words shorter than 3 chars - these are ignored anyway +# This list is based upon the ones found at http://www.ranks.nl/stopwords/ +egy +fel +meg +át +rá +ide +oda +szét +össze +vissza +hát +és +vagy +hogy +van +lesz +volt +csak +nem +igen +mint +én +õk +ön diff --git a/inc/lang/it/stopwords.txt b/inc/lang/it/stopwords.txt new file mode 100644 index 000000000..a6aa1cfc6 --- /dev/null +++ b/inc/lang/it/stopwords.txt @@ -0,0 +1,119 @@ +# This is a list of words the indexer ignores, one word per line +# When you edit this file be sure to use UNIX line endings (single newline) +# No need to include words shorter than 3 chars - these are ignored anyway +# This list is based upon the ones found at http://www.ranks.nl/stopwords/ +adesso +alla +allo +allora +altre +altri +altro +anche +ancora +avere +aveva +avevano +ben +buono +che +chi +cinque +comprare +con +consecutivi +consecutivo +cosa +cui +del +della +dello +dentro +deve +devo +doppio +due +ecco +fare +fine +fino +fra +gente +giu +hai +hanno +indietro +invece +lavoro +lei +loro +lui +lungo +meglio +molta +molti +molto +nei +nella +noi +nome +nostro +nove +nuovi +nuovo +oltre +ora +otto +peggio +pero +persone +piu +poco +primo +promesso +qua +quarto +quasi +quattro +quello +questo +qui +quindi +quinto +rispetto +sara +secondo +sei +sembra +sembrava +senza +sette +sia +siamo +siete +solo +sono +sopra +soprattutto +sotto +stati +stato +stesso +su +subito +sul +sulla +tanto +tempo +terzo +tra +tre +triplo +ultimo +una +uno +va +vai +voi +volte +vostro diff --git a/inc/lang/nl/stopwords.txt b/inc/lang/nl/stopwords.txt new file mode 100644 index 000000000..3056c4a70 --- /dev/null +++ b/inc/lang/nl/stopwords.txt @@ -0,0 +1,37 @@ +# This is a list of words the indexer ignores, one word per line +# When you edit this file be sure to use UNIX line endings (single newline) +# No need to include words shorter than 3 chars - these are ignored anyway +# This list is based upon the ones found at http://www.ranks.nl/stopwords/ +aan +als +bij +dan +dat +die +dit +een +had +heb +hem +het +hij +hoe +hun +kan +men +met +mij +nog +ons +ook +tot +uit +van +was +wat +wel +wij +zal +zei +zij +zou diff --git a/inc/lang/no/stopwords.txt b/inc/lang/no/stopwords.txt new file mode 100644 index 000000000..c7e672115 --- /dev/null +++ b/inc/lang/no/stopwords.txt @@ -0,0 +1,108 @@ +# This is a list of words the indexer ignores, one word per line +# When you edit this file be sure to use UNIX line endings (single newline) +# No need to include words shorter than 3 chars - these are ignored anyway +# This list is based upon the ones found at http://www.ranks.nl/stopwords/ +alle +andre +arbeid +begge +bort +bra +bruke +denne +der +deres +det +din +disse +eller +ene +eneste +enhver +enn +folk +for +fordi +forsÛke +fra +fÅ +fÛr +fÛrst +gjorde +gjÛre +god +gÅ +hadde +han +hans +hennes +her +hva +hvem +hver +hvilken +hvis +hvor +hvordan +hvorfor +ikke +inn +innen +kan +kunne +lage +lang +lik +like +makt +mange +med +meg +meget +men +mens +mer +mest +min +mye +mÅ +mÅte +navn +nei +nÅ +nÅr +ogsÅ +opp +oss +over +part +punkt +pÅ +rett +riktig +samme +sant +siden +sist +skulle +slik +slutt +som +start +stille +sÅ +tid +til +tilbake +tilstand +under +uten +var +ved +verdi +vil +ville +vite +vÅr +vÖre +vÖrt diff --git a/inc/lang/pl/stopwords.txt b/inc/lang/pl/stopwords.txt new file mode 100644 index 000000000..8089e22c5 --- /dev/null +++ b/inc/lang/pl/stopwords.txt @@ -0,0 +1,75 @@ +# This is a list of words the indexer ignores, one word per line +# When you edit this file be sure to use UNIX line endings (single newline) +# No need to include words shorter than 3 chars - these are ignored anyway +# This list is based upon the ones found at http://www.ranks.nl/stopwords/ +aby +ale +bardziej +bardzo +bez +bowiem +bêdzie +czy +czyli +dla +dlatego +gdy +gdzie +ich +innych +jak +jako +jednak +jego +jej +jest +jeszcze +kiedy +kilka +która +które +którego +której +który +których +którym +którzy +lub +miêdzy +mnie +nad +nam +nas +naszego +naszych +nawet +nich +nie +nim +oraz +pod +poza +przed +przede +przez +przy +siê +sobie +swoje +tak +takie +tam +te +tego +tej +ten +tych +tylko +tym +wiele +wielu +wiêc +wszystkich +wszystkim +wszystko +zawsze diff --git a/inc/lang/pt-br/stopwords.txt b/inc/lang/pt-br/stopwords.txt new file mode 100644 index 000000000..6abeff502 --- /dev/null +++ b/inc/lang/pt-br/stopwords.txt @@ -0,0 +1,141 @@ +# This is a list of words the indexer ignores, one word per line +# When you edit this file be sure to use UNIX line endings (single newline) +# No need to include words shorter than 3 chars - these are ignored anyway +# This list is based upon the ones found at http://www.ranks.nl/stopwords/ +último +acerca +agora +algmas +alguns +ali +ambos +antes +apontar +aquela +aquelas +aquele +aqueles +aqui +atrás +bem +bom +cada +caminho +cima +com +como +comprido +conhecido +corrente +das +debaixo +dentro +desde +desligado +deve +devem +deverá +direita +diz +dizer +dois +dos +ela +ele +eles +enquanto +então +está +estão +estado +estar +estará +este +estes +esteve +estive +estivemos +estiveram +fará +faz +fazer +fazia +fez +fim +foi +fora +horas +iniciar +inicio +irá +ista +iste +isto +ligado +maioria +maiorias +mais +mas +mesmo +meu +muito +muitos +nós +não +nome +nosso +novo +onde +outro +para +parte +pegar +pelo +pessoas +pode +poderá +podia +por +porque +povo +promeiro +quê +qual +qualquer +quando +quem +quieto +são +saber +sem +ser +seu +somente +têm +tal +também +tem +tempo +tenho +tentar +tentaram +tente +tentei +teu +teve +tipo +tive +todos +trabalhar +trabalho +uma +umas +uns +usa +usar +valor +veja +ver +verdade +verdadeiro +você diff --git a/inc/lang/pt/stopwords.txt b/inc/lang/pt/stopwords.txt new file mode 100644 index 000000000..6abeff502 --- /dev/null +++ b/inc/lang/pt/stopwords.txt @@ -0,0 +1,141 @@ +# This is a list of words the indexer ignores, one word per line +# When you edit this file be sure to use UNIX line endings (single newline) +# No need to include words shorter than 3 chars - these are ignored anyway +# This list is based upon the ones found at http://www.ranks.nl/stopwords/ +último +acerca +agora +algmas +alguns +ali +ambos +antes +apontar +aquela +aquelas +aquele +aqueles +aqui +atrás +bem +bom +cada +caminho +cima +com +como +comprido +conhecido +corrente +das +debaixo +dentro +desde +desligado +deve +devem +deverá +direita +diz +dizer +dois +dos +ela +ele +eles +enquanto +então +está +estão +estado +estar +estará +este +estes +esteve +estive +estivemos +estiveram +fará +faz +fazer +fazia +fez +fim +foi +fora +horas +iniciar +inicio +irá +ista +iste +isto +ligado +maioria +maiorias +mais +mas +mesmo +meu +muito +muitos +nós +não +nome +nosso +novo +onde +outro +para +parte +pegar +pelo +pessoas +pode +poderá +podia +por +porque +povo +promeiro +quê +qual +qualquer +quando +quem +quieto +são +saber +sem +ser +seu +somente +têm +tal +também +tem +tempo +tenho +tentar +tentaram +tente +tentei +teu +teve +tipo +tive +todos +trabalhar +trabalho +uma +umas +uns +usa +usar +valor +veja +ver +verdade +verdadeiro +você diff --git a/inc/template.php b/inc/template.php index 3937d99d2..a17d16f56 100644 --- a/inc/template.php +++ b/inc/template.php @@ -867,4 +867,22 @@ function tpl_img($maxwidth=900,$maxheight=700){ print '</a>'; } +/** + * This function inserts a 1x1 pixel gif which in reality + * is the inexer function. + * + * Should be called somewhere at the very end of the main.php + * template + */ +function tpl_indexerWebBug(){ + global $ID; + $p = array(); + $p['src'] = DOKU_BASE.'lib/exe/indexer.php?id='.urlencode($ID); + $p['width'] = 1; + $p['height'] = 1; + $p['alt'] = ''; + $att = buildAttributes($p); + print "<img $att />"; +} + //Setup VIM: ex: et ts=2 enc=utf-8 : diff --git a/lib/tpl/default/main.php b/lib/tpl/default/main.php index 3a3a3b5aa..a2ec79a95 100644 --- a/lib/tpl/default/main.php +++ b/lib/tpl/default/main.php @@ -126,5 +126,7 @@ </div> <?php /*old includehook*/ @include(dirname(__FILE__).'/footer.html')?> + +<?php tpl_indexerWebBug()?> </body> </html> |