summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndreas Gohr <andi@splitbrain.org>2005-09-18 14:10:08 +0200
committerAndreas Gohr <andi@splitbrain.org>2005-09-18 14:10:08 +0200
commitd437bcc4a25f62fad65e625f4bc13cab8873f994 (patch)
tree8036cb2794b499b87ff9d3a193148e181e9a8914
parentb7af031cf80338a83187f1fb75cfacf0bb24c103 (diff)
downloadrpg-d437bcc4a25f62fad65e625f4bc13cab8873f994.tar.gz
rpg-d437bcc4a25f62fad65e625f4bc13cab8873f994.tar.bz2
more efficient changelog reading for recent changes
getRecents now reads the changelog backwards in 4KB chunks instead of loading the whole file into an array and rsorting it. This should be more memory efficient (and probably faster) for large change logs. Note: the format of the array returned by getRecents changed slightly; plugins relying on it need to be adjusted. Sorry. darcs-hash:20050918121008-7ad00-1fdba47d29b0c038c6e4e4edc1d4c93e5ba769e9.gz
-rw-r--r--feed.php31
-rw-r--r--inc/common.php125
-rw-r--r--inc/html.php20
-rw-r--r--inc/indexer.php4
4 files changed, 120 insertions, 60 deletions
diff --git a/feed.php b/feed.php
index 9454a9b95..c4c386261 100644
--- a/feed.php
+++ b/feed.php
@@ -98,19 +98,19 @@ function rssRecentChanges(&$rss,$num,$ltype,$ns){
//this can take some time if a lot of recaching has to be done
@set_time_limit(90); // set max execution time
- foreach(array_keys($recents) as $id){
+ foreach($recents as $recent){
$item = new FeedItem();
- $item->title = $id;
- $xhtml = p_wiki_xhtml($id,'',false);
+ $item->title = $recent['id'];
+ $xhtml = p_wiki_xhtml($recent['id'],'',false);
if($conf['useheading']) {
$matches = array();
if(preg_match('|<h([1-9])>(.*?)</h\1>|', $xhtml, $matches))
$item->title = trim($matches[2]);
}
- if(!empty($recents[$id]['sum'])){
- $item->title .= ' - '.strip_tags($recents[$id]['sum']);
+ if(!empty($recent['sum'])){
+ $item->title .= ' - '.strip_tags($recent['sum']);
}
$desc = cleanDesc($xhtml);
@@ -120,27 +120,26 @@ function rssRecentChanges(&$rss,$num,$ltype,$ns){
switch ($ltype){
case 'page':
- $item->link = wl($id,'rev='.$recents[$id]['date'],true);
+ $item->link = wl($recent['id'],'rev='.$recent['date'],true);
break;
case 'rev':
- $item->link = wl($id,'do=revisions&amp;rev='.$recents[$id]['date'],true);
+ $item->link = wl($recent['id'],'do=revisions&amp;rev='.$recent['date'],true);
break;
case 'current':
- $item->link = wl($id, '', true);
+ $item->link = wl($recent['id'], '', true);
break;
case 'diff':
default:
- $item->link = wl($id,'do=diff&amp;'.$recents[$id]['date'],true);
+ $item->link = wl($recent['id'],'do=diff&amp;'.$recent['date'],true);
}
$item->description = $desc;
- $item->date = date('r',$recents[$id]['date']);
- if(strpos($id,':')!==false){
- $item->category = substr($id,0,strrpos($id,':'));
- }
+ $item->date = date('r',$recent['date']);
+ $cat = getNS($recent['id']);
+ if($cat) $item->category = $cat;
$user = null;
- $user = @$recents[$id]['user']; // the @ spares time repeating lookup
+ $user = @$recent['user']; // the @ spares time repeating lookup
$item->author = '';
if($user){
@@ -148,12 +147,12 @@ function rssRecentChanges(&$rss,$num,$ltype,$ns){
$item->author = $userInfo['name'];
if($guardmail) {
//cannot obfuscate because some RSS readers may check validity
- $item->authorEmail = $user.'@'.$recents[$id]['ip'];
+ $item->authorEmail = $user.'@'.$recent['ip'];
}else{
$item->authorEmail = $userInfo['mail'];
}
}else{
- $item->authorEmail = 'anonymous@'.$recents[$id]['ip'];
+ $item->authorEmail = 'anonymous@'.$recent['ip'];
}
$rss->addItem($item);
}
diff --git a/inc/common.php b/inc/common.php
index eb4b560a6..4585634fc 100644
--- a/inc/common.php
+++ b/inc/common.php
@@ -595,17 +595,70 @@ function addLogEntry($date,$id,$summary=""){
}
/**
+ * Internal function used by getRecents
+ *
+ * don't call directly
+ *
+ * @see getRecents()
+ * @author Andreas Gohr <andi@splitbrain.org>
+ */
+function _handleRecent($line,$incdel,$ns,$subNS){
+ static $seen = array(); //page ids already encountered; static, so the list is kept between calls
+ if(empty($line)) return false; //skip empty lines
+
+ // split the tab separated line into date, ip, page id, user and summary
+ list($dt,$ip,$id,$usr,$sum) = explode("\t",$line);
+
+ // skip ids that were already seen in an earlier line
+ if($seen[$id]) return false;
+
+ // remember the id now, before the filters below, so later lines for it are skipped too
+ $seen[$id] = 1;
+
+ // filter namespace: when $ns is given the id has to start with $ns followed by a colon
+ if (($ns) && (strpos($id,$ns.':') !== 0)) return false;
+
+ // exclude subnamespaces: unless $subNS is set, getNS($id) has to equal $ns
+ if ((!$subNS) && (getNS($id) != $ns)) return false;
+
+ // check existence of the page file; missing pages are only reported when $incdel is set
+ if(!@file_exists(wikiFN($id))){
+ if(!$incdel){
+ return false;
+ }else{
+ $recent = array();
+ $recent['del'] = true; // page file is missing
+ }
+ }else{
+ $recent = array();
+ $recent['del'] = false; // page file exists
+ }
+
+ $recent['id'] = $id;
+ $recent['date'] = $dt;
+ $recent['ip'] = $ip;
+ $recent['user'] = $usr;
+ $recent['sum'] = $sum;
+
+ return $recent; // array with the keys del, id, date, ip, user and sum
+}
+
+/**
* returns an array of recently changed files using the
* changelog
- * first : first entry in array returned
- * num : return 'num' entries
+ *
+ * @param int $first number of first entry returned (for paginating)
+ * @param int $num return $num entries
+ * @param bool $incdel include deleted pages?
+ * @param string $ns restrict to given namespace
+ * @param bool $subNS include subnamespaces
*
* @author Andreas Gohr <andi@splitbrain.org>
*/
function getRecents($first,$num,$incdel=false,$ns='',$subNS=true){
global $conf;
$recent = array();
- $names = array();
+ $count = 0;
if(!$num)
return $recent;
@@ -615,37 +668,47 @@ function getRecents($first,$num,$incdel=false,$ns='',$subNS=true){
return $recent;
}
- $loglines = file($conf['changelog']);
- rsort($loglines); //reverse sort on timestamp
-
- foreach ($loglines as $line){
- $line = rtrim($line); //remove newline
- if(empty($line)) continue; //skip empty lines
- $info = split("\t",$line); //split into parts
- //add id if not in yet and file still exists and is allowed to read
- if(!$names[$info[2]] &&
- (@file_exists(wikiFN($info[2])) || $incdel) &&
- (auth_quickaclcheck($info[2]) >= AUTH_READ)
- ){
- // filter namespace
- if (($ns) && (strpos($info[2],$ns.':') !== 0)) continue;
-
- // exclude subnamespaces
- if ((!$subNS) && (getNS($info[2]) != $ns)) continue;
+ $fh = fopen($conf['changelog'],'r');
+ $buf = '';
+ $csz = 4096; //chunksize
+ fseek($fh,0,SEEK_END); // jump to the end
+ $pos = ftell($fh); // position pointer
- $names[$info[2]] = 1;
- if(--$first >= 0) continue; /* skip "first" entries */
-
- $recent[$info[2]]['date'] = $info[0];
- $recent[$info[2]]['ip'] = $info[1];
- $recent[$info[2]]['user'] = $info[3];
- $recent[$info[2]]['sum'] = $info[4];
- $recent[$info[2]]['del'] = !@file_exists(wikiFN($info[2]));
+ // now read backwards into buffer
+ while($pos > 0){
+ $pos -= $csz; // seek to previous chunk...
+ if($pos < 0) $pos = 0; // ...or rest of file
+ fseek($fh,$pos);
+
+ $buf = fread($fh,$csz).$buf; // prepend to buffer
+
+ $lines = explode("\n",$buf); // split buffer into lines
+
+ if($pos > 0){
+ $buf = array_shift($lines); // first one may be still incomplete
}
- if(count($recent) >= $num){
- break; //finish if enough items found
+
+ $cnt = count($lines);
+ if(!$cnt) continue; // no lines yet
+
+ // handle lines
+ for($i = $cnt-1; $i >= 0; $i--){
+ $rec = _handleRecent($lines[$i],$incdel,$ns,$subNS);
+ if($rec !== false){
+ if(--$first >= 0) continue; // skip first entries
+ $recent[] = $rec;
+ $count++;
+
+ // break while when we have enough entries
+ if($count >= $num){
+ $pos = 0; // will break the while loop
+ break; // will break the for loop
+ }
+ }
}
- }
+ }// end of while
+
+ fclose($fh);
return $recent;
}
diff --git a/inc/html.php b/inc/html.php
index 7c4e62cda..1e48b548c 100644
--- a/inc/html.php
+++ b/inc/html.php
@@ -469,15 +469,13 @@ function html_recent($first=0){
print p_locale_xhtml('recent');
print '<ul>';
- $keys = array_keys($recents);
- for ($n=0; $n < $cnt; $n++){
- $id = $keys[$n];
- $date = date($conf['dformat'],$recents[$id]['date']);
+ foreach($recents as $recent){
+ $date = date($conf['dformat'],$recent['date']);
print '<li>';
print $date.' ';
- print '<a href="'.wl($id,"do=diff").'">';
+ print '<a href="'.wl($recent['id'],"do=diff").'">';
$p = array();
$p['src'] = DOKU_BASE.'lib/images/diff.png';
$p['border'] = 0;
@@ -489,7 +487,7 @@ function html_recent($first=0){
print "<img $att />";
print '</a> ';
- print '<a href="'.wl($id,"do=revisions").'">';
+ print '<a href="'.wl($recent['id'],"do=revisions").'">';
$p = array();
$p['src'] = DOKU_BASE.'lib/images/history.png';
$p['border'] = 0;
@@ -501,14 +499,14 @@ function html_recent($first=0){
print "<img $att />";
print '</a> ';
- print html_wikilink(":$id",$conf['useheading']?NULL:$id);
+ print html_wikilink(':'.$recent['id'],$conf['useheading']?NULL:$recent['id']);
- print ' '.htmlspecialchars($recents[$id]['sum']);
+ print ' '.htmlspecialchars($recent['sum']);
print ' <span class="user">';
- if($recents[$id]['user']){
- print $recents[$id]['user'];
+ if($recent['user']){
+ print $recent['user'];
}else{
- print $recents[$id]['ip'];
+ print $recent['ip'];
}
print '</span>';
diff --git a/inc/indexer.php b/inc/indexer.php
index 2bc707269..fe8e74bd9 100644
--- a/inc/indexer.php
+++ b/inc/indexer.php
@@ -34,7 +34,7 @@ function idx_getPageWords($page){
$body = strtr($body, "\r\n\t", ' ');
$tokens = explode(' ', $body);
$tokens = array_count_values($tokens); // count the frequency of each token
-
+
$words = array();
foreach ($tokens as $word => $count) {
@@ -54,7 +54,7 @@ function idx_getPageWords($page){
$words[$word] = $count + (isset($words[$word]) ? $words[$word] : 0);
}
}
-
+
// arrive here with $words = array(word => frequency)
$index = array(); //resulting index