Main Page | Namespace List | Class Hierarchy | Class List | File List | Class Members | File Members | Related Pages

rebuildlinks.inc

Go to the documentation of this file.
<?php

die( "rebuildLinks.inc needs to be updated for the new schema\n" );

# Functions for rebuilding the link tracking tables; must
# be included within a script that also includes the Setup.
# See rebuildlinks.php, for example.
#
# NOTE(review): everything below the die() above is currently unreachable;
# the guard is deliberate and must stay until the code is ported to the
# new schema.

# Buffer this many rows before inserting them all in one sweep. More
# than about 1000 will probably not increase speed significantly on
# most setups.
/* private */ $rowbuf_size = 1000; // 1000 rows ~40 kB

# Build an unambiguous lookup key for a (namespace, DB key) pair.
# The namespace number goes first, followed by a separator, so that
# e.g. ("Foo1", 0) and ("Foo", 10) can never produce the same key.
/* private */ function rebuildLinksTitleKey( $namespace, $dbkey ) {
    return intval( $namespace ) . ':' . $dbkey;
}

# Empty the links, brokenlinks and imagelinks tables and refill them by
# scanning cur_text of every row in cur for [[...]] links.
#
# Takes no parameters and returns nothing; progress is printed to
# standard output. Reads the globals $wgLang and $rowbuf_size.
# The tables are locked for the whole run and unlocked at the end.
function rebuildLinkTables()
{
    error_reporting( E_ALL );
    global $wgLang, $rowbuf_size;

    print "This script may take several hours to complete. If you abort during that time,\n";
    print "your wiki will be in an inconsistent state. If you are going to abort, this is\n";
    print "the time to do it.\n\n";
    print "Press control-c to abort (will proceed automatically in 15 seconds)\n";
    sleep( 15 );

    $count = 0;
    print "Rebuilding link tables.\n";

    print "Setting AUTOCOMMIT=1\n";
    wfQuery( "SET SESSION AUTOCOMMIT=1", DB_WRITE );

    # cur_fast is a slim, indexed copy of cur used for title -> id lookups.
    print "Extracting often used data from cur (may take a few minutes)\n";
    $sql = "CREATE TEMPORARY TABLE cur_fast SELECT cur_namespace, cur_title, cur_id FROM cur";
    wfQuery( $sql, DB_WRITE );
    $sql = "ALTER TABLE cur_fast ADD INDEX(cur_namespace, cur_title)";
    wfQuery( $sql, DB_WRITE );

    print "Locking tables\n";
    $sql = "LOCK TABLES cur READ, cur_fast READ, interwiki READ, user_newtalk READ, " .
        "links WRITE, brokenlinks WRITE, imagelinks WRITE";
    wfQuery( $sql, DB_WRITE );

    foreach ( array( 'links', 'brokenlinks', 'imagelinks' ) as $table ) {
        print "Deleting old data in {$table} table.\n";
        wfQuery( "DELETE FROM {$table}", DB_WRITE );
    }

    print "Finding number of articles to process... ";
    $sql = "SELECT COUNT(*) as count FROM cur";
    $res = wfQuery( $sql, DB_READ );
    $obj = wfFetchObject( $res );
    wfFreeResult( $res );
    $total = $obj->count;
    print "$total\n";

    print "Finding highest article id\n";
    $sql = "SELECT MIN(cur_id) AS min, MAX(cur_id) AS max FROM cur";
    $res = wfQuery( $sql, DB_READ );
    $range = wfFetchObject( $res );
    wfFreeResult( $res );

    # Walk cur in windows of 100 ids so the whole table is never held in RAM.
    $cur_pulser = new SelectPulser( "SELECT cur_id,cur_namespace,cur_title,cur_text " .
        "FROM cur WHERE cur_id ",
        $range->min, $range->max, 100 );

    $brokenlinks_inserter = new InsertBuffer(
        "INSERT IGNORE INTO brokenlinks (bl_from,bl_to) VALUES " , $rowbuf_size );

    $links_inserter = new InsertBuffer(
        "INSERT IGNORE INTO links (l_from,l_to) VALUES ", $rowbuf_size );

    $imagelinks_inserter = new InsertBuffer( "INSERT IGNORE INTO imagelinks ".
        "(il_from,il_to) VALUES ", $rowbuf_size );

    print "Starting processing\n";

    # NOTE(review): "Namespace" is a reserved word from PHP 5.3 on; these
    # static calls will need the renamed class when this file is ported.
    $ins = $wgLang->getNsText( Namespace::getImage() );
    $inslen = strlen( $ins ) + 1;

    $tc = Title::legalChars();

    # Maps raw link text to an already resolved Title object.
    $titleCache = new MRUCache( 10000 );
    $titlecount = 0;
    $start_time = time();

    while ( $row = $cur_pulser->next() ) {

        $from_id = intval( $row->cur_id );
        $ns = $wgLang->getNsText( $row->cur_namespace );
        $from_full_title = $row->cur_title;
        if ( "" != $ns ) {
            $from_full_title = "$ns:{$from_full_title}";
        }
        $from_full_title_with_slashes = addslashes( $from_full_title );
        $text = $row->cur_text;

        $numlinks = preg_match_all( "/\\[\\[([{$tc}]+)(]|\\|)/", $text,
            $m, PREG_PATTERN_ORDER );

        $seen_dbtitles = array(); // seen links (normalized and with ns, see below)
        $titles_ready_for_insertion = array();
        $titles_needing_curdata = array();
        $titles_needing_curdata_pos = array();
        $links_corresponding_to_titles = array();

        for ( $i = 0 ; $i < $numlinks; ++$i ) {
            $link = $m[1][$i];
            if ( preg_match( '/^(http|https|ftp|mailto|news):/', $link ) ) {
                # a URL link; not for us!
                continue;
            }

            # FIXME: Handle subpage links
            $nt = $titleCache->get( $link );
            if ( $nt != false ) {
                // Only process each unique link once per page
                $nt_key = rebuildLinksTitleKey( $nt->getNamespace(), $nt->getDBkey() );
                if ( isset( $seen_dbtitles[$nt_key] ) ) {
                    continue;
                }
                $seen_dbtitles[$nt_key] = 1;

                $titles_ready_for_insertion[] = $nt;
                continue;
            }

            $nt = Title::newFromText( $link );
            if ( !$nt ) {
                // Invalid link, probably something like "[[ ]]"
                continue;
            }

            // Only process each unique link once per page
            $nt_key = rebuildLinksTitleKey( $nt->getNamespace(), $nt->getDBkey() );
            if ( isset( $seen_dbtitles[$nt_key] ) ) {
                continue;
            }
            $seen_dbtitles[$nt_key] = 1;

            if ( $nt->getInterwiki() != "" ) {
                # Interwiki links are not stored in the link tables
                continue;
            }
            if ( $nt->getNamespace() == Namespace::getSpecial() ) {
                # Special links not stored in link tables
                continue;
            }
            if ( $nt->getNamespace() == Namespace::getMedia() ) {
                # treat media: links as image: links
                $nt = Title::makeTitle( Namespace::getImage(), $nt->getDBkey() );
            }
            $nt->mArticleID = 0; // assume broken link until proven otherwise

            $pos = array_push( $titles_needing_curdata, $nt ) - 1;
            $pos_key = rebuildLinksTitleKey( $nt->getNamespace(), $nt->getDBkey() );
            $titles_needing_curdata_pos[$pos_key] = $pos;
            $links_corresponding_to_titles[] = $link;
            unset( $link ); // useless outside this loop, but tempting
        }

        if ( count( $titles_needing_curdata ) > 0 ) {
            # Resolve all not-yet-cached titles of this page in one query.
            $parts = array();
            foreach ( $titles_needing_curdata as $nt ) {
                $parts[] = " (cur_namespace = " . $nt->getNamespace() . " AND " .
                    "cur_title='" . wfStrencode( $nt->getDBkey() ) . "')";
            }
            $sql = "SELECT cur_namespace, cur_title, cur_id FROM cur_fast WHERE " .
                implode( " OR ", $parts );
            # NOTE(review): a SELECT issued with DB_WRITE - presumably because
            # the temporary table only exists on the connection that created
            # it; confirm before changing.
            $res = wfQuery( $sql, DB_WRITE );
            while ( $found = wfFetchObject( $res ) ) {
                $found_key = rebuildLinksTitleKey( $found->cur_namespace, $found->cur_title );
                if ( !isset( $titles_needing_curdata_pos[$found_key] ) ) {
                    // Row does not correspond to any title we asked for
                    continue;
                }
                $pos = $titles_needing_curdata_pos[$found_key];
                $titles_needing_curdata[$pos]->mArticleID = intval( $found->cur_id );
            }
            // One result set per article adds up over a full run
            wfFreeResult( $res );

            foreach ( $titles_needing_curdata as $k => $resolved ) {
                $titleCache->set( $links_corresponding_to_titles[$k], $resolved );
                $titles_ready_for_insertion[] = $resolved;
            }
        }

        foreach ( $titles_ready_for_insertion as $nt ) {
            $dest_noslashes = $nt->getPrefixedDBkey();
            $dest = addslashes( $dest_noslashes );
            $dest_id = $nt->getArticleID();
            $from = $from_full_title_with_slashes;

            # print "\nLINK '$from_full_title' ($from_id) -> '$dest' ($dest_id)\n";

            if ( 0 == strncmp( "$ins:", $dest_noslashes, $inslen ) ) {
                // Image link: stored by image name without the namespace prefix
                $iname = addslashes( substr( $dest_noslashes, $inslen ) );
                $imagelinks_inserter->insert( "('{$from}','{$iname}')" );
            } else if ( 0 == $dest_id ) {
                // Target does not exist
                $brokenlinks_inserter->insert( "({$from_id},'{$dest}')" );
            } else {
                $links_inserter->insert( "('{$from}',{$dest_id})" );
            }
            $titlecount++;
        }

        if ( ( $count % 20 ) == 0 ) {
            print ".";
        }

        if ( ( ++$count % 1000 ) == 0 ) {
            $dt = time() - $start_time;
            $start_time = time();
            $rps = persec( 1000, $dt );
            $tps = persec( $titlecount, $dt );
            $titlecount = 0;
            print "\n$count of $total articles scanned ({$rps} articles ".
                "and {$tps} titles per second)\n";
            print "Title cache hits: " . $titleCache->getPerformance() . "%\n";
        }

    }

    print "\nFlushing insertion buffers...";
    $imagelinks_inserter->flush();
    $links_inserter->flush();
    $brokenlinks_inserter->flush();
    print "ok\n";

    print "$count articles scanned.\n";

    $sql = "UNLOCK TABLES";
    wfQuery( $sql, DB_WRITE );
    print "Done\n";
}

# Format a rate of $n events in $t seconds for the progress output.
# Returns "zero" when nothing happened, "lots of" when no measurable
# time passed, and otherwise the truncated integer rate.
/* private */ function persec( $n, $t ) {
    if ( $n == 0 ) {
        return "zero";
    }
    if ( $t == 0 ) {
        return "lots of";
    }
    return intval( $n / $t );
}

# InsertBuffer increases performance slightly by inserting many rows
# at once. The gain is small (<5%) when running against a local, idle
# database, but may be significant in other circumstances. It also
# limits the number of inserted rows upwards, which should avoid
# problems with huge articles and certain mysql settings that limit
# the size of queries. It's also convenient.

class InsertBuffer {
    /* private */ var $mBuf, $mSql, $mBufcount, $mMaxsize;

    # $sql is the statement prefix up to and including "VALUES ";
    # $bufsize is the number of rows collected before a flush.
    function __construct( $sql, $bufsize ) {
        $this->mSql = $sql;
        $this->mBuf = array();
        $this->mBufcount = 0;
        $this->mMaxsize = $bufsize;
    }

    # Old-style constructor name, kept so interpreters that do not know
    # __construct still initialise the object.
    function InsertBuffer( $sql, $bufsize ) {
        $this->__construct( $sql, $bufsize );
    }

    # Queue one "(...)" value tuple; writes the queue out as soon as it
    # holds mMaxsize rows.
    function insert( $value ) {
        // print $this->mSql . " -> " . $value . "\n";
        $this->mBuf[] = $value;
        $this->mBufcount++;
        if ( $this->mBufcount >= $this->mMaxsize ) {
            $this->flush();
        }
    }

    # Send all queued rows in a single statement; no-op when empty.
    function flush() {
        if ( $this->mBufcount > 0 ) {
            $sql = $this->mSql . implode( ",", $this->mBuf );
            wfQuery( $sql, DB_WRITE );
            $this->mBuf = array();
            $this->mBufcount = 0;
            // print "Wrote query of size " . strlen( $sql ) . "\n";
        }
    }

}

# Select parts from a large table by using the "BETWEEN X AND Y"
# operator on the id column. Avoids buffering the whole thing in
# RAM. It's also convenient.

class SelectPulser {
    /* private */ var $mSql, $mSetsize, $mPos, $mMax, $mSet;

    # $sql     - query text ending in the id column; " BETWEEN x AND y"
    #            is appended to it for every window
    # $min     - lowest id to fetch, or null when the table is empty
    # $max     - highest id to fetch, or null when the table is empty
    # $setsize - width of the id window fetched per query
    function __construct( $sql, $min, $max, $setsize ) {
        if ( $min === null || $max === null ) {
            // MIN()/MAX() over an empty table: make the range empty so
            // that next() never issues a malformed query.
            $min = 1;
            $max = 0;
        }
        $setsize = intval( $setsize );

        $this->mSql = $sql;
        $this->mSet = array();
        $this->mPos = intval( $min );
        $this->mMax = intval( $max );
        // A window of less than one id would never advance mPos
        $this->mSetsize = ( $setsize < 1 ? 1 : $setsize );
    }

    # Old-style constructor name, kept so interpreters that do not know
    # __construct still initialise the object.
    function SelectPulser( $sql, $min, $max, $setsize ) {
        $this->__construct( $sql, $min, $max, $setsize );
    }

    # Return the next row object, or false once every id window up to
    # mMax has been consumed.
    function next() {
        // Serve from the window that is already loaded, if anything is left
        $buffered = current( $this->mSet );
        next( $this->mSet );
        if ( false !== $buffered ) {
            return $buffered;
        }

        // Load following windows until one contains rows
        while ( $this->mPos <= $this->mMax ) {
            $first = $this->mPos;
            $last = $first + $this->mSetsize - 1;
            $this->mPos += $this->mSetsize;

            $this->mSet = array();
            $sql = $this->mSql . " BETWEEN " . $first . " AND " . $last;
            $res = wfQuery( $sql, DB_READ );
            while ( $fetched = wfFetchObject( $res ) ) {
                $this->mSet[] = $fetched;
            }
            wfFreeResult( $res );

            if ( count( $this->mSet ) > 0 ) {
                return $this->next();
            }
        }
        return false;
    }
}

# A simple MRU for general caching.

class MRUCache {
    /* private */ var $mMru, $mCache, $mSize, $mPurgefreq, $nexti;
    /* private */ var $hits, $misses;

    # $size      - number of entries kept after a purge
    # $purgefreq - a purge is attempted when the access counter reaches a
    #              multiple of this; defaults to a tenth of $size, minimum 1
    function __construct( $size, $purgefreq = -1 ) {
        // purgefreq is 1/10 of $size if not stated
        if ( $purgefreq == -1 ) {
            $purgefreq = intval( $size / 10 );
        }
        if ( $purgefreq <= 0 ) {
            $purgefreq = 1;
        }

        $this->mSize = $size;
        $this->mMru = array();      // key => counter value of last access
        $this->mCache = array();    // key => cached value
        $this->mPurgefreq = $purgefreq;
        $this->nexti = 1;
        $this->hits = 0;
        $this->misses = 0;
        print "purgefreq = " . $this->mPurgefreq . "\n";
    }

    # Old-style constructor name, kept so interpreters that do not know
    # __construct still initialise the object.
    function MRUCache( $size, $purgefreq = -1 ) {
        $this->__construct( $size, $purgefreq );
    }

    # Return the value stored under $key, or false when it is not cached.
    # A hit marks the key as most recently used.
    function get( $key ) {
        if ( !array_key_exists( $key, $this->mCache ) ) {
            $this->misses++;
            return false;
        }
        $this->hits++;
        $this->mMru[$key] = $this->nexti++;
        return $this->mCache[$key];
    }

    # Store $value under $key and mark it as most recently used.
    # Note that the counter also advances on cache hits, so the purge
    # check below does not fire on every mPurgefreq-th set().
    function set( $key, $value ) {
        $this->mMru[$key] = $this->nexti++;
        $this->mCache[$key] = $value;

        if ( $this->nexti % $this->mPurgefreq == 0 ) {
            $this->purge();
        }
    }

    # Drop the least recently used entries until at most mSize remain.
    function purge() {
        $to_remove = count( $this->mMru ) - $this->mSize;
        if ( $to_remove <= 0 ) {
            return;
        }
        asort( $this->mMru );
        // Take the victims' keys explicitly: array_splice() would renumber
        // integer-like keys (link texts such as "2004"), so the wrong
        // entries would be removed from mCache and mMru would go out of
        // sync with it.
        $victims = array_slice( array_keys( $this->mMru ), 0, $to_remove );
        foreach ( $victims as $key ) {
            unset( $this->mMru[$key] );
            unset( $this->mCache[$key] );
        }
    }

    # Hit rate so far as a whole percentage (0 when nothing was looked up).
    function getPerformance() {
        $tot = $this->hits + $this->misses;
        if ( $tot > 0 ) {
            return intval( 100.0 * $this->hits / $tot );
        }
        return 0;
    }
}

?>

Generated on Tue Jun 29 23:40:06 2004 for Mediawiki by doxygen 1.3.7