<?php

# Abort immediately: per the message below, this file still needs to be
# updated for the new schema. Nothing after this statement runs.
exit( "rebuildLinks.inc needs to be updated for the new schema\n" );

# Functions for rebuilding the link tracking tables.
#
# This file must be included from a script that also includes the Setup;
# see rebuildlinks.php for an example.

# Number of rows handed to each InsertBuffer as its buffer size, i.e. how
# many rows are collected before they are inserted in one sweep. More than
# about 1000 will probably not increase speed significantly on most setups.
$rowbuf_size = 1000;
00014
# Rebuild the links, brokenlinks and imagelinks tables from the text of
# every row in cur.
#
# Steps, in order:
#  1. Print a warning and wait 15 seconds so the operator can abort.
#  2. Copy (cur_namespace, cur_title, cur_id) into an indexed temporary
#     table cur_fast, used for title -> id lookups.
#  3. Lock the tables involved and empty the three link tables.
#  4. Walk cur in id ranges (SelectPulser), extract [[...]] links from each
#     article text with a regex, resolve every distinct target to a Title,
#     look up its article id, and queue a row for the matching table via
#     an InsertBuffer.
#  5. Flush the buffers and unlock the tables.
#
# Takes no arguments and returns nothing; progress is printed to stdout.
# Relies on the globals $wgLang and $rowbuf_size and on wfQuery(),
# wfFetchObject(), wfFreeResult(), wfStrencode(), Title and Namespace
# being provided by the including script.
function rebuildLinkTables()
{
	error_reporting (E_ALL);
	global $wgLang, $wgLinkCache, $rowbuf_size;

	print "This script may take several hours to complete. If you abort during that time,\n";
	print "your wiki will be in an inconsistent state. If you are going to abort, this is\n";
	print "the time to do it.\n\n";
	print "Press control-c to abort (will proceed automatically in 15 seconds)\n";
	sleep(15);

	$count = 0;
	print "Rebuilding link tables.\n";

	print "Setting AUTOCOMMIT=1\n";
	wfQuery("SET SESSION AUTOCOMMIT=1", DB_WRITE);

	# cur_fast holds only the columns needed to map a title to its id.
	print "Extracting often used data from cur (may take a few minutes)\n";
	$sql = "CREATE TEMPORARY TABLE cur_fast SELECT cur_namespace, cur_title, cur_id FROM cur";
	wfQuery( $sql, DB_WRITE );
	$sql = "ALTER TABLE cur_fast ADD INDEX(cur_namespace, cur_title)";
	wfQuery( $sql, DB_WRITE );

	print "Locking tables\n";
	$sql = "LOCK TABLES cur READ, cur_fast READ, interwiki READ, user_newtalk READ, " .
		"links WRITE, brokenlinks WRITE, imagelinks WRITE";
	wfQuery( $sql, DB_WRITE );


	print "Deleting old data in links table.\n";
	$sql = "DELETE FROM links";
	wfQuery( $sql, DB_WRITE );

	print "Deleting old data in brokenlinks table.\n";
	$sql = "DELETE FROM brokenlinks";
	wfQuery( $sql, DB_WRITE );

	print "Deleting old data in imagelinks table.\n";
	$sql = "DELETE FROM imagelinks";
	wfQuery( $sql, DB_WRITE );

	print "Finding number of articles to process... ";
	$sql = "SELECT COUNT(*) as count FROM cur";
	$res = wfQuery( $sql, DB_READ );
	$obj = wfFetchObject( $res );
	wfFreeResult( $res );
	$total = $obj->count;
	print "$total\n";

	print "Finding highest article id\n";
	$sql = "SELECT MIN(cur_id) AS min, MAX(cur_id) AS max FROM cur";
	$res = wfQuery( $sql, DB_READ );
	$obj = wfFetchObject( $res );
	wfFreeResult( $res );

	# Read cur in id ranges of 100 so the whole table is never held in RAM.
	$cur_pulser = new SelectPulser( "SELECT cur_id,cur_namespace,cur_title,cur_text " .
		"FROM cur WHERE cur_id ",
		$obj->min, $obj->max, 100);

	$brokenlinks_inserter = new InsertBuffer(
		"INSERT IGNORE INTO brokenlinks (bl_from,bl_to) VALUES " , $rowbuf_size);

	$links_inserter = new InsertBuffer(
		"INSERT IGNORE INTO links (l_from,l_to) VALUES ", $rowbuf_size);

	$imagelinks_inserter = new InsertBuffer( "INSERT IGNORE INTO imagelinks ".
		"(il_from,il_to) VALUES ", $rowbuf_size);

	print "Starting processing\n";

	# Localised image namespace name; $inslen includes the trailing ':'.
	$ins = $wgLang->getNsText( Namespace::getImage() );
	$inslen = strlen($ins)+1;

	$tc = Title::legalChars();

	# Maps raw link text to an already resolved Title (article id included).
	$titleCache = new MRUCache( 10000 );
	$titlecount = 0;
	$start_time = time();

	while ( $row = $cur_pulser->next() ) {

		$from_id = intval($row->cur_id);
		$ns = $wgLang->getNsText( $row->cur_namespace );
		$from_full_title = $row->cur_title;
		if ( "" != $ns ) {
			$from_full_title = "$ns:{$from_full_title}";
		}
		$from_full_title_with_slashes = addslashes( $from_full_title );
		$text = $row->cur_text;

		# $m[1] receives the link targets: the text between "[[" and the
		# first "]" or "|".
		$numlinks = preg_match_all( "/\\[\\[([{$tc}]+)(]|\\|)/", $text,
			$m, PREG_PATTERN_ORDER );

		$seen_dbtitles = array();                 # "ns:dbkey" => 1, per-article dedup
		$titles_ready_for_insertion = array();    # Titles whose article id is known
		$titles_needing_curdata = array();        # Titles still needing an id lookup
		$titles_needing_curdata_pos = array();    # "ns:dbkey" => index into the above
		$links_corresponding_to_titles = array(); # raw link text, parallel to the above

		for ( $i = 0 ; $i < $numlinks; ++$i ) {
			$link = $m[1][$i];
			if( preg_match( '/^(http|https|ftp|mailto|news):/', $m[1][$i] ) ) {
				# an URL link; not for us!
				continue;
			}

			# FIXME: Handle subpage links
			$nt = $titleCache->get( $link );
			if( $nt != false ){
				# Cache hit: the Title already carries its article id.
				# The key puts the numeric namespace first, followed by ':',
				# so that e.g. ("Foo1", ns 0) and ("Foo", ns 10) cannot
				# collide the way a plain dbkey . namespace concatenation does.
				$nt_key = $nt->getNamespace() . ':' . $nt->getDBkey();
				if( isset( $seen_dbtitles[$nt_key] ) )
					continue;
				$seen_dbtitles[$nt_key] = 1;

				$titles_ready_for_insertion[] = $nt;
			} else {
				$nt = Title::newFromText( $link );
				if (! $nt) {
					# Title::newFromText() gave nothing usable for this text.
					continue;
				}

				$nt_key = $nt->getNamespace() . ':' . $nt->getDBkey();
				if( isset( $seen_dbtitles[$nt_key] ) )
					continue;
				$seen_dbtitles[$nt_key] = 1;

				if( $nt->getInterwiki() != "" ) {
					# Interwiki links are not stored in the link tables
					continue;
				}
				if( $nt->getNamespace() == Namespace::getSpecial() ) {
					# Special links not stored in link tables
					continue;
				}
				if( $nt->getNamespace() == Namespace::getMedia() ) {
					# treat media: links as image: links
					$nt = Title::makeTitle( Namespace::getImage(), $nt->getDBkey() );
				}
				# Start out as "no such article"; overwritten below if found.
				$nt->mArticleID = 0;

				$pos = array_push($titles_needing_curdata, $nt) - 1;
				$titles_needing_curdata_pos[$nt->getNamespace() . ':' . $nt->getDBkey()] = $pos;
				$links_corresponding_to_titles[] = $link;
				unset( $link );
			}
		}


		if ( count( $titles_needing_curdata ) > 0 ){
			# Resolve all uncached titles of this article in one query.
			$parts = array();
			foreach ($titles_needing_curdata as $nt ) {
				$parts[] = " (cur_namespace = " . $nt->getNamespace() . " AND " .
					"cur_title='" . wfStrencode( $nt->getDBkey() ) . "')";
			}
			$sql = "SELECT cur_namespace, cur_title, cur_id FROM cur_fast WHERE " .
				implode( " OR ", $parts);
			# NOTE(review): presumably issued with DB_WRITE because cur_fast
			# was created through that handle - confirm.
			$res = wfQuery( $sql, DB_WRITE );
			while( $found = wfFetchObject( $res ) ){
				$found_key = $found->cur_namespace . ':' . $found->cur_title;
				# Skip rows that match no requested title instead of
				# dereferencing an undefined index.
				if ( isset( $titles_needing_curdata_pos[$found_key] ) ) {
					$pos = $titles_needing_curdata_pos[$found_key];
					$titles_needing_curdata[$pos]->mArticleID = intval($found->cur_id);
				}
			}
			# One such result set is created per article; release it.
			wfFreeResult( $res );

			for( $k = 0; $k < count( $titles_needing_curdata ) ; $k++) {
				$tmplink = $links_corresponding_to_titles[$k];
				$titleCache->set( $tmplink, $titles_needing_curdata[$k] );
				$titles_ready_for_insertion[] = $titles_needing_curdata[$k];
			}
		}

		foreach ( $titles_ready_for_insertion as $nt ) {
			$dest_noslashes = $nt->getPrefixedDBkey();
			$dest = addslashes( $dest_noslashes );
			$dest_id = $nt->getArticleID();
			$from = $from_full_title_with_slashes;

			# print "\nLINK '$from_full_title' ($from_id) -> '$dest' ($dest_id)\n";

			if ( 0 == strncmp( "$ins:", $dest_noslashes, $inslen ) ) {
				# Target is in the image namespace: store the bare image name.
				$iname = addslashes( substr( $dest_noslashes, $inslen ) );
				$imagelinks_inserter->insert( "('{$from}','{$iname}')" );
			} else if ( 0 == $dest_id ) {
				# Target article not found: broken link.
				$brokenlinks_inserter->insert( "({$from_id},'{$dest}')" );
			} else {
				$links_inserter->insert( "('{$from}',{$dest_id})" );
			}
			$titlecount++;
		}

		# One dot per 20 articles, a statistics line per 1000.
		if ( ( $count % 20 ) == 0 )
			print ".";

		if ( ( ++$count % 1000 ) == 0 ) {
			$dt = time() - $start_time;
			$start_time = time();
			$rps = persec(1000, $dt);
			$tps = persec($titlecount, $dt);
			$titlecount = 0;
			print "\n$count of $total articles scanned ({$rps} articles ".
				"and {$tps} titles per second)\n";
			print "Title cache hits: " . $titleCache->getPerformance() . "%\n";

		}

	}

	print "\nFlushing insertion buffers...";
	$imagelinks_inserter->flush();
	$links_inserter->flush();
	$brokenlinks_inserter->flush();
	print "ok\n";

	print "$count articles scanned.\n";

	$sql = "UNLOCK TABLES";
	wfQuery( $sql, DB_WRITE );
	print "Done\n";
}
00232
# Format a throughput figure for the progress output.
#
# $n  number of items processed
# $t  elapsed time in seconds
#
# Returns the string "zero" when nothing was processed, the string
# "lots of" when no measurable time elapsed, and otherwise the integer
# number of items per second (truncated).
function persec($n, $t){
	if ( $n == 0 ) {
		return "zero";
	}
	# Avoid dividing by zero when the interval was shorter than a second.
	return ( $t == 0 ) ? "lots of" : intval( $n / $t );
}
00240
00241
# InsertBuffer increases performance slightly by inserting many rows
# at once. The gain is small (<5%) when running against a local, idle
# database, but may be significant in other circumstances. It also
# puts an upper limit on the number of rows per statement, which should
# avoid problems with huge articles and certain MySQL settings that limit
# the size of queries. It's also convenient.
class InsertBuffer {
	# mBuf       pending "(...)" value tuples
	# mSql       statement prefix that the tuples are appended to
	# mBufcount  number of tuples currently in mBuf
	# mMaxsize   number of tuples at which the buffer is flushed
	var $mBuf, $mSql, $mBufcount, $mMaxsize;

	# Constructor.
	#
	# $sql      statement prefix ending in "VALUES ", e.g.
	#           "INSERT IGNORE INTO links (l_from,l_to) VALUES "
	# $bufsize  number of rows to collect before writing them out
	#
	# Declared before the legacy same-name constructor below so that PHP 5+
	# picks this one up; PHP 8 no longer treats the same-name method as a
	# constructor at all.
	function __construct( $sql, $bufsize ){
		$this->mSql = $sql;
		$this->mBuf = array();
		$this->mBufcount = 0;
		$this->mMaxsize = $bufsize;
	}

	# Legacy constructor name; kept so that old interpreters and any code
	# calling it explicitly keep working.
	function InsertBuffer( $sql, $bufsize ){
		$this->__construct( $sql, $bufsize );
	}

	# Queue one already escaped value tuple, e.g. "(12,'Foo')". The buffer
	# is written out as soon as it holds mMaxsize tuples, so a statement
	# never carries more rows than the configured size.
	function insert( $value ){
		$this->mBuf[] = $value;
		$this->mBufcount++;
		if( $this->mBufcount >= $this->mMaxsize ){
			$this->flush();
		}
	}

	# Write all queued tuples in a single statement and empty the buffer.
	# Does nothing when the buffer is empty.
	function flush(){
		if( $this->mBufcount > 0 ){
			$sql = $this->mSql . implode( ",", $this->mBuf );
			wfQuery( $sql, DB_WRITE );
			$this->mBuf = array();
			$this->mBufcount = 0;
		}
	}

}
00278
00279
# Select parts from a large table by using the "BETWEEN X AND Y"
# operator on the id column. Avoids buffering the whole thing in
# RAM. It's also convenient.
class SelectPulser {
	# mSql      query prefix ending in the id column name
	# mSetsize  width of each id range
	# mPos      first id of the next range to fetch
	# mMax      highest id to fetch
	# mSet      rows of the range currently being handed out
	var $mSql, $mSetsize, $mPos, $mMax, $mSet;

	# Constructor.
	#
	# $sql      query up to and including the id column, e.g.
	#           "SELECT ... FROM cur WHERE cur_id "
	# $min      lowest id to read
	# $max      highest id to read
	# $setsize  number of ids covered by each query
	#
	# Declared before the legacy same-name constructor below so that PHP 5+
	# picks this one up; PHP 8 no longer treats the same-name method as a
	# constructor at all.
	function __construct( $sql, $min, $max, $setsize ) {
		if ( $min === null || $max === null ) {
			# MIN()/MAX() over an empty table yield NULL. Treat that as an
			# empty range instead of building "BETWEEN  AND ..." from it.
			$min = 1;
			$max = 0;
		}
		if ( $setsize < 1 ) {
			# A non-positive step would never advance mPos in next().
			$setsize = 1;
		}
		$this->mSql = $sql;
		$this->mSet = array();
		$this->mPos = $min;
		$this->mMax = $max;
		$this->mSetsize = $setsize;
	}

	# Legacy constructor name; kept so that old interpreters and any code
	# calling it explicitly keep working.
	function SelectPulser( $sql, $min, $max, $setsize ) {
		$this->__construct( $sql, $min, $max, $setsize );
	}

	# Return the next row object, or false once every id range up to mMax
	# has been read. Id ranges that contain no rows are skipped.
	function next(){
		# Hand out the current row of the loaded range and advance the
		# array's internal pointer.
		$result = current( $this->mSet );
		next( $this->mSet );
		if( false !== $result ){
			return $result;
		}

		# Current range exhausted: load following ranges until one has rows.
		while( $this->mPos <= $this->mMax ){
			$this->mSet = array();
			$sql = $this->mSql . " BETWEEN " . $this->mPos .
				" AND " . ($this->mPos + $this->mSetsize - 1);
			$this->mPos += $this->mSetsize;

			$res = wfQuery( $sql, DB_READ );
			while ( $row = wfFetchObject( $res ) ) {
				$this->mSet[] = $row;
			}
			wfFreeResult( $res );
			if( count( $this->mSet ) > 0 ){
				return $this->next();
			}
		}
		return false;
	}
}
00318
00319
# A simple MRU cache for general caching: it keeps the most recently used
# entries and periodically discards the least recently used ones once
# more than the configured number are stored.
class MRUCache {
	# mMru        key => sequence number of the last access
	# mCache      key => cached value
	# mSize       number of entries kept after a purge
	# mPurgefreq  a purge is attempted whenever nexti is a multiple of this
	# nexti       next access sequence number
	var $mMru, $mCache, $mSize, $mPurgefreq, $nexti;
	# Lookup statistics reported by getPerformance().
	var $hits, $misses;

	# Constructor.
	#
	# $size       number of entries to keep
	# $purgefreq  purge interval; -1 (the default) selects $size/10, and
	#             any resulting value <= 0 is raised to 1
	#
	# Prints the effective purge interval. Declared before the legacy
	# same-name constructor below so that PHP 5+ picks this one up; PHP 8 no
	# longer treats the same-name method as a constructor at all.
	function __construct( $size, $purgefreq = -1 ) {
		$purgefreq = ($purgefreq == -1 ? intval($size/10) : $purgefreq);
		$purgefreq = ($purgefreq <= 0 ? 1 : $purgefreq);

		$this->mSize = $size;
		$this->mMru = array();
		$this->mCache = array();
		$this->mPurgefreq = $purgefreq;
		$this->nexti = 1;
		$this->hits = 0;
		$this->misses = 0;
		print "purgefreq = " . $this->mPurgefreq . "\n";
	}

	# Legacy constructor name; kept so that old interpreters and any code
	# calling it explicitly keep working.
	function MRUCache( $size, $purgefreq = -1 ) {
		$this->__construct( $size, $purgefreq );
	}

	# Return the value stored under $key, or false when there is none.
	# A hit marks the entry as most recently used.
	function get( $key ){
		if ( ! array_key_exists( $key, $this->mCache) ){
			$this->misses++;
			return false;
		}
		$this->hits++;
		$this->mMru[$key] = $this->nexti++;
		return $this->mCache[$key];
	}

	# Store $value under $key, mark it most recently used, and attempt a
	# purge every mPurgefreq sequence numbers.
	function set( $key, $value ){
		$this->mMru[$key] = $this->nexti++;
		$this->mCache[$key] = $value;

		if($this->nexti % $this->mPurgefreq == 0)
			$this->purge();
	}

	# Drop the least recently used entries until at most mSize remain.
	function purge(){
		$to_remove = count( $this->mMru ) - $this->mSize;
		if( $to_remove <= 0 ){
			return;
		}
		# Oldest access first.
		asort( $this->mMru );
		# Remove by key. array_splice() must not be used here: it renumbers
		# integer keys, and PHP stores numeric-string keys such as "1984" as
		# integers, so the wrong cache entries (or none) would be evicted.
		$oldest = array_keys( $this->mMru );
		for( $i = 0; $i < $to_remove; $i++ ){
			$key = $oldest[$i];
			unset( $this->mMru[$key] );
			unset( $this->mCache[$key] );
		}
	}

	# Return the hit rate as an integer percentage of all get() calls so
	# far, or 0 if get() has not been called yet.
	function getPerformance(){
		$tot = $this->hits + $this->misses;
		if($tot > 0)
			return intval(100.0 * $this->hits / $tot);
		else
			return 0;
	}
}
00376
00377 ?>