<?php

# Announce that this converter is retired, then stop.
# Because of the exit below, nothing further down in this file is executed
# when the script is run directly.
echo "This script is obsolete!";
echo "It is retained in the source here in case some of its
code might be useful for ad-hoc conversion tasks, but it is
not maintained and probably won't even work as is.";

exit;
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
# Root directory the fetch*() and firstPass() functions build their file paths from
$wgRootDirectory = "/home/brion/vikio/wiki-ca/lib-http/db/wiki";

# Field separator byte. Some wikis may use a different character.
$wgFieldSeparator = "\xb3";

# Short alias plus the three numbered variants (separator followed by a digit)
$FS  = $wgFieldSeparator;
$FS1 = "{$FS}1";
$FS2 = "{$FS}2";
$FS3 = "{$FS}3";
00028
00029
# Images to import.
# Group 1 captures the whole upload URL, group 2 the file name at its end.
$imageimport = '(http:\/\/(?:www\.|meta\.|)wikipedia\.(?:com|org)\/upload\/(?:[a-z]\/[a-z][0-9]\/)?(.*\.(?:gif|jpg|jpeg|png)))';

# Number of *seconds to add* to timestamp to get UTC/GMT
#$wgTimezoneCorrection = 0; # GMT
$wgTimezoneCorrection = 8 * 3600; # PST - California

# Other options...
# Don't add converted revisions to cur table; just get old histories
$historyonly = false;
# Only add the _original_ form of the _current_ revision
$lasthistoryonly = false;
00039
00040
# Title prefix for each namespace number
$namespaces = array(
	0 => "",
	1 => "Talk:",
	2 => "User:",
	3 => "User_talk:",
	4 => "Wikipedia:",
	5 => "Wikipedia_talk:",
	6 => "Image:",
	7 => "Image_talk:" );

# Subpage name that marks a talk page in the old wiki (matched case-insensitively)
$talkending = "Talk";
# Prefix used by fixMedialinks() when it rewrites bracketed upload URLs
$mediatext = "Media";

# User name and edit summaries attached to rows this script generates
$conversionscript = "Conversion script";
$conversioncomment = "Automatic conversion";
# \$1 is replaced with the redirect target in doRenamePage().
# Spelling fixed: this text used to read "converion".
$redirectcomment = "Automatic conversion, moved to \$1";

# Conversions will be marked with this timestamp
$conversiontime = gmdate( "YmdHis" );
00049
00050
# Stats and caches: all four start out as separate empty arrays
# (PHP arrays are copied by value, so the chained assignment does not alias them)
$oldtitles = $usercache = $titlecache = $linkcache = array();
00055
00056
# Some oversimplified test types

# Bare-bones title record: a namespace number plus the title text.
class Title {
	public $title;
	public $namespace;

	# Build a Title from a namespace number and a title string.
	#
	# Declared static because the rest of this file calls it as
	# Title::fromData(); calling a non-static method that way is an
	# error on current PHP versions.
	public static function fromData( $namespace, $title ) {
		$x = new Title;
		$x->namespace = $namespace;
		$x->title = $title;
		return $x;
	}
}
00066
00067
# See tests in importTests.php
#
# Run both passes unless $testingonly has been set to a true value.
# NOTE(review): presumably importTests.php sets $testingonly before including
# this file - confirm. empty() is used so that running the script on its own,
# where the variable was never defined, does not raise an undefined-variable
# warning.
if( empty( $testingonly ) ) {
	firstPass();
	secondPass();
}
00072
00073
# ------------------------------------------------------------------------------
00074
00075
00076
00077
# First pass: hand each per-letter page directory under
# "$wgRootDirectory/page/" to firstPassDirectory().
# The directories visited are 'A' through 'Z' followed by 'other'.
function firstPass()
{
	global $wgRootDirectory, $oldtitles;

	$buckets = range( 'A', 'Z' );
	$buckets[] = 'other';

	foreach( $buckets as $bucket ) {
		firstPassDirectory( $wgRootDirectory . "/page/" . $bucket );
	}
}
00090
# Walk one directory of the old wiki recursively.
#
# Sub-directories are descended into. Every "<name>.db" file is registered in
# $titlecache under <name> (the value is the result of transformTitle()) and
# its outgoing links are counted with countLinksFrom(). Any other file is
# reported on stdout and skipped.
#
# $dir - path of the directory to scan
function firstPassDirectory( $dir )
{
	global $titlecache;

	if( !is_dir( $dir ) ) {
		echo "-- Directory '$dir' doesn't exist. Skipping.\n";
		return;
	}
	$mydir = opendir( $dir );
	if( $mydir === false ) {
		echo "-- Directory '$dir' can't be opened. Skipping.\n";
		return;
	}

	# Compare against false explicitly: an entry named "0" is falsy and
	# would otherwise end the loop early.
	while( ( $entry = readdir( $mydir ) ) !== false ) {
		if( $entry == '.' || $entry == '..' ) {
			continue;
		}

		if( is_dir( "$dir/$entry" ) ) {
			firstPassDirectory( "$dir/$entry" );
		} elseif( preg_match( '/^(.+)\.db$/', $entry, $m ) ) {
			# The file name minus its .db extension is the old page title
			$title = $m[1];
			$titlecache[$title] = transformTitle( $title );
			countLinksFrom( $title );
		} else {
			echo "-- File '$entry' doesn't seem to contain an article. Skipping.\n";
		}
	}
	closedir( $mydir );
}
00109
00110
00111
00112
# Second pass: print whatever importUser() returns for each key of
# $usercache, then whatever importPage() returns for each key of
# $titlecache, and finish with a "Done" marker line.
function secondPass()
{
	global $titlecache, $usercache, $redirects;

	# Only the keys are used; the cached values are ignored here
	foreach( $usercache as $legacyName => $ignoredUser ) {
		$chunk = importUser( $legacyName );
		echo $chunk;
	}

	foreach( $titlecache as $legacyTitle => $ignoredTitle ) {
		$chunk = importPage( $legacyTitle );
		echo $chunk;
	}

	echo "\n-- Done!\n";
}
00126
00127
00128
# ------------------------------------------------------------------------------
00129
00130
00131
00132
# Load the stored record for one user and return it as a key => value array.
#
# $uid - identifier of the user; used as the file name
# Returns the array produced by splitHash(), or false if the file is missing.
#
# Fixes over the earlier version: the path was built from an undefined $title
# instead of $uid, and splitHash() was called without its separator argument.
#
# NOTE(review): the "/pages/" directory is kept from the old code and looks
# copied from fetchPage(); confirm where user files really live.
# NOTE(review): $FS1 is assumed to be the top-level separator, as it is in
# fetchKeptPages(); confirm against real user files.
function fetchUser( $uid )
{
	global $FS1, $wgRootDirectory;

	$fname = $wgRootDirectory . "/pages/" . $uid;
	if( !is_file( $fname ) ) {
		return false;
	}

	$raw = file_get_contents( $fname );
	if( $raw === false ) {
		return false;
	}

	# enough?
	return splitHash( $FS1, $raw );
}
00145
# Load the current revision of a page from "$wgRootDirectory/pages/<title>".
#
# The file is unpacked in three levels with splitHash(): the whole file on
# $FS1, its "text_default" entry on $FS2, and that section's "data" entry on
# $FS3.
#
# $title - old page title; used as the file name
# Returns false if the file is missing, otherwise an array with the keys
# text, summary, minor, ts, username and host. Fields that are absent from
# the file come back as empty strings.
#
# Fix over the earlier version: the first splitHash() call had no separator
# argument. $FS1 is used because fetchKeptPages() splits its file on $FS1
# before applying $FS2 and $FS3 in the same way.
function fetchPage( $title )
{
	global $FS1, $FS2, $FS3, $wgRootDirectory;

	$fname = $wgRootDirectory . "/pages/" . $title;
	if( !is_file( $fname ) ) {
		return false;
	}
	$raw = file_get_contents( $fname );
	if( $raw === false ) {
		return false;
	}

	$page = splitHash( $FS1, $raw );
	$section = splitHash( $FS2, isset( $page["text_default"] ) ? $page["text_default"] : "" );
	$text = splitHash( $FS3, isset( $section["data"] ) ? $section["data"] : "" );

	return array(
		"text"     => isset( $text["text"] ) ? $text["text"] : "",
		"summary"  => isset( $text["summary"] ) ? $text["summary"] : "",
		"minor"    => isset( $text["minor"] ) ? $text["minor"] : "",
		"ts"       => isset( $section["ts"] ) ? $section["ts"] : "",
		"username" => isset( $section["username"] ) ? $section["username"] : "",
		"host"     => isset( $section["host"] ) ? $section["host"] : "" );
}
00161
# Load the kept (older) revisions of a page from
# "$wgRootDirectory/keep/<title>.kp".
#
# The file is split on $FS1 and the first piece is discarded. Each remaining
# piece is unpacked with splitHash() on $FS2, and its "data" entry on $FS3.
# A revision is kept only if it has non-empty text, a non-empty "minor" field
# and a positive "ts"; anything else is reported on stdout and skipped.
#
# $title - old page title; used as the file name
# Returns a list of arrays with the keys text, summary, minor, ts, username
# and host. The list is empty if the file does not exist.
#
# Fix over the earlier version: $FS1 was used without being declared global,
# so explode() received an empty delimiter.
function fetchKeptPages( $title )
{
	global $FS1, $FS2, $FS3, $wgRootDirectory;

	$fname = $wgRootDirectory . "/keep/" . $title . ".kp";
	if( !is_file( $fname ) ) {
		return array();
	}
	$raw = file_get_contents( $fname );
	if( $raw === false ) {
		return array();
	}

	$keptlist = explode( $FS1, $raw );
	array_shift( $keptlist ); # Drop the junk at beginning of file

	$revisions = array();
	foreach( $keptlist as $rev ) {
		$section = splitHash( $FS2, $rev );
		$text = splitHash( $FS3, isset( $section["data"] ) ? $section["data"] : "" );

		$usable = !empty( $text["text"] )
			&& isset( $text["minor"] ) && $text["minor"] != ""
			&& isset( $section["ts"] ) && (int)$section["ts"] > 0;

		if( $usable ) {
			$revisions[] = array(
				"text"     => $text["text"],
				"summary"  => isset( $text["summary"] ) ? $text["summary"] : "",
				"minor"    => $text["minor"],
				"ts"       => $section["ts"],
				"username" => isset( $section["username"] ) ? $section["username"] : "",
				"host"     => isset( $section["host"] ) ? $section["host"] : "" );
		} else {
			echo "-- skipped a bad old revision\n";
		}
	}
	return $revisions;
}
00186
# Turn a flat "key SEP value SEP key SEP value ..." string into an
# associative array.
#
# $sep - separator string handed to explode()
# $str - the packed string
# Pieces are paired up in order; a trailing piece with no partner is dropped,
# and a repeated key keeps its last value.
function splitHash ( $sep , $str ) {
	$pieces = explode( $sep, $str );
	$total = count( $pieces );
	$hash = array();

	# $k always points at the value; its key sits one slot before it
	for( $k = 1; $k < $total; $k += 2 ) {
		$hash[$pieces[$k - 1]] = $pieces[$k];
	}
	return $hash;
}
00195
00196
00197
00198
00199
00200
00201
00202
00203
00204
00205
00206
# Build the SQL INSERT for one user account.
#
# $uid - identifier handed to fetchUser()
# Returns the SQL text. At the moment that is always the empty string: the
# unconditional return below switches user import off, and everything after
# it is unreachable reference code.
#
# The reference code was brought in line with the rest of the file: it now
# reads the global $wgTimezoneCorrection (there is no $wgTimestampCorrection)
# and uses array access, because fetchUser() returns an array, not an object.
#
# NOTE(review): md5hash() and wfStrencode() are not defined in this file -
# confirm they exist before enabling this code.
function importUser( $uid )
{
	global $last_uid, $user_list, $wgTimezoneCorrection;

	return "";

	$stuff = fetchUser( $uid );
	$last_uid++;

	$name = wfStrencode( $stuff['username'] );
	$hash = md5hash( $stuff['password'] ); # Doable?
	$tzoffset = $stuff['tzoffset'] - ($wgTimezoneCorrection / 3600); # -8 to 0; +9 to +1
	$hideminor = ($stuff['rcall'] ? 0 : 1);
	$options = "cols={$stuff['editcols']}
rows={$stuff['editrows']}
rcdays={$stuff['rcdays']}
timecorrection={$tzoffset}
hideminor={$hideminor}
";

	$sql = "INSERT
	INTO user (user_id,user_name,user_password,user_options)
	VALUES ({$last_uid},'{$name}','{$hash}','{$options}');\n";
	return $sql;
}
00232
# Work out the (user id, user text) pair to store for an edit.
#
# $name - account name recorded on the edit; may be empty
# $host - host recorded on the edit; used when there is no account name
# Returns array( id, text ), where text has been passed through wfStrencode().
# The id is looked up in $usercache by name and is 0 when the name is not
# cached or the edit has no name.
#
# Fix over the earlier version: the lookup used in_array(), which searches
# the cache's values, although the cache is indexed by name.
function checkUserCache( $name, $host )
{
	global $usercache;

	if( !$name ) {
		return array( 0, wfStrencode( $host ) );
	}

	if( isset( $usercache[$name] ) ) {
		$userid = $usercache[$name];
	} else {
		# If we haven't imported user accounts
		$userid = 0;
	}
	return array( $userid, wfStrencode( $name ) );
}
00251
# Build the SQL that imports one page.
#
# Unless $historyonly is set, a row for the `cur` table is emitted holding the
# rewritten text, attributed to the conversion script. A single multi-row
# INSERT for the `old` table follows: the kept revisions (unless
# $lasthistoryonly is set) and finally the unmodified current revision.
#
# $title - old page title, as used by fetchPage() and fetchKeptPages()
# Returns the SQL text, or an SQL comment line if the page can't be loaded.
#
# Fixes over the earlier version:
#  - fetchPage() and fetchKeptPages() return arrays; they were read with
#    object syntax, and the timestamp was read from "timestamp" although the
#    key is "ts"
#  - $comment was never set; it now comes from each revision's summary
#  - history rows carried an extra $redirect value, giving 9 values for the
#    8 columns of the `old` insert
#  - the `old` insert had no terminating semicolon
#  - a page that can't be loaded is skipped instead of being imported empty
function importPage( $title )
{
	global $wgTimezoneCorrection;
	global $conversionscript, $conversioncomment, $conversiontime;
	global $historyonly, $lasthistoryonly;

	$page = fetchPage( $title );
	if( !is_array( $page ) ) {
		return "-- Page '$title' couldn't be loaded. Skipping.\n";
	}

	$newtext = wfStrencode( rewritePage( $title, $page['text'] ) );
	$t = renamePage( $title );
	if( !is_object( $t ) ) {
		# renamePage() gave us nothing usable; fall back to the plain mapping
		$t = transformTitle( $title );
	}
	$newtitle = wfStrencode( $t->title );
	$namespace = $t->namespace;

	# Current revision:
	$text = wfStrencode( $page['text'] );
	$comment = wfStrencode( $page['summary'] );
	$minor = ( $page['minor'] ? 1 : 0 );
	list( $userid, $username ) = checkUserCache( $page['username'], $page['host'] );
	$timestamp = wfUnix2Timestamp( (int)$page['ts'] + $wgTimezoneCorrection );
	$redirect = ( preg_match( '/^#REDIRECT/', $page['text'] ) ? 1 : 0 );

	$sql = "\n";
	if( !$historyonly ) {
		$sql .= "INSERT
	INTO cur (cur_namespace,cur_title,cur_text,cur_comment,cur_user,cur_user_text,cur_timestamp,cur_is_redirect,cur_minor_edit)
	VALUES ($namespace,'$newtitle','$newtext','$conversioncomment',0,'$conversionscript','$conversiontime',$redirect,$minor);\n";
	}

	$sql .= "INSERT
	INTO old (old_namespace,old_title,old_text,old_comment,old_user,old_user_text,old_timestamp,old_minor_edit)
	VALUES";
	# The current revision goes last, so it is the row that closes the statement
	$sqlfinal = "\t\t($namespace,'$newtitle','$text','$comment',$userid,'$username','$timestamp',$minor);\n";

	# History
	if( !$lasthistoryonly ) {
		foreach( fetchKeptPages( $title ) as $rev ) {
			$revtext = wfStrencode( $rev['text'] );
			$revcomment = wfStrencode( $rev['summary'] );
			$revminor = ( $rev['minor'] ? 1 : 0 );
			list( $revuserid, $revusername ) = checkUserCache( $rev['username'], $rev['host'] );
			$revtimestamp = wfUnix2Timestamp( (int)$rev['ts'] + $wgTimezoneCorrection );
			$sql .= "\t\t($namespace,'$newtitle','$revtext','$revcomment',$revuserid,'$revusername','$revtimestamp',$revminor),\n";
		}
	}
	return $sql . $sqlfinal;
}
00295
00296
00297
# Count up basic links
#
# Loads the page with fetchPage(), drops everything between <nowiki> tags,
# and calls countLinkTo() once for every [[link]] or [[link|label]] found.
# The link target is passed with its first letter upper-cased.
#
# $title - old page title of the page to scan
# Returns nothing. A page that can't be loaded is ignored.
#
# Fixes over the earlier version: the page is an array (it was read with
# object syntax), and the counting relied on the /e regex modifier, which
# current PHP versions no longer support.
function countLinksFrom( $title )
{
	$page = fetchPage( $title );
	if( !is_array( $page ) || !isset( $page['text'] ) ) {
		return;
	}

	$text = preg_replace(
		'/<nowiki>.*<\/nowiki>/sDU',
		'',
		$page['text'] );

	$found = preg_match_all(
		'/\[\[\s*([0-9a-zA-Z_ \x80-\xff]+)\s*(?:\|\s*([^]]+))?\s*\]\]/',
		(string)$text,
		$matches );
	if( !$found ) {
		return;
	}
	foreach( $matches[1] as $target ) {
		countLinkTo( ucfirst( $target ) );
	}
}
00310
# Record one link to a page.
#
# $linkcache maps each link target to an array of "normalised spelling =>
# number of times seen". The spelling is FreeToNormal() applied to the title
# part returned by transformTitle().
#
# $title - link target as written in the page (first letter upper-cased by
#          the caller)
# Returns nothing; updates the global $linkcache.
#
# Fix over the earlier version: the first link to a target read an unset
# cache entry and passed it to count(), which is an error on current PHP.
function countLinkTo( $title )
{
	global $linkcache;

	$t = transformTitle( $title );
	$linkform = FreeToNormal( $t->title );

	if( !isset( $linkcache[$title] ) || !is_array( $linkcache[$title] ) ) {
		$linkcache[$title] = array();
	}
	if( empty( $linkcache[$title][$linkform] ) ) {
		$linkcache[$title][$linkform] = 1;
	} else {
		$linkcache[$title][$linkform]++;
	}
}
00326
00327
# Preferentially change case
#
# Maps the old title with transformTitle(), then replaces its title text with
# the spelling that $linkcache recorded most often for this page, if any.
#
# $title - old page title
# Returns the resulting Title object (namespace plus chosen title text).
#
# Fixes over the earlier version: the function returned nothing although
# importPage() reads ->title and ->namespace from its result, and it iterated
# over $linkcache[$title] even when no links to the page had been counted.
#
# NOTE(review): the SQL that doRenamePage() returns is still discarded here,
# as it always was; decide where the redirect rows should be written.
function renamePage( $title )
{
	global $linkcache;

	$t = transformTitle( $title );

	# We want to use the most frequently linked-to form as the title
	$maxcount = 0;
	$maxform = $t->title;
	if( isset( $linkcache[$title] ) && is_array( $linkcache[$title] ) ) {
		foreach( $linkcache[$title] as $linkform => $count ) {
			if( $count > $maxcount ) {
				$maxcount = $count;
				$maxform = $linkform;
			}
		}
	}

	if( $maxform != $t->title ) {
		# Pass the plain title: doRenamePage() uses it as a $linkcache key,
		# and an object can't be used as an array key
		doRenamePage( $title, $maxform );
		$t->title = $maxform;
	}
	return $t;
}
00345
# Build the SQL that turns every alternative spelling of a page into a
# redirect to the chosen spelling.
#
# $title   - the page: either its old title (string) or a Title object. It is
#            used to find the page's entry in $linkcache and its namespace.
# $maxform - the spelling that was chosen as the real title
# Returns one INSERT statement for the `cur` table with a row per alternative
# spelling, or "" when there is nothing to redirect.
#
# Fixes over the earlier version: $namespace and $redirtitle were never set,
# the row had no value for cur_text (8 values for 9 columns), and a page
# without alternative spellings produced "VALUES ;".
#
# NOTE(review): "#REDIRECT [[target]]" is assumed as the redirect text because
# importPage() recognises redirects by a leading "#REDIRECT" - confirm the
# exact syntax the target wiki expects.
function doRenamePage( $title, $maxform )
{
	global $linkcache, $redirectcomment, $conversionscript, $conversiontime;

	if( is_object( $title ) ) {
		$key = $title->title;
		$namespace = $title->namespace;
	} else {
		$key = $title;
		$t = transformTitle( $title );
		$namespace = $t->namespace;
	}
	if( empty( $linkcache[$key] ) || !is_array( $linkcache[$key] ) ) {
		return "";
	}

	# The same for every row, so build them once
	$comment = wfStrencode( str_replace( '$1', $maxform, $redirectcomment ) );
	$redirtext = wfStrencode( "#REDIRECT [[" . $maxform . "]]" );

	$redirsql = array();
	foreach( $linkcache[$key] as $linkform => $count ) {
		# Array keys that look like integers come back as ints, so compare as strings
		if( (string)$linkform !== (string)$maxform ) {
			$redirtitle = wfStrencode( (string)$linkform );
			$redirsql[] = "($namespace,'$redirtitle','$redirtext','$comment',0,'$conversionscript','$conversiontime',1,1)";
		}
	}
	if( !$redirsql ) {
		return "";
	}

	return "INSERT INTO cur (cur_namespace,cur_title,cur_text,cur_comment,cur_user,cur_user_text,cur_timestamp,cur_is_redirect,cur_minor_edit)
	VALUES " . implode( ",\n\t", $redirsql ) . ";\n";
}
00361
00362
# Account for syntax changes
#
# Removes the talk link with removeTalkLink(), then runs rewritePageBits()
# over every stretch of text that lies outside <nowiki>...</nowiki>. The
# <nowiki> sections themselves are passed through untouched.
#
# $title - old page title, handed on to rewritePageBits()
# $text  - page text
# Returns the rewritten text.
#
# Fix over the earlier version: it used a /e regex (no longer supported by
# current PHP) whose replacement passed only the first capture group to
# rewritePageBits(), so the matched text itself was thrown away.
# NOTE(review): an opening <nowiki> with no closing tag is treated as
# ordinary text here - confirm that is acceptable.
function rewritePage( $title, $text )
{
	# ...
	$text = removeTalkLink( $text );

	# With PREG_SPLIT_DELIM_CAPTURE the pieces alternate: even positions are
	# ordinary text, odd positions are complete <nowiki> sections
	$pieces = preg_split( '/(<nowiki>.*?<\/nowiki>)/s', (string)$text, -1, PREG_SPLIT_DELIM_CAPTURE );
	if( $pieces === false ) {
		return $text;
	}
	foreach( $pieces as $i => $piece ) {
		if( $i % 2 == 0 && $piece !== '' ) {
			$pieces[$i] = rewritePageBits( $title, $piece );
		}
	}
	return implode( '', $pieces );
}
00372
# Apply the individual link rewrites to one piece of page text, in this
# order: fixSubpages(), fixMedialinks(), fixImagelinks().
#
# $title - title of the page the text belongs to
# $text  - the text to rewrite
# Returns the rewritten text.
#
# Fix over the earlier version: fixSubpages() is declared as ( $text, $title )
# but was called with the two arguments the other way round.
function rewritePageBits( $title, $text ) {
	$text = fixSubpages( $text, $title );
	$text = fixMedialinks( $text );
	$text = fixImagelinks( $text );
	return $text;
}
00379
# Strip links to the talk subpage out of a text.
#
# Every occurrence of "/<talkending>" is removed (case-insensitively),
# together with optional [[ ]] around it, any newlines directly before it
# and any whitespace directly after it.
#
# $text - the text to clean; taken by reference but not modified
# Returns the cleaned text.
function removeTalkLink( &$text ) {
	global $talkending;

	$pattern = "[\\n*(?:\[\[)?/{$talkending}(?:\]\])?\\s*]sDi";
	$cleaned = preg_replace( $pattern, '', $text );
	return $cleaned;
}
00384
# Rewrite subpage links so that they name their parent page explicitly.
#
#   /Sub         ->  [[Title/Sub|/Sub]]   (bare link at the start of the text
#                                          or after whitespace; must begin
#                                          with A-Z or a byte in C0-DF)
#   [[/sub]]     ->  [[Title/Sub|/sub]]
#   [[/sub|txt]] ->  [[Title/Sub|txt]]
#
# In the two bracketed forms the first letter after "Title/" is upper-cased.
#
# $text  - the text to rewrite
# $title - title of the page the text belongs to (taken by reference, not
#          modified)
# Returns the rewritten text.
#
# Fix over the earlier version: two of the three replacements used the /e
# regex modifier, which current PHP versions no longer support. All three now
# go through callbacks, which also keeps '$' or '\' in the title from being
# read as a back-reference.
function fixSubpages( $text, &$title ) {
	$parent = $title;

	$text = preg_replace_callback( "<(^|\s)/([A-Z\xc0-\xdf].*?)\b>",
		function( $m ) use ( $parent ) {
			return $m[1] . "[[" . $parent . "/" . $m[2] . "|/" . $m[2] . "]]";
		},
		$text );

	$text = preg_replace_callback( "<\[\[/([^|]*?)\]\]>",
		function( $m ) use ( $parent ) {
			return "[[" . $parent . "/" . ucfirst( $m[1] . "|/" . $m[1] . "]]" );
		},
		$text );

	$text = preg_replace_callback( "<\[\[/(.*?)\]\]>",
		function( $m ) use ( $parent ) {
			return "[[" . $parent . "/" . ucfirst( $m[1] . "]]" );
		},
		$text );

	return $text;
}
00395
# Replace every bare upload URL matched by $imageimport with a link of the
# form [[<namespace 6 prefix><file>]], where <file> is whatever
# fetchMediaFile() returns for the URL and file name.
#
# $text - the text to rewrite; taken by reference but not modified
# Returns the rewritten text.
#
# Fix over the earlier version: it relied on the /e regex modifier, which
# current PHP versions no longer support.
function fixImagelinks( &$text ) {
	global $imageimport, $namespaces;

	$prefix = $namespaces[6];
	return preg_replace_callback( "/$imageimport/",
		function( $m ) use ( $prefix ) {
			# $m[1] is the full URL, $m[2] the file name
			return "[[" . $prefix . fetchMediaFile( $m[1], $m[2] ) . "]]";
		},
		$text );
}
00402
# Rewrite bracketed upload URLs into media links.
#
#   [URL]        ->  [[<mediatext>:<file>]]
#   [URL label]  ->  [[<mediatext>:<file>|label]]
#
# URL is anything matched by $imageimport, and <file> is whatever
# fetchMediaFile() returns for the URL and file name.
#
# $text - the text to rewrite. It is taken by reference, and the result of
#         the first replacement is written back into the caller's variable.
# Returns the text after both replacements.
#
# Fix over the earlier version: it relied on the /e regex modifier, which
# current PHP versions no longer support.
function fixMedialinks( &$text ) {
	global $imageimport, $mediatext;

	$prefix = "[[" . $mediatext . ":";

	$text = preg_replace_callback( '/\[' . $imageimport . '\]/',
		function( $m ) use ( $prefix ) {
			return $prefix . fetchMediaFile( $m[1], $m[2] ) . "]]";
		},
		$text );

	return preg_replace_callback( '/\[' . $imageimport . ' (.+?)\]/',
		function( $m ) use ( $prefix ) {
			# $m[3] is the label that followed the URL
			return $prefix . fetchMediaFile( $m[1], $m[2] ) . "|" . $m[3] . "]]";
		},
		$text );
}
00412
# Copy an image file into local upload space
# FIXME
#
# For now nothing is copied: the URL is ignored and the file name is handed
# back with its first character upper-cased.
#
# $url      - address of the file (unused)
# $filename - file name taken from the end of the URL
function fetchMediaFile( $url, $filename )
{
	$localname = ucfirst( $filename );
	return $localname;
}
00419
00420
# Simple move of talk pages, etc
#
# A title of the form "<page>/<talkending>" (case-insensitive, with an
# optional space or underscore on either side of the slash) becomes <page> in
# namespace 1. Every other title is kept unchanged in namespace 0.
#
# $title    - old page title
# $dorename - accepted but not used
# Returns a Title built with Title::fromData().
function transformTitle( $title, $dorename = false )
{
	global $talkending;

	$istalk = preg_match( "/^(.+)[ _]?\\/[ _]?($talkending)/i", $title, $m );
	if( !$istalk ) {
		return Title::fromData( 0, $title );
	}
	return Title::fromData( 1, $m[1] );
}
00433
00434
# Translated out of old usemod wiki...
#
# Normalise a free-form link into a canonical title:
#  - spaces become underscores and the first character is upper-cased
#  - runs of underscores collapse to one; an underscore at the very start or
#    end, or next to a slash, is dropped
#  - if $FreeUpper is set, a lower-case a-z directly after one of - _ . , ( ) /
#    is upper-cased
#
# $id        - the link text
# $FreeUpper - whether to capitalise after punctuation (default true)
# Returns the normalised title.
#
# Fix over the earlier version: the capitalisation step used the /e regex
# modifier, which current PHP versions no longer support, so the function
# returned null instead of the title.
function FreeToNormal ( $id , $FreeUpper = true ) {
	$id = str_replace( " ", "_", $id );
	$id = ucfirst( $id );

	if( strpos( $id, '_' ) !== false ) { # Quick check for any space/underscores
		$id = preg_replace( '/__+/', "_", $id );
		$id = preg_replace( '/^_/', "", $id );
		$id = preg_replace( '/_$/', "", $id );
		#if ($UseSubpage) {
		$id = preg_replace( '|_/|', "/", $id );
		$id = preg_replace( '|/_|', "/", $id );
		#}
	}

	if( $FreeUpper ) {
		# Note that letters after ' are *not* capitalized
		$id = preg_replace_callback( '|([-_.,\(\)/])([a-z])|',
			function( $m ) {
				return $m[1] . strtoupper( $m[2] );
			},
			$id );
	}
	return $id;
}
00455
00456
# Whee!
#
# Placeholder for character-set recoding: the text is currently handed back
# exactly as it came in.
#
# $text - text to recode
function recodeInput( $text )
{
	$recoded = $text;
	return $recoded;
}
00461
# Format a Unix timestamp as a 14-digit "YYYYMMDDHHMMSS" string in UTC.
#
# $unixtime - seconds since the Unix epoch
# Returns the formatted string.
#
# Fix over the earlier version: it formatted an undefined $timestamp variable
# instead of the $unixtime parameter, so the argument was ignored.
function wfUnix2Timestamp( $unixtime ) {
	return gmdate( "YmdHis", (int)$unixtime );
}
00465
# Convert a 14-digit "YYYYMMDDHHMMSS" string (taken to be UTC) into a Unix
# timestamp.
#
# $ts - the timestamp string
# Returns the number of seconds since the Unix epoch, as computed by gmmktime().
function wfTimestamp2Unix( $ts )
{
	# Cut the fixed-width fields out of the string
	$year   = (int)substr( $ts, 0, 4 );
	$month  = (int)substr( $ts, 4, 2 );
	$day    = (int)substr( $ts, 6, 2 );
	$hour   = (int)substr( $ts, 8, 2 );
	$minute = (int)substr( $ts, 10, 2 );
	$second = (int)substr( $ts, 12, 2 );

	return gmmktime( $hour, $minute, $second, $month, $day, $year );
}
00473
00474 ?>