Mediawiki: importUseModWiki.php Source File

<?php

# This script is retired: announce that and stop before anything below runs.
echo "This script is obsolete!";
echo "It is retained in the source here in case some of its
code might be useful for ad-hoc conversion tasks, but it is
not maintained and probably won't even work as is.";
exit;

/*
	Import data from a UseModWiki into a PediaWiki wiki
	2003-02-09 Brion VIBBER <brion@pobox.com>
	Based loosely on Magnus's code from 2001-2002

	Pass one: walk the page directories, collecting data on links,
	          title case and users.
	Pass two: emit the SQL for the users and pages collected in pass one.
	Separately, be sure to run the link & index rebuilding scripts!
*/

/* Globals */

# Root of the UseModWiki data directory to read from.
$wgRootDirectory = "/home/brion/vikio/wiki-ca/lib-http/db/wiki";

# Field separator byte; some wikis may use a different character.
# $FS1..$FS3 are the separator followed by a nesting-level digit.
$wgFieldSeparator = "\xb3";
$FS = $wgFieldSeparator;
$FS1 = "{$FS}1";
$FS2 = "{$FS}2";
$FS3 = "{$FS}3";

# Images to import: group 1 is the whole upload URL, group 2 the file name.
# The pattern is stored without delimiters; users wrap it in /.../ themselves.
$imageimport = '(http:\/\/(?:www\.|meta\.|)wikipedia\.(?:com|org)\/upload\/(?:[a-z]\/[a-z][0-9]\/)?(.*\.(?:gif|jpg|jpeg|png)))';

# Number of *seconds to add* to a source timestamp to get UTC/GMT.
#$wgTimezoneCorrection = 0; # GMT
$wgTimezoneCorrection = 8 * 3600; # PST - California

# Other options...
$historyonly = false; # Don't add converted revisions to cur table; just get old histories
$lasthistoryonly = false; # Only add the _original_ form of the _current_ revision

/* Vary by language */
$namespaces = array( 0 => "", 1 => "Talk:", 2 => "User:", 3 => "User_talk:", 4
	=> "Wikipedia:", 5 => "Wikipedia_talk:", 6 => "Image:", 7 => "Image_talk:" );
$talkending = "Talk";
$mediatext = "Media";
$conversionscript = "Conversion script";
$conversioncomment = "Automatic conversion";
$redirectcomment = "Automatic conversion, moved to \$1"; # $1 is replaced by the target title
$conversiontime = gmdate( "YmdHis" ); # Conversions will be marked with this timestamp

# Stats and caches
$oldtitles = array();
$usercache = array();  # old user name => new user id
$titlecache = array(); # old page title => Title
$linkcache = array();  # link target => array( link form => number of links )

# Some oversimplified test types
class Title {
	public $title, $namespace;

	/**
	 * Build a Title from a namespace number and a title string.
	 * Declared static because every caller uses Title::fromData().
	 */
	public static function fromData( $namespace, $title ) {
		$x = new Title;
		$x->namespace = $namespace;
		$x->title = $title;
		return $x;
	}
}

# See tests in importTests.php
# empty() keeps the "not set means run" behaviour without an undefined-variable notice.
if( empty( $testingonly ) ) {
	firstPass();
	secondPass();
}

# ------------------------------------------------------------------------------

/* First pass:
	Information please!
	Walks the per-letter page directories under $wgRootDirectory/page.
*/
function firstPass()
{
	global $wgRootDirectory;

	$letters = array_merge( range( 'A', 'Z' ), array( 'other' ) );
	foreach( $letters as $letter ) {
		firstPassDirectory( "$wgRootDirectory/page/$letter" );
	}
}

/**
 * Scan one directory recursively. Every "<title>.db" file is recorded in
 * $titlecache and has its outgoing links counted; any other file is reported
 * in an SQL comment on standard output.
 *
 * @param string $dir directory to scan
 */
function firstPassDirectory( $dir )
{
	global $titlecache;

	$mydir = is_dir( $dir ) ? opendir( $dir ) : false;
	if( $mydir === false ) {
		echo "-- Directory '$dir' could not be opened. Skipping.\n";
		return;
	}

	# Compare against false explicitly: an entry named "0" is falsy.
	while( ( $entry = readdir( $mydir ) ) !== false ) {
		if( $entry == '.' || $entry == '..' ) {
			continue;
		}
		if( is_dir( "$dir/$entry" ) ) {
			firstPassDirectory( "$dir/$entry" );
		} elseif( preg_match( '/^(.+)\.db$/', $entry, $m ) ) {
			$title = $m[1];
			$titlecache[$title] = transformTitle( $title );
			countLinksFrom( $title );
		} else {
			echo "-- File '$entry' doesn't seem to contain an article. Skipping.\n";
		}
	}
	closedir( $mydir );
}

/* Second pass:
	make the dang SQL
	Echoes the SQL for every cached user, then for every cached page.
*/
function secondPass()
{
	global $titlecache, $usercache;

	foreach( array_keys( $usercache ) as $oldname ) {
		echo importUser( (string)$oldname );
	}
	# Cast: PHP turns numeric-looking array keys into integers.
	foreach( array_keys( $titlecache ) as $oldtitle ) {
		echo importPage( (string)$oldtitle );
	}

	echo "\n-- Done!\n";
}


# ------------------------------------------------------------------------------

/* fetch_ functions
	Grab a given item from the database
*/

/**
 * Return $hash[$key], or "" when the key is absent.
 */
function fieldOrEmpty( $hash, $key )
{
	return isset( $hash[$key] ) ? $hash[$key] : "";
}

/**
 * Assemble the revision record shared by fetchPage() and fetchKeptPages()
 * from a decoded section hash and its decoded text hash.
 *
 * @return array with keys text, summary, minor, ts, username, host
 */
function buildRevisionRecord( $section, $text )
{
	return array(
		"text" => fieldOrEmpty( $text, "text" ),
		"summary" => fieldOrEmpty( $text, "summary" ),
		"minor" => fieldOrEmpty( $text, "minor" ),
		"ts" => fieldOrEmpty( $section, "ts" ),
		"username" => fieldOrEmpty( $section, "username" ),
		"host" => fieldOrEmpty( $section, "host" ) );
}

/**
 * Read the stored record of one user.
 *
 * @param mixed $uid UseMod user id
 * @return array|false decoded key/value hash, or false if the file is unreadable
 */
function fetchUser( $uid )
{
	global $FS1, $wgRootDirectory;

	# NOTE(review): this path was copied from fetchPage() and is almost
	# certainly not where user records live -- confirm before enabling
	# importUser(). The $FS1 separator is likewise assumed from the page format.
	$fname = $wgRootDirectory . "/pages/" . $uid;
	if( !file_exists( $fname ) ) return false;

	$raw = file_get_contents( $fname );
	if( $raw === false ) return false;

	return splitHash( $FS1, $raw );
}

/**
 * Read the current revision of a page.
 *
 * The file is a hash split on $FS1; its "text_default" entry is a hash split
 * on $FS2, whose "data" entry is a hash split on $FS3.
 *
 * @param string $title old page title
 * @return array|false record with keys text, summary, minor, ts, username,
 *         host; false if the page file is missing or unreadable
 */
function fetchPage( $title )
{
	global $FS1, $FS2, $FS3, $wgRootDirectory;

	# NOTE(review): firstPass() scans "$wgRootDirectory/page/<letter>/*.db"
	# but this reads "$wgRootDirectory/pages/<title>". The two cannot both be
	# right; reconcile against the real directory layout.
	$fname = $wgRootDirectory . "/pages/" . $title;
	if( !file_exists( $fname ) ) return false;

	$raw = file_get_contents( $fname );
	if( $raw === false ) return false;

	$page = splitHash( $FS1, $raw );
	$section = splitHash( $FS2, fieldOrEmpty( $page, "text_default" ) );
	$text = splitHash( $FS3, fieldOrEmpty( $section, "data" ) );

	return buildRevisionRecord( $section, $text );
}

/**
 * Read the kept (historical) revisions of a page from its ".kp" file.
 * Revisions with no text, no minor flag or no positive timestamp are skipped
 * and reported in an SQL comment on standard output.
 *
 * @param string $title old page title
 * @return array list of revision records shaped like fetchPage()'s result
 */
function fetchKeptPages( $title )
{
	global $FS1, $FS2, $FS3, $wgRootDirectory;

	$fname = $wgRootDirectory . "/keep/" . $title . ".kp";
	if( !file_exists( $fname ) ) return array();

	$raw = file_get_contents( $fname );
	if( $raw === false ) return array();

	$keptlist = explode( $FS1, $raw );
	array_shift( $keptlist ); # Drop the junk at beginning of file

	$revisions = array();
	foreach( $keptlist as $rev ) {
		$section = splitHash( $FS2, $rev );
		$text = splitHash( $FS3, fieldOrEmpty( $section, "data" ) );
		$record = buildRevisionRecord( $section, $text );
		if( $record["text"] !== "" && $record["minor"] !== "" && (int)$record["ts"] > 0 ) {
			$revisions[] = $record;
		} else {
			echo "-- skipped a bad old revision\n";
		}
	}
	return $revisions;
}

/**
 * Decode a flat "key SEP value SEP key SEP value ..." string into a hash.
 * A trailing key with no value is dropped.
 *
 * @param string $sep separator
 * @param string $str encoded string
 * @return array key => value
 */
function splitHash ( $sep , $str ) {
	$temp = explode ( $sep , $str ) ;
	$total = count ( $temp ) ;
	$ret = array () ;
	for ( $i = 0; $i + 1 < $total ; $i += 2 ) {
		$ret[$temp[$i]] = $temp[$i + 1] ;
	}
	return $ret ;
}


/* import_ functions
	Take a fetched item and produce SQL
*/

/* importUser
	$uid is the UseMod user id number.
	The new ones will be assigned arbitrarily and are for internal use only.

	THIS IS DELAYED SINCE PUBLIC DUMPS DONT INCLUDE USER DIR
*/
function importUser( $uid )
{
	global $last_uid, $wgTimezoneCorrection;

	# Deliberately disabled (see above); everything below is kept for reference.
	return "";

	$stuff = fetchUser( $uid );
	if( $stuff === false ) return "";
	$last_uid++;

	$name = wfStrencode( fieldOrEmpty( $stuff, 'username' ) );
	# NOTE(review): md5hash() is not defined in this file; supply it before
	# enabling this code.
	$hash = md5hash( fieldOrEmpty( $stuff, 'password' ) ); # Doable?
	$tzoffset = (int)fieldOrEmpty( $stuff, 'tzoffset' ) - ( $wgTimezoneCorrection / 3600 ); # -8 to 0; +9 to +1
	$hideminor = ( fieldOrEmpty( $stuff, 'rcall' ) ? 0 : 1 );
	$options = wfStrencode(
		"cols=" . fieldOrEmpty( $stuff, 'editcols' ) . "\n" .
		"rows=" . fieldOrEmpty( $stuff, 'editrows' ) . "\n" .
		"rcdays=" . fieldOrEmpty( $stuff, 'rcdays' ) . "\n" .
		"timecorrection={$tzoffset}\n" .
		"hideminor={$hideminor}\n" );

	$sql = "INSERT\n" .
		"\tINTO user (user_id,user_name,user_password,user_options)\n" .
		"\tVALUES ({$last_uid},'{$name}','{$hash}','{$options}');\n";
	return $sql;
}

/**
 * Resolve the author of a revision.
 *
 * @param string $name user name recorded on the revision, may be empty
 * @param string $host host recorded on the revision, used when $name is empty
 * @return array ( user id, SQL-escaped user text ); the id is 0 for
 *         anonymous edits and for users not present in $usercache
 */
function checkUserCache( $name, $host )
{
	global $usercache;

	if( $name ) {
		# $usercache is keyed by name, so test the key, not the values.
		# The id is 0 if we haven't imported user accounts.
		$userid = isset( $usercache[$name] ) ? (int)$usercache[$name] : 0;
		$username = wfStrencode( $name );
	} else {
		$userid = 0;
		$username = wfStrencode( $host );
	}
	return array( $userid, $username );
}

/**
 * Format one revision record as a value tuple for the `old` table, in the
 * column order used by importPage().
 */
function importPageOldRow( $namespace, $newtitle, $rev )
{
	global $wgTimezoneCorrection;

	$text = wfStrencode( $rev["text"] );
	$comment = wfStrencode( $rev["summary"] );
	$minor = ( $rev["minor"] ? 1 : 0 );
	list( $userid, $username ) = checkUserCache( $rev["username"], $rev["host"] );
	$timestamp = wfUnix2Timestamp( (int)$rev["ts"] + $wgTimezoneCorrection );

	return "\t\t($namespace,'$newtitle','$text','$comment',$userid,'$username','$timestamp',$minor)";
}

/**
 * Produce the SQL for one page: unless $historyonly is set, a `cur` row
 * holding the rewritten text attributed to the conversion script; then `old`
 * rows for the kept revisions (unless $lasthistoryonly is set) followed by
 * the original form of the current revision.
 *
 * @param string $title old page title
 * @return string SQL, or an SQL comment if the page could not be read
 */
function importPage( $title )
{
	global $conversionscript, $conversioncomment, $conversiontime;
	global $historyonly, $lasthistoryonly;

	$page = fetchPage( $title );
	if( $page === false ) {
		return "-- Could not read page '$title'. Skipping.\n";
	}

	$t = renamePage( $title );
	$newtitle = wfStrencode( $t->title );
	$namespace = (int)$t->namespace;

	$sql = "\n";
	if( !$historyonly ) {
		$newtext = wfStrencode( rewritePage( $title, $page["text"] ) );
		$minor = ( $page["minor"] ? 1 : 0 );
		$redirect = ( preg_match( '/^#REDIRECT/', $page["text"] ) ? 1 : 0 );
		# Localised strings may contain quotes, so escape them as well.
		$ccomment = wfStrencode( $conversioncomment );
		$cscript = wfStrencode( $conversionscript );
		$sql .= "INSERT\n" .
			"\tINTO cur (cur_namespace,cur_title,cur_text,cur_comment,cur_user,cur_user_text,cur_timestamp,cur_is_redirect,cur_minor_edit)\n" .
			"\tVALUES ($namespace,'$newtitle','$newtext','$ccomment',0,'$cscript','$conversiontime',$redirect,$minor);\n";
	}

	# History first, the current revision's original form last.
	$rows = array();
	if( !$lasthistoryonly ) {
		foreach( fetchKeptPages( $title ) as $rev ) {
			$rows[] = importPageOldRow( $namespace, $newtitle, $rev );
		}
	}
	$rows[] = importPageOldRow( $namespace, $newtitle, $page );

	$sql .= "INSERT\n" .
		"\tINTO old (old_namespace,old_title,old_text,old_comment,old_user,old_user_text,old_timestamp,old_minor_edit)\n" .
		"\tVALUES\n" . implode( ",\n", $rows ) . ";\n";
	return $sql;
}


# Count up basic links
/**
 * Count every [[link]] in a page's current text, ignoring <nowiki> sections.
 * Does nothing when the page cannot be read.
 *
 * @param string $title old page title
 */
function countLinksFrom( $title )
{
	$page = fetchPage( $title );
	if( $page === false ) return;

	$text = (string)preg_replace(
		'/<nowiki>.*<\/nowiki>/sDU',
		'',
		$page["text"] );
	$found = preg_match_all(
		'/\[\[\s*([0-9a-zA-Z_ \x80-\xff]+)\s*(?:\|\s*([^]]+))?\s*\]\]/',
		$text,
		$matches );
	if( $found ) {
		foreach( $matches[1] as $target ) {
			countLinkTo( ucfirst( $target ) );
		}
	}
}

/**
 * Record one link to $title in $linkcache.
 *
 * NOTE(review): the cache is keyed by the link target as written and counts
 * its normalized form, so each key only ever holds a single form, while
 * renamePage() looks entries up by page title and picks the most frequent
 * form. The keying looks inverted; confirm the intent before relying on it.
 *
 * @param string $title link target as written (first letter upper-cased)
 */
function countLinkTo( $title )
{
	global $linkcache;

	$t = transformTitle( $title );
	$linkform = FreeToNormal( $t->title );
	if( !isset( $linkcache[$title][$linkform] ) ) {
		$linkcache[$title][$linkform] = 0;
	}
	$linkcache[$title][$linkform]++;
}

# Preferentially change case
/**
 * Work out the new Title for a page, using the most frequently linked-to
 * form as the title. When that differs from the plain transformed title, the
 * SQL for any redirects is echoed, as the other progress output is.
 *
 * @param string $title old page title
 * @return Title
 */
function renamePage( $title )
{
	global $linkcache;

	$t = transformTitle( $title );
	$forms = isset( $linkcache[$title] ) ? $linkcache[$title] : array();

	# We want to use the most frequently linked-to form as the title
	$maxcount = 0 ;
	$maxform = $t->title ;
	foreach ( $forms as $linkform => $count ) {
		if ( $count > $maxcount ) {
			$maxcount = $count ;
			$maxform = (string)$linkform ;
		}
	}
	if( $maxform !== (string)$t->title ) {
		echo doRenamePage( $t, $maxform );
		$t->title = $maxform;
	}
	return $t;
}

/**
 * Build the SQL creating redirects from every other recorded link form of a
 * page to the chosen form.
 *
 * @param Title|string $title page before renaming
 * @param string $maxform chosen title form
 * @return string one INSERT statement, or "" when no redirect is needed
 */
function doRenamePage( $title, $maxform )
{
	global $linkcache, $namespaces, $redirectcomment, $conversionscript, $conversiontime;

	if( is_object( $title ) ) {
		$key = $title->title;
		$namespace = (int)$title->namespace;
	} else {
		$key = $title;
		$namespace = 0;
	}
	if( empty( $linkcache[$key] ) ) return "";

	$prefix = isset( $namespaces[$namespace] ) ? $namespaces[$namespace] : "";
	$comment = wfStrencode( str_replace( '$1', $maxform, $redirectcomment ) );
	# NOTE(review): redirect body assumed from the /^#REDIRECT/ test in importPage().
	$redirtext = wfStrencode( "#REDIRECT [[" . $prefix . $maxform . "]]" );
	$cscript = wfStrencode( $conversionscript );

	$redirsql = array();
	foreach( $linkcache[$key] as $linkform => $count ) {
		if( (string)$linkform !== (string)$maxform ) {
			$redirtitle = wfStrencode( (string)$linkform );
			$redirsql[] = "($namespace,'$redirtitle','$redirtext','$comment',0,'$cscript','$conversiontime',1,1)";
		}
	}
	if( !$redirsql ) return "";

	return "INSERT INTO cur (cur_namespace,cur_title,cur_text,cur_comment,cur_user,cur_user_text,cur_timestamp,cur_is_redirect,cur_minor_edit)\n" .
		"\tVALUES " . implode( ",\n\t", $redirsql ) . ";\n";
}

# Account for syntax changes
/**
 * Convert a page's text to the new syntax, leaving <nowiki> sections as is.
 *
 * @param string $title old page title
 * @param string $text page text
 * @return string converted text
 */
function rewritePage( $title, $text )
{
	$text = removeTalkLink( $text );

	# With one capture group, odd-numbered parts are the <nowiki> sections
	# and even-numbered parts are the text between them.
	$parts = preg_split( '/(<nowiki>.*?<\/nowiki>)/s', $text, -1, PREG_SPLIT_DELIM_CAPTURE );
	if( $parts === false ) return $text;

	foreach( $parts as $i => $part ) {
		if( $i % 2 == 0 ) {
			$parts[$i] = rewritePageBits( $title, $part );
		}
	}
	return implode( '', $parts );
}

/**
 * Apply the link conversions to one stretch of text outside <nowiki>.
 */
function rewritePageBits( $title, $text ) {
	$text = fixSubpages( $text, $title ); # fixSubpages() takes the text first
	$text = fixMedialinks( $text );
	$text = fixImagelinks( $text );
	return $text;
}

/**
 * Strip "/Talk" links (bare or in [[ ]]), with the newlines before them and
 * the whitespace after them.
 */
function removeTalkLink( &$text ) {
	global $talkending;
	$ending = preg_quote( $talkending, '#' );
	return preg_replace( '#\n*(?:\[\[)?/' . $ending . '(?:\]\])?\s*#sDi', '', $text );
}

/**
 * Turn relative subpage links into full links below $title:
 * a bare "/Name" becomes [[title/Name|/Name]], [[/name]] becomes
 * [[title/Name|/name]] and [[/name|label]] becomes [[title/Name|label]].
 *
 * @param string $text text to convert
 * @param string $title title of the page the text belongs to
 * @return string converted text
 */
function fixSubpages( $text, &$title ) {
	$base = $title;

	# Callbacks keep $title and the matched text out of replacement syntax.
	$text = preg_replace_callback( '#(^|\s)/([A-Z\xc0-\xdf].*?)\b#',
		function ( $m ) use ( $base ) {
			return "{$m[1]}[[{$base}/{$m[2]}|/{$m[2]}]]";
		}, $text );
	$text = preg_replace_callback( '#\[\[/([^|]*?)\]\]#',
		function ( $m ) use ( $base ) {
			return "[[{$base}/" . ucfirst( "{$m[1]}|/{$m[1]}]]" );
		}, $text );
	$text = preg_replace_callback( '#\[\[/(.*?)\]\]#',
		function ( $m ) use ( $base ) {
			return "[[{$base}/" . ucfirst( "{$m[1]}]]" );
		}, $text );
	return $text;
}

/**
 * Replace bare upload URLs matching $imageimport with image-namespace links.
 */
function fixImagelinks( &$text ) {
	global $imageimport, $namespaces;
	$prefix = $namespaces[6];
	return preg_replace_callback( "/$imageimport/",
		function ( $m ) use ( $prefix ) {
			return "[[" . $prefix . fetchMediaFile( $m[1], $m[2] ) . "]]";
		}, $text );
}

/**
 * Replace bracketed upload URLs, "[url]" and "[url label]", with media links.
 * As before, $text is taken by reference and updated after the first step.
 */
function fixMedialinks( &$text ) {
	global $imageimport, $mediatext;
	$prefix = $mediatext;

	$text = preg_replace_callback( "/\[$imageimport\]/",
		function ( $m ) use ( $prefix ) {
			return "[[{$prefix}:" . fetchMediaFile( $m[1], $m[2] ) . "]]";
		}, $text );
	return preg_replace_callback( "/\[$imageimport (.+?)\]/",
		function ( $m ) use ( $prefix ) {
			return "[[{$prefix}:" . fetchMediaFile( $m[1], $m[2] ) . "|{$m[3]}]]";
		}, $text );
}

/**
 * Return the local name for an uploaded file.
 */
function fetchMediaFile( $url, $filename )
{
	# Copy an image file into local upload space
	# FIXME: nothing is copied yet; $url is unused.
	return ucfirst( $filename );
}

# Simple move of talk pages, etc
/**
 * Split an old title into namespace and title: "<page>/Talk" becomes <page>
 * in namespace 1, anything else stays as is in namespace 0.
 *
 * @param string $title old title
 * @param bool $dorename unused
 * @return Title
 */
function transformTitle( $title, $dorename = false )
{
	global $talkending;
	$ending = preg_quote( $talkending, '/' );
	if( preg_match( '/^(.+)[ _]?\/[ _]?(' . $ending . ')/i', $title, $m ) ) {
		$thetitle = $m[1];
		$namespace = 1;
	} else {
		$thetitle = $title;
		$namespace = 0;
	}
	return Title::fromData( $namespace, $thetitle );
}

# Translated out of old usemod wiki...
/**
 * Normalize a free-form title: spaces become underscores, the first letter
 * is upper-cased, runs of underscores collapse, and underscores at either
 * end or next to a slash are dropped. With $FreeUpper, a lower-case letter
 * following one of - _ . , / is upper-cased too.
 */
function FreeToNormal ( $id , $FreeUpper = true ) {
	$id = str_replace ( " ", "_", $id ) ;
	$id = ucfirst($id);
	if (strpos($id, '_') !== false) { # Quick check for any space/underscores
		$id = preg_replace ( '/__+/' , "_" , $id ) ;
		$id = preg_replace ( '/^_/' , "", $id ) ;
		$id = preg_replace ( '/_$/' , "", $id ) ;
		#if ($UseSubpage) {
		$id = preg_replace ( '|_/|', "/" , $id ) ;
		$id = preg_replace ( '|/_|', "/" , $id ) ;
		#}
	}
	if ($FreeUpper) {
		# Note that letters after ' are *not* capitalized
		$id = preg_replace_callback ( '|([-_.,/])([a-z])|' ,
			function ( $m ) {
				return $m[1] . strtoupper ( $m[2] ) ;
			} , $id ) ;
	}
	return $id;
}

# Whee!
# Placeholder for character set conversion; currently returns its input.
function recodeInput( $text )
{
	return $text;
}

/**
 * Format a Unix time as a 14-digit "YmdHis" timestamp in GMT.
 */
function wfUnix2Timestamp( $unixtime ) {
	return gmdate( "YmdHis", $unixtime );
}

/**
 * Parse a 14-digit "YmdHis" GMT timestamp into a Unix time.
 */
function wfTimestamp2Unix( $ts )
{
	return gmmktime( ( (int)substr( $ts, 8, 2) ),
		(int)substr( $ts, 10, 2 ), (int)substr( $ts, 12, 2 ),
		(int)substr( $ts, 4, 2 ), (int)substr( $ts, 6, 2 ),
		(int)substr( $ts, 0, 4 ) );
}