00001 <?php
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
# Character encoding of the UseMod data files; recodeText() converts from it to UTF-8.
$wgImportEncoding = "CP1252";
# Root of the UseMod database tree; pages are read from its page/ and keep/ subdirectories.
$wgRootDirectory = "/home/usemod/wiki-ia/lib-http/db/wiki";

# Field separator byte used inside the data files. Some wikis may use a different char.
$wgFieldSeparator = "\xb3";
$FS = $wgFieldSeparator;
# The three nesting levels use the separator followed by "1", "2" or "3".
$FS1 = "{$FS}1";
$FS2 = "{$FS}2";
$FS3 = "{$FS}3";

# Conversions will be marked with this timestamp (it is written to cur_touched).
$conversiontime = wfTimestampNow();
# Lookup table consulted by checkUserCache(); it starts out empty.
$usercache = array();

wfSeedRandom();
importPages();
00033
00034
# ------------------------------------------------------------------------------
00035
# Walk every first-letter bucket below "$wgRootDirectory/page" ("A" to "Z"
# plus "other") and pass each bucket that exists as a directory to
# importPageDirectory().
function importPages()
{
	global $wgRootDirectory;

	$buckets = range( 'A', 'Z' );
	$buckets[] = 'other';

	foreach( $buckets as $bucket ) {
		$path = $wgRootDirectory . "/page/" . $bucket;
		if( !is_dir( $path ) ) {
			# Buckets that were never created are silently skipped.
			continue;
		}
		importPageDirectory( $path );
	}
}
00050
# Scan one page directory and echo the SQL for every "*.db" file found in it.
#
# $dir    - directory to scan.
# $prefix - text prepended to each page name found in $dir. The recursive
#           call passes "<subdirectory>/", so a page stored in a subdirectory
#           is imported under the title "<subdirectory>/<name>".
#
# Entries that are neither "*.db" files nor directories are reported in a
# "--" line and skipped. Returns nothing.
function importPageDirectory( $dir, $prefix = "" )
{
	echo "\n-- Checking page directory $dir\n";

	$mydir = opendir( $dir );
	if( $mydir === false ) {
		# Nothing can be read from here; report it like the other skips.
		echo "-- Couldn't open directory '$dir'. Skipping.\n";
		return;
	}

	# Compare against false explicitly: an entry named "0" is falsy and would
	# otherwise end the loop before the rest of the directory has been read.
	while( ( $entry = readdir( $mydir ) ) !== false ) {
		if( preg_match( '/^(.+)\.db$/', $entry, $m ) ) {
			echo importPage( $prefix . $m[1] );
		} elseif( is_dir( "$dir/$entry" ) ) {
			# Never descend into the directory itself or its parent.
			if( $entry != '.' && $entry != '..' ) {
				importPageDirectory( "$dir/$entry", "$entry/" );
			}
		} else {
			echo "-- File '$entry' doesn't seem to contain an article. Skipping.\n";
		}
	}

	# Release the handle; the recursion could otherwise hold many open at once.
	closedir( $mydir );
}
00069
00070
00071
# ------------------------------------------------------------------------------
00072
00073
00074
00075
# Placeholder for reading a user record: it is not implemented and always
# terminates the script with die() on its first statement.
#
# $uid - user identifier; it is not referenced anywhere in the body.
#
# NOTE(review): everything after die() is unreachable. It also cannot work as
# written: $title is never defined in this function, and splitHash() is called
# with a single argument although its definition in this file takes
# ( $sep, $str ).
function fetchUser( $uid )
{
	die ( "fetchUser not implemented" );

	global $FS, $FS2, $FS3, $wgRootDirectory;

	# Unreachable draft: builds a path from the undefined $title.
	$fname = $wgRootDirectory . "/page/" . $title;
	if( !file_exists( $fname ) ) return false;

	$data = splitHash( implode( "", file( $fname ) ) );
	# enough?

	return $data;
}
00090
# Map a page title to its path below the page/ or keep/ directory, without
# the file extension.
#
# Pages are filed under the upper-cased first character of the title when
# that character is an ASCII letter, and under "other" otherwise. A title
# that starts with a lower-case letter therefore lives in the upper-case
# bucket: "apple" maps to "A/apple", "1984" to "other/1984".
#
# $title - page title, e.g. "Apple" or "Apple/Subpage".
# Returns "<bucket>/<title>".
function useModFilename( $title ) {
	$c = substr( $title, 0, 1 );
	if( preg_match( '/^[A-Za-z]$/', $c ) ) {
		return strtoupper( $c ) . "/$title";
	}
	return "other/$title";
}
00098
# Load the current revision of a page from
# "<root>/page/<useModFilename( $title )>.db".
#
# The file is read as three nested key/value levels: the page record is split
# with $FS1, its "text_default" entry with $FS2, and that section's "data"
# entry with $FS3.
#
# Dies with a message when the page file does not exist. Otherwise returns an
# object with the properties text, summary, minor, ts, username and host.
function fetchPage( $title )
{
	global $FS, $FS1, $FS2, $FS3, $wgRootDirectory;

	$fname = "{$wgRootDirectory}/page/" . useModFilename( $title ) . ".db";
	if( !file_exists( $fname ) ) {
		die( "Couldn't open file '$fname' for page '$title'.\n" );
	}

	$raw     = file_get_contents( $fname );
	$page    = splitHash( $FS1, $raw );
	$section = splitHash( $FS2, $page["text_default"] );
	$text    = splitHash( $FS3, $section["data"] );

	# text/summary/minor come from the innermost level, the rest from the section.
	$current = array(
		"text"     => $text["text"],
		"summary"  => $text["summary"],
		"minor"    => $text["minor"],
		"ts"       => $section["ts"],
		"username" => $section["username"],
		"host"     => $section["host"] );

	return array2object( $current );
}
00116
# Load the kept (old) revisions of a page from
# "<root>/keep/<useModFilename( $title )>.kp".
#
# Returns an empty array when there is no such file. Otherwise the file is
# split on $FS1, the first chunk is discarded, and every remaining chunk is
# decoded with $FS2 (section) and $FS3 (the section's "data" entry).
#
# A revision is kept only when its text is non-empty, its "minor" field is
# not "" and its "ts" is greater than zero; any other revision is reported
# with a "--" line and dropped. Each kept revision is an object with the
# properties text, summary, minor, ts, username and host.
function fetchKeptPages( $title )
{
	global $FS, $FS1, $FS2, $FS3, $wgRootDirectory, $wgTimezoneCorrection;

	$fname = "{$wgRootDirectory}/keep/" . useModFilename( $title ) . ".kp";
	if( !file_exists( $fname ) ) {
		return array();
	}

	$keptlist = explode( $FS1, file_get_contents( $fname ) );
	array_shift( $keptlist ); # Drop the junk at beginning of file

	$revisions = array();
	foreach( $keptlist as $rev ) {
		$section = splitHash( $FS2, $rev );
		$text = splitHash( $FS3, $section["data"] );

		$usable = $text["text"] && $text["minor"] != "" && ( $section["ts"]*1 > 0 );
		if( !$usable ) {
			echo "-- skipped a bad old revision\n";
			continue;
		}

		$revisions[] = array2object( array(
			"text"     => $text["text"],
			"summary"  => $text["summary"],
			"minor"    => $text["minor"],
			"ts"       => $section["ts"],
			"username" => $section["username"],
			"host"     => $section["host"] ) );
	}
	return $revisions;
}
00141
# Turn a flat "key<sep>value<sep>key<sep>value..." string into an
# associative array.
#
# $sep - separator string.
# $str - the flattened data.
#
# Pieces are taken in pairs (key, value). A trailing piece that has no
# partner is ignored, and a key that occurs more than once keeps its last
# value. Returns the resulting array; it is empty when $str holds fewer than
# two pieces.
function splitHash ( $sep , $str ) {
	$temp = explode ( $sep , $str ) ;
	$ret = array () ;

	# Index key and value explicitly instead of writing
	# "$ret[$temp[$i]] = $temp[++$i]": that form depends on the left-hand
	# index being evaluated before the increment on the right-hand side, an
	# ordering the language does not promise.
	$last = count ( $temp ) - 1 ;
	for ( $i = 0 ; $i < $last ; $i += 2 ) {
		$ret[$temp[$i]] = $temp[$i + 1] ;
	}
	return $ret ;
}
00150
00151
00152
00153
00154
00155
00156
00157
00158
00159
00160
00161
# Placeholder for converting one user account into an INSERT statement for
# the user table: it is not implemented ("NYI") and always terminates the
# script with die() before doing anything.
#
# $uid - user identifier that the unreachable draft passes to fetchUser().
#
# NOTE(review): everything after die() is unreachable, including the
# 'return ""'. The draft below it is inconsistent as written: $stuff is read
# both as an object ($stuff->username) and as an array ($stuff['tzoffset']),
# md5hash() is not defined in this file, and $user_list is declared global
# but never used. It reads $wgTimestampCorrection while fetchKeptPages()
# declares $wgTimezoneCorrection - possibly the same setting; confirm before
# implementing.
function importUser( $uid )
{
	global $last_uid, $user_list, $wgTimestampCorrection;
	die( "importUser NYI" );
	return "";

	$stuff = fetchUser( $uid );
	$last_uid++;

	$name = wfStrencode( $stuff->username );
	$hash = md5hash( $stuff->password ); # Doable?
	$tzoffset = $stuff['tzoffset'] - ($wgTimestampCorrection / 3600); # -8 to 0; +9 to +1
	$hideminor = ($stuff['rcall'] ? 0 : 1);
	# One "key=value" pair per line.
	$options = "cols={$stuff['editcols']}
rows={$stuff['editrows']}
rcdays={$stuff['rcdays']}
timecorrection={$tzoffset}
hideminor={$hideminor}
";

	$sql = "INSERT
	INTO user (user_id,user_name,user_password,user_options)
	VALUES ({$last_uid},'{$name}','{$hash}','{$options}');\n";
	return $sql;
}
00187
# Work out the (user id, user text) pair to record for an edit.
#
# $name - account name stored with the revision; may be empty.
# $host - host stored with the revision; used as the user text when there is
#         no account name.
#
# Returns array( $userid, $username ). The id is the $usercache entry stored
# under $name, or 0 when the name is not cached (for example because user
# accounts have not been imported) or when the edit has no name.
#
# The text is returned raw, not SQL-escaped. importPage() runs it through
# recodeText() and wfStrencode() itself; escaping here as well made every
# name escaped twice, and escaping before recodeText() is the wrong order.
function checkUserCache( $name, $host )
{
	global $usercache;

	if( !$name ) {
		return array( 0, $host );
	}

	# The cache is keyed by name, so test the key. in_array() would search
	# the stored ids instead and never find the name.
	$userid = isset( $usercache[$name] ) ? $usercache[$name] : 0;

	return array( $userid, $name );
}
00206
# Build the SQL that imports one page.
#
# $title - page title as used in the data files.
#
# Echoes a "-- Importing page" progress line, then returns a string holding
# an INSERT into cur for the current revision and, when fetchKeptPages()
# finds any old revisions, one multi-row INSERT into old for them. Every text
# value is passed through recodeText() and then wfStrencode() before it is
# placed between quotes.
function importPage( $title )
{
	global $usercache;
	global $conversiontime;

	echo "\n-- Importing page $title\n";
	$page = fetchPage( $title );

	$newtitle = wfStrencode( recodeText( $title ) );
	$namespace = 0;

	# Current revision:
	$text = wfStrencode( recodeText( $page->text ) );
	$comment = wfStrencode( recodeText( $page->summary ) );
	$minor = 0;
	if( $page->minor ) {
		$minor = 1;
	}
	list( $userid, $username ) = checkUserCache( $page->username, $page->host );
	$username = wfStrencode( recodeText( $username ) );
	$timestamp = wfUnix2Timestamp( $page->ts );
	$redirect = 0;
	if( preg_match( '/^#REDIRECT/', $page->text ) ) {
		$redirect = 1;
	}
	# Value in the range 0..1 stored in cur_random.
	$random = mt_rand() / mt_getrandmax();
	$inverse = wfInvertTimestamp( $timestamp );

	$sql = "
INSERT
	INTO cur (cur_namespace,cur_title,cur_text,cur_comment,cur_user,cur_user_text,cur_timestamp,inverse_timestamp,cur_touched,cur_minor_edit,cur_is_redirect,cur_random) VALUES
	($namespace,'$newtitle','$text','$comment',$userid,'$username','$timestamp','$inverse','$conversiontime',$minor,$redirect,$random);\n";

	# History
	$revisions = fetchKeptPages( $title );
	if( count( $revisions ) == 0 ) {
		return $sql;
	}

	# One "(...)" tuple per old revision; they are joined with commas below.
	$rows = array();
	foreach( $revisions as $rev ) {
		$text = wfStrencode( recodeText( $rev->text ) );
		$minor = 0;
		if( $rev->minor ) {
			$minor = 1;
		}
		list( $userid, $username ) = checkUserCache( $rev->username, $rev->host );
		$username = wfStrencode( recodeText( $username ) );
		$timestamp = wfUnix2Timestamp( $rev->ts );
		$inverse = wfInvertTimestamp( $timestamp );
		$comment = wfStrencode( recodeText( $rev->summary ) );

		$rows[] = "\n\t($namespace,'$newtitle','$text','$comment',$userid,'$username','$timestamp','$inverse',$minor)";
	}

	$sql .= "INSERT
	INTO old (old_namespace,old_title,old_text,old_comment,old_user,old_user_text,old_timestamp,inverse_timestamp,old_minor_edit) VALUES\n";
	$sql .= implode( ",", $rows );
	$sql .= ";\n\n";
	return $sql;
}
00258
00259
# Whee!
# Convert a piece of wiki text to the form stored in the new database:
# CRLF line ends become LF, the bytes are converted from $wgImportEncoding
# to UTF-8, and numeric character references are expanded by wfMungeToUtf8().
function recodeText( $string ) {
	global $wgImportEncoding;

	# For currently latin-1 wikis
	$unixNewlines = str_replace( "\r\n", "\n", $string );
	$utf8 = iconv( $wgImportEncoding, "UTF-8", $unixNewlines );

	return wfMungeToUtf8( $utf8 ); # Any old &#1234; stuff
}
00268
# Encode one Unicode code point as a UTF-8 byte sequence.
#
# $codepoint - integer code point.
#
# Returns the 1 to 4 byte UTF-8 encoding for every code point up to U+10FFFF,
# the end of the Unicode range. Values that have no valid UTF-8 encoding -
# anything above U+10FFFF and the surrogate range U+D800..U+DFFF - are
# returned as the decimal character reference "&#<codepoint>;" instead, so no
# invalid byte sequence is ever produced.
function wfUtf8Sequence($codepoint) {
	if($codepoint < 0x80) return chr($codepoint);

	if($codepoint < 0x800) return chr(($codepoint >> 6 & 0x3f) | 0xc0) .
	                              chr(($codepoint & 0x3f) | 0x80);

	# Surrogates are not characters; encoding them would yield invalid UTF-8.
	if($codepoint >= 0xd800 && $codepoint <= 0xdfff) return "&#$codepoint;";

	if($codepoint < 0x10000) return chr(($codepoint >> 12 & 0x0f) | 0xe0) .
	                                chr(($codepoint >> 6 & 0x3f) | 0x80) .
	                                chr(($codepoint & 0x3f) | 0x80);

	# Four-byte form; the limit is 0x110000 so that U+100000..U+10FFFF is
	# covered as well.
	if($codepoint < 0x110000) return chr(($codepoint >> 18 & 0x07) | 0xf0) .
	                                 chr(($codepoint >> 12 & 0x3f) | 0x80) .
	                                 chr(($codepoint >> 6 & 0x3f) | 0x80) .
	                                 chr(($codepoint & 0x3f) | 0x80);

	# Beyond the Unicode range: leave it as a character reference.
	return "&#$codepoint;";
}
00283
# Replace numeric character references in $string with the characters they
# stand for, using wfUtf8Sequence() for the encoding.
#
# Decimal references ("&#233;") are expanded first, then hexadecimal ones
# ("&#xE9;", the "x" and the digits in either case). References with more
# digits than any code point accepted by wfUtf8Sequence() are left exactly as
# they were. Named entities are not handled.
#
# Callbacks are used instead of the "/e" pattern modifier. That modifier no
# longer exists in PHP 7 and later, where preg_replace() returns NULL for
# such a pattern and the whole text is lost. While it existed, it evaluated
# the digits as PHP source, so "&#065;" was read as an octal literal.
function wfMungeToUtf8($string) {
	$string = preg_replace_callback( '/&#([0-9]+);/', function( $m ) {
		$digits = ltrim( $m[1], '0' );
		# More than 9 digits cannot be a code point and may overflow an int.
		if( strlen( $digits ) > 9 ) return $m[0];
		return wfUtf8Sequence( (int)$digits );
	}, $string );

	$string = preg_replace_callback( '/&#x([0-9a-f]+);/i', function( $m ) {
		$digits = ltrim( $m[1], '0' );
		# More than 7 hex digits cannot be a code point and may overflow an int.
		if( strlen( $digits ) > 7 ) return $m[0];
		return wfUtf8Sequence( hexdec( $digits ) );
	}, $string );

	# Should also do named entities here
	return $string;
}
00290
# Escape a string so it can be placed between quotes in the generated SQL.
#
# $string - raw text.
#
# Returns the text with NUL, newline, carriage return, backslash, single
# quote, double quote and Control-Z each replaced by a backslash sequence.
# This is the same character set that mysql_escape_string() escaped. That
# function no longer exists in PHP 7 and later, so the replacement table is
# applied directly and needs neither the extension nor a database connection.
function wfStrencode( $string ) {
	static $escapes = array(
		"\\"   => "\\\\",
		"\0"   => "\\0",
		"\n"   => "\\n",
		"\r"   => "\\r",
		"'"    => "\\'",
		'"'    => '\\"',
		"\x1a" => "\\Z",
	);

	# strtr() with a table replaces each character once and never rescans
	# its own output, so the inserted backslashes are not escaped again.
	return strtr( (string)$string, $escapes );
}
00294
# Format a Unix timestamp as a 14-digit "YYYYMMDDHHMMSS" string in GMT.
function wfUnix2Timestamp( $unixtime ) {
	$format = "YmdHis";
	$formatted = gmdate( $format, $unixtime );
	return $formatted;
}
00298
# Convert a "YYYYMMDDHHMMSS" timestamp string, interpreted as GMT, back into
# a Unix timestamp.
function wfTimestamp2Unix( $ts )
{
	# Fixed-width fields, cut out by position.
	$year   = (int)substr( $ts, 0, 4 );
	$month  = (int)substr( $ts, 4, 2 );
	$day    = (int)substr( $ts, 6, 2 );
	$hour   = (int)substr( $ts, 8, 2 );
	$minute = (int)substr( $ts, 10, 2 );
	$second = (int)substr( $ts, 12, 2 );

	return gmmktime( $hour, $minute, $second, $month, $day, $year );
}
00306
# Current time as a 14-digit "YYYYMMDDHHMMSS" string in GMT.
function wfTimestampNow() {
	# return NOW
	$now = time();
	return gmdate( "YmdHis", $now );
}
00311
00312
# Sorting hack for MySQL 3, which doesn't use index sorts for DESC
# Replace every decimal digit d in $ts by 9 - d, so that sorting the result
# in ascending order lists the original timestamps in descending order.
# Characters other than digits are left untouched.
function wfInvertTimestamp( $ts ) {
	$digits = "0123456789";
	$mirrored = strrev( $digits );
	return strtr( $ts, $digits, $mirrored );
}
00320
# Seed the mt_rand() generator from the current microtime and record in the
# global $wgRandomSeeded that seeding has taken place.
function wfSeedRandom()
{
	# Without this declaration the assignment below only created a local
	# variable and the flag was never visible outside the function.
	global $wgRandomSeeded;

	# Last 8 hex digits of the hash, masked to a non-negative 31-bit value.
	$seed = hexdec(substr(md5(microtime()),-8)) & 0x7fffffff;
	mt_srand( $seed );
	$wgRandomSeeded = true;
}
00327
# Copy an associative array into a new object: every key becomes a property
# holding the corresponding value.
#
# $arr - array of property name => value.
# Returns a stdClass instance that has exactly those properties.
function array2object( $arr ) {
	# Start from an empty object. Casting a scalar, as in "(object)0", yields
	# an object that already carries a "scalar" property.
	$o = new stdClass;
	foreach( $arr as $x => $y ) {
		$o->$x = $y;
	}
	return $o;
}
00335
00336 ?>