<?php
# Splits a text into a stream of tokens.
#
# Every token is an array with the keys 'type' and 'text'. Runs of ordinary
# characters become tokens of type 'text'; the special sequences recognised
# by scanSpecial() become tokens of their own, which are parked in
# $mQueuedToken and handed out by the following nextToken() call.
# All offsets and lengths are byte based.
class Tokenizer {
    # Text to be processed by the tokenizer.
    public $mText = '';

    # Current (byte) position of the tokenizer in $mText.
    public $mPos = 0;

    # Length of $mText in bytes.
    public $mTextLength = 0;

    # Tokens that were already found, but not returned yet.
    public $mQueuedToken = array();

    # Value of $wgLang->linkPrefixExtension() at construction time. When it
    # is true, the non-blank characters directly in front of a double opening
    # bracket are moved out of the text token into the bracket token.
    public $linkPrefixExtension = false;

    # Sets up an empty tokenizer; use newFromString() to get one with a text.
    # Reads the global $wgLang, which has to be an object offering
    # linkPrefixExtension().
    public function __construct()
    {
        global $wgLang;

        $this->mPos = 0;
        # The queue has to start out as an array: previewToken() and
        # nextToken() call count() and array_unshift() on it.
        $this->mQueuedToken = array();
        $this->linkPrefixExtension = $wgLang->linkPrefixExtension();
    }

    # Old-style constructor name, kept as a plain method for callers that
    # still invoke it explicitly. It simply re-runs the initialisation.
    public function Tokenizer()
    {
        $this->__construct();
    }

    # Factory function: returns a new Tokenizer positioned at the start
    # of $s.
    public static function newFromString( $s )
    {
        $fname = 'Tokenizer::newFromString';
        wfProfileIn( $fname );

        $s = (string)$s;
        $t = new Tokenizer();
        $t->mText = $s;
        $t->mTextLength = strlen( $s );

        wfProfileOut( $fname );
        return $t;
    }

    # Returns the token that the next call of nextToken() will return,
    # without removing it from the stream.
    public function previewToken()
    {
        $fname = 'Tokenizer::previewToken';
        wfProfileIn( $fname );

        if ( count( $this->mQueuedToken ) > 0 ) {
            $token = $this->mQueuedToken[0];
        } else {
            # Fetch a fresh token and put it back at the head of the queue,
            # in front of anything nextToken() queued while producing it.
            $token = $this->nextToken();
            array_unshift( $this->mQueuedToken, $token );
        }

        wfProfileOut( $fname );
        return $token;
    }

    # Returns the next token, or false once the text is used up.
    #
    # Queued tokens are returned first. Otherwise characters are collected
    # into a 'text' token until a special sequence shows up; the special
    # token is queued and the text collected so far (possibly empty) is
    # returned.
    public function nextToken()
    {
        $fname = 'Tokenizer::nextToken';
        wfProfileIn( $fname );

        if ( count( $this->mQueuedToken ) > 0 ) {
            $token = array_shift( $this->mQueuedToken );
        } elseif ( $this->mPos > $this->mTextLength ) {
            $token = false;
        } else {
            $token = array( 'text' => '', 'type' => 'text' );

            # The loop deliberately runs one step past the last character
            # (with an empty $ch), so that $mPos ends up greater than
            # $mTextLength and the following call reports the end.
            while ( $this->mPos <= $this->mTextLength ) {
                $ch = ( $this->mPos < $this->mTextLength )
                    ? $this->mText[$this->mPos]
                    : '';

                $queued = ( $ch === '' ) ? null : $this->scanSpecial( $ch, $token );
                if ( $queued !== null ) {
                    $this->mQueuedToken[] = $queued;
                    break;
                }

                $token['text'] .= $ch;
                $this->mPos++;
            }
        }

        wfProfileOut( $fname );
        return $token;
    }

    # Checks whether a special sequence starts at the current position.
    #
    # $ch is the character at $mPos, $token the text token being collected
    # (passed by reference, because some sequences take characters back
    # out of it). On a match $mPos is moved behind the sequence and the
    # special token is returned; otherwise null is returned and nothing
    # is changed.
    private function scanSpecial( $ch, &$token )
    {
        switch ( $ch ) {
            case 'R':
                if ( $this->continues( 'FC ' ) ) {
                    # Only three bytes are skipped, so the blank behind
                    # "RFC" is read again as part of the following text.
                    $this->mPos += 3;
                    return array( 'text' => 'RFC ', 'type' => 'RFC ' );
                }
                break;

            case 'I':
                if ( $this->continues( 'SBN ' ) ) {
                    # As for RFC: the trailing blank is not skipped.
                    $this->mPos += 4;
                    return array( 'text' => 'ISBN ', 'type' => 'ISBN ' );
                }
                break;

            case '[':
                if ( $this->continues( '[[' ) ) {
                    $this->mPos += 3;
                    return array( 'type' => '[[[', 'text' => '' );
                }
                if ( $this->continues( '[' ) ) {
                    $prefix = '';
                    if ( $this->linkPrefixExtension ) {
                        # Move the trailing run of non-blank characters
                        # from the text token into the bracket token.
                        $text = $token['text'];
                        $textLen = strlen( $text );
                        $cut = $textLen;
                        while ( $cut > 0 && !ctype_space( $text[$cut - 1] ) ) {
                            $cut--;
                        }
                        if ( $cut < $textLen ) {
                            $prefix = substr( $text, $cut );
                            $token['text'] = substr( $text, 0, $cut );
                        }
                    }
                    $this->mPos += 2;
                    return array( 'type' => '[[', 'text' => $prefix );
                }
                break;

            case ']':
                if ( $this->continues( ']' ) ) {
                    $this->mPos += 2;
                    return array( 'type' => ']]', 'text' => '' );
                }
                break;

            case "'":
                if ( $this->continues( "'" ) ) {
                    # The type is the whole run of apostrophes (two or
                    # more). 'pos' is the offset of the last but one
                    # apostrophe of the run.
                    $run = strspn( $this->mText, "'", $this->mPos );
                    $quoteToken = array(
                        'type' => str_repeat( "'", $run ),
                        'text' => '',
                        'pos' => $this->mPos + $run - 2,
                    );
                    $this->mPos += $run;
                    return $quoteToken;
                }
                break;

            case "\n":
            case "\r":
                if ( $this->continues( '----' ) ) {
                    # Skip the line break, the four dashes and any
                    # further dashes behind them.
                    $this->mPos += 5;
                    $this->mPos += strspn( $this->mText, '-', $this->mPos );
                    return array( 'type' => '----', 'text' => '' );
                }
                for ( $level = 1; $level <= 6; $level++ ) {
                    if ( $this->continues( '<h' . $level ) ) {
                        # Only the line break is consumed; the tag itself
                        # stays in the text.
                        $this->mPos++;
                        return array( 'type' => 'h', 'text' => '' );
                    }
                }
                break;

            case '!':
            case '?':
            case ':':
                if ( $this->preceeded( ' ' ) ) {
                    # The blank moves from the text token into the
                    # 'blank' token.
                    $token['text'] = substr( $token['text'], 0, -1 );
                    $this->mPos++;
                    return array( 'type' => 'blank', 'text' => " {$ch}" );
                }
                break;

            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
                # Digit, blank, digit. Offset $mPos + 2 has to be inside
                # the text before it is read.
                if ( $this->mPos + 2 < $this->mTextLength
                    && $this->mText[$this->mPos + 1] === ' '
                    && ctype_digit( $this->mText[$this->mPos + 2] )
                ) {
                    $this->mPos += 2;
                    return array( 'type' => 'blank', 'text' => $ch . ' ' );
                }
                break;

            case "\302":
                # Bytes \302\253 followed by a blank.
                if ( $this->continues( "\253 " ) ) {
                    $this->mPos += 3;
                    return array( 'type' => 'blank', 'text' => "\302\253 " );
                }
                break;

            case "\273":
                # A blank followed by the bytes \302\273; the blank and
                # the \302 byte are taken back out of the text token.
                if ( $this->preceeded( " \302" ) ) {
                    $token['text'] = substr( $token['text'], 0, -2 );
                    $this->mPos++;
                    return array( 'type' => 'blank', 'text' => " \302\273" );
                }
                break;

            case '&':
                # NOTE(review): the number of bytes skipped here used to be
                # a fixed 16, which is the length of "&lt;timeline&gt;",
                # so the fully escaped tag is matched. Confirm the literal
                # and the token text against the upstream file.
                $escapedTag = 'lt;timeline&gt;';
                if ( $this->continues( $escapedTag ) ) {
                    $this->mPos += 1 + strlen( $escapedTag );
                    return array( 'type' => '<timeline>', 'text' => '<timeline>' );
                }
                break;
        }

        return null;
    }

    # Returns true if the text directly behind the current character
    # (that is, starting at $mPos + 1) is $cont.
    public function continues( $cont )
    {
        $len = strlen( $cont );
        $start = $this->mPos + 1;

        # All of $cont has to fit in front of the end of the text.
        if ( $start + $len > $this->mTextLength ) {
            return false;
        }

        return substr( $this->mText, $start, $len ) === $cont;
    }

    # Returns true if the text directly in front of the current character
    # (that is, ending at $mPos - 1) is $prec.
    public function preceeded( $prec )
    {
        $len = strlen( $prec );

        if ( $this->mPos < $len ) {
            return false;
        }

        return substr( $this->mText, $this->mPos - $len, $len ) === $prec;
    }

    # Returns everything from the current position up to the next
    # occurrence of $border and moves the position behind the border.
    # Returns '' and leaves the position alone if $border does not occur.
    public function readAllUntil( $border )
    {
        # $mPos may be one past the end of the text once all tokens have
        # been read; strpos() does not accept such an offset.
        if ( $this->mPos > $this->mTextLength ) {
            return '';
        }

        $n = strpos( $this->mText, $border, $this->mPos );
        if ( $n === false ) {
            return '';
        }

        $ret = substr( $this->mText, $this->mPos, $n - $this->mPos );
        # NOTE(review): the "+ 1" skips one more byte behind the border.
        # Kept as it was, since callers may rely on it - confirm.
        $this->mPos = $n + strlen( $border ) + 1;
        return $ret;
    }

}