MediaWiki: Tokenizer.php Source File

<?php
/**
 * Scans a string byte by byte and splits it into plain-text runs and
 * markup tokens.
 *
 * Every token is an array with a 'type' and a 'text' key. Plain text has the
 * type 'text'; markup tokens use one of 'RFC ', 'ISBN ', '[[[', '[[', ']]',
 * a run of two or more apostrophes, '----', 'h', 'blank' or '<timeline>'.
 * Apostrophe tokens additionally carry a 'pos' key.
 *
 * Whenever the scanner meets markup it queues the markup token and first
 * returns the text collected so far, so a markup token is always preceded by
 * a (possibly empty) 'text' token.
 */
class Tokenizer {
	/* private */ var $mText = '',        # Text to be processed by the tokenizer
		$mPos = 0,                        # Current byte position of the tokenizer in the text
		$mTextLength = 0,                 # Length of $mText in bytes
		$mQueuedToken = array(),          # Tokens that were already found, but not
		                                  # returned yet.
		$linkPrefixExtension = false;     # Result of $wgLang->linkPrefixExtension()

	/**
	 * Sets up an empty tokenizer; use newFromString() to tokenize a text.
	 */
	/* private */ function __construct()
	{
		global $wgLang;

		$this->mPos = 0;
		// The queue must be an array before count()/array_unshift() touch it.
		// (The original code initialised a misspelled "mTokenQueue" instead.)
		$this->mQueuedToken = array();
		$this->linkPrefixExtension = $wgLang->linkPrefixExtension();
	}

	/**
	 * Legacy (PHP 4 style) constructor name, kept so that existing explicit
	 * calls such as parent::Tokenizer() continue to work.
	 */
	/* private */ function Tokenizer()
	{
		$this->__construct();
	}

	/**
	 * Factory function: creates a tokenizer for the given text.
	 *
	 * Declared static because it is a factory that never touches $this.
	 *
	 * @param string $s Text to tokenize
	 * @return Tokenizer
	 */
	static function newFromString( $s )
	{
		$fname = 'Tokenizer::newFromString';
		wfProfileIn( $fname );

		$t = new Tokenizer();
		$t->mText = $s;
		$t->mTextLength = strlen( $s );

		wfProfileOut( $fname );
		return $t;
	}

	/**
	 * Returns the next token without consuming it. The next call to
	 * previewToken() or nextToken() returns the same token again.
	 *
	 * Internally the text pointer is advanced, but the token is put at the
	 * front of the queue, where the following call finds it.
	 *
	 * @return array|false The token, or false when no text is left
	 */
	function previewToken()
	{
		$fname = 'Tokenizer::previewToken';
		wfProfileIn( $fname );

		if ( count( $this->mQueuedToken ) != 0 ) {
			// Still a token left from an earlier round. Return that one first.
			$token = $this->mQueuedToken[0];
		} else {
			// nextToken() may itself queue a markup token; the token it
			// returns precedes that one, hence unshift and not append.
			$token = $this->nextToken();
			array_unshift( $this->mQueuedToken, $token );
		}

		wfProfileOut( $fname );
		return $token;
	}

	/**
	 * Returns the next token and consumes it.
	 *
	 * Proceeds byte by byte through the text, looking for characters needing
	 * special attention. Those are currently: R, I, [, ], ', newline and
	 * carriage return, !, ?, :, the digits, the UTF-8 bytes of the guillemets
	 * and &.
	 *
	 * @return array|false The token, or false when no text is left
	 */
	function nextToken()
	{
		$fname = 'Tokenizer::nextToken';
		wfProfileIn( $fname );

		if ( count( $this->mQueuedToken ) != 0 ) {
			// Still a token left from an earlier round. Return that one first.
			$token = array_shift( $this->mQueuedToken );
		} else if ( $this->mPos > $this->mTextLength ) {
			// If no text is left, return "false".
			$token = false;
		} else {

			$token = array();
			$token['text'] = '';
			$token['type'] = 'text';
			$queueToken = array();

			// The loop deliberately runs once with mPos == mTextLength: that
			// pass appends nothing and pushes mPos past the end, so that the
			// following call returns false.
			while ( $this->mPos <= $this->mTextLength ) {
				// Explicit bounds check instead of reading past the end of
				// the string and suppressing the resulting warning.
				$ch = ( $this->mPos < $this->mTextLength ) ? $this->mText[$this->mPos] : '';
				switch ( $ch ) {
					case 'R': // for "RFC "
						if ( $this->continues( 'FC ' ) ) {
							$queueToken['type'] = $queueToken['text'] = 'RFC ';
							$this->mQueuedToken[] = $queueToken;
							// Leaves mPos on the blank that follows "RFC".
							$this->mPos += 3;
							break 2; // switch + while
						}
						break;
					case 'I': // for "ISBN "
						if ( $this->continues( 'SBN ' ) ) {
							$queueToken['type'] = $queueToken['text'] = 'ISBN ';
							$this->mQueuedToken[] = $queueToken;
							// Leaves mPos on the blank that follows "ISBN".
							$this->mPos += 4;
							break 2; // switch + while
						}
						break;
					case '[': // for links "[["
						if ( $this->continues( '[[' ) ) {
							$queueToken['type'] = '[[[';
							$queueToken['text'] = '';
							$this->mQueuedToken[] = $queueToken;
							$this->mPos += 3;
							break 2; // switch + while
						} else if ( $this->continues( '[' ) ) {
							$queueToken['type'] = '[[';
							$queueToken['text'] = '';
							// Check for a "prefixed link", e.g. Al[[Khazar]]
							// Mostly for arabic wikipedia
							if ( $this->linkPrefixExtension ) {
								// Move the trailing non-whitespace bytes of the
								// text token, one at a time, into the link token.
								while ( ( $len = strlen( $token['text'] ) ) > 0
									&& !ctype_space( $token['text'][$len-1] ) )
								{
									// prepend the character to the link's open tag
									$queueToken['text'] = $token['text'][$len-1] . $queueToken['text'];
									// remove character from the end of the text token
									$token['text'] = substr( $token['text'], 0, -1 );
								}
							}
							$this->mQueuedToken[] = $queueToken;
							$this->mPos += 2;
							break 2; // switch + while
						}
						break;
					case ']': // for end of links "]]"
						if ( $this->continues( ']' ) ) {
							$queueToken['type'] = ']]';
							$queueToken['text'] = '';
							$this->mQueuedToken[] = $queueToken;
							$this->mPos += 2;
							break 2; // switch + while
						}
						break;
					case "'": // for all kind of em's and strong's
						if ( $this->continues( "'" ) ) {
							// The type becomes the complete run of apostrophes.
							$queueToken['type'] = "'";
							$queueToken['text'] = '';
							while ( ( $this->mPos+1 < $this->mTextLength )
								&& $this->mText[$this->mPos+1] == "'" )
							{
								$queueToken['type'] .= "'";
								$queueToken['pos'] = $this->mPos;
								$this->mPos ++;
							}

							$this->mQueuedToken[] = $queueToken;
							$this->mPos ++;
							break 2; // switch + while
						}
						break;
					case "\n": // for block levels, actually, only "----" is handled.
					case "\r": // headings are detected to close any unbalanced em or strong tags in a section
						if ( $this->continues( '----' ) )
						{
							$queueToken['type'] = '----';
							$queueToken['text'] = '';
							$this->mQueuedToken[] = $queueToken;
							$this->mPos += 5;
							// Swallow any further dashes of the rule.
							while ( $this->mPos < $this->mTextLength
								and $this->mText[$this->mPos] == '-' )
							{
								$this->mPos ++;
							}
							break 2; // switch + while
						} else if (
							$this->continues( '<h' ) and (
								$this->continues( '<h1' ) or
								$this->continues( '<h2' ) or
								$this->continues( '<h3' ) or
								$this->continues( '<h4' ) or
								$this->continues( '<h5' ) or
								$this->continues( '<h6' )
							)
						) { // heading
							$queueToken['type'] = 'h';
							$queueToken['text'] = '';
							$this->mQueuedToken[] = $queueToken;
							// Only the line break is consumed; the tag itself
							// is read as ordinary text afterwards.
							$this->mPos ++;
							break 2; // switch + while
						}
						break;
					case '!': // French spacing rules have a space before exclamation
					case '?': // and question marks. Those have to become &nbsp;
					case ':': // And colons, Hashar says ...
						if ( $this->preceeded( ' ' ) )
						{
							// strip blank from Token
							$token['text'] = substr( $token['text'], 0, -1 );
							$queueToken['type'] = 'blank';
							// NOTE(review): the listing this file was recovered
							// from showed a plain blank here; a non-breaking one
							// is what the comments above call for.
							$queueToken['text'] = "&nbsp;{$ch}";
							$this->mQueuedToken[] = $queueToken;
							$this->mPos ++;
							break 2; // switch + while
						}
						break;
					case '0': // A space between two numbers is used to ease reading
					case '1': // of big numbers, e.g. 1 000 000. Those spaces need
					case '2': // to be unbreakable
					case '3':
					case '4':
					case '5':
					case '6':
					case '7':
					case '8':
					case '9':
						// mPos + 2 must be a valid offset before it is read.
						if ( ( $this->mPos + 2 < $this->mTextLength )
							&& ( $this->mText[$this->mPos+1] == " " )
							&& ctype_digit( $this->mText[$this->mPos+2] ) )
						{
							$queueToken['type'] = 'blank';
							$queueToken['text'] = $ch . '&nbsp;';
							$this->mQueuedToken[] = $queueToken;
							$this->mPos += 2;
							break 2; // switch + while
						}
						break;
					case "\302": // first byte of UTF-8 Character Guillemet-left
						if ( $this->continues( "\253 " ) ) // second byte and a blank
						{
							$queueToken['type'] = 'blank';
							$queueToken['text'] = "\302\253&nbsp;";
							$this->mQueuedToken[] = $queueToken;
							$this->mPos += 3;
							break 2; // switch + while
						}
						break;
					case "\273": // last byte of UTF-8 Character Guillemet-right
						if ( $this->preceeded( " \302" ) )
						{
							$queueToken['type'] = 'blank';
							$queueToken['text'] = "&nbsp;\302\273";
							// Drop the blank and the first guillemet byte that
							// were already appended to the text token.
							$token['text'] = substr( $token['text'], 0, -2 );
							$this->mQueuedToken[] = $queueToken;
							$this->mPos ++;
							break 2; // switch + while
						}
						break;
					case '&': // extensions like <timeline>, since HTML stripping has already been done,
						// those look like &lt;timeline&gt;
						// Together with the '&' the match is 16 bytes long,
						// which is exactly the distance mPos is advanced by.
						if ( $this->continues( "lt;timeline&gt;" ) )
						{
							$queueToken['type'] = "<timeline>";
							$queueToken['text'] = "<timeline>";
							$this->mQueuedToken[] = $queueToken;
							$this->mPos += 16;
							break 2; // switch + while
						}
						break;

				} /* switch */
				// Nothing special: the byte belongs to the current text token.
				$token['text'] .= $ch;
				$this->mPos ++;
			} /* while */
		} /* if (nothing left in queue) */

		wfProfileOut( $fname );
		return $token;
	}

	/**
	 * Checks whether mText continues with $cont from mPos+1, i.e. directly
	 * after the current byte.
	 *
	 * @param string $cont Expected continuation (non-empty)
	 * @return bool
	 */
	/* private */ function continues( $cont )
	{
		$len = strlen( $cont );
		// The last byte compared sits at offset mPos + $len, so that offset
		// has to lie inside the text.
		if ( $this->mPos + $len >= $this->mTextLength )
			return false;
		return substr( $this->mText, $this->mPos + 1, $len ) === $cont;
	}

	/**
	 * Checks whether the bytes of mText directly before mPos are $prec.
	 *
	 * @param string $prec Expected preceding bytes
	 * @return bool
	 */
	/* private */ function preceeded( $prec )
	{
		$len = strlen( $prec );
		// if $prec is longer than the text up to mPos, return false
		if ( $this->mPos < $len )
			return false;
		return ( 0 == strcmp( $prec, substr( $this->mText, $this->mPos-$len, $len ) ) );
	}

	/**
	 * Returns the text between mPos and the next occurrence of $border and
	 * moves mPos to the first byte after that occurrence.
	 *
	 * @param string $border Terminating string
	 * @return string The text before $border, or '' (with mPos left
	 *                untouched) when $border does not occur
	 */
	function readAllUntil( $border )
	{
		$n = strpos( $this->mText, $border, $this->mPos );
		if ( $n === false )
			return '';
		$ret = substr( $this->mText, $this->mPos, $n - $this->mPos );
		// Continue right behind the border; adding one more here would
		// silently drop the byte that follows it.
		$this->mPos = $n + strlen( $border );
		return $ret;
	}

}