Drupal HEAD(D7) Code Coverage - includes/unicode.inc

1 <?php
2 // $Id: unicode.inc,v 1.37 2009/01/02 22:09:53 dries Exp $
3
/**
 * Status code: the check for PHP Unicode support ended with an error.
 */
define('UNICODE_ERROR', -1);

/**
 * Status code: Unicode handling is emulated with standard PHP functions.
 */
define('UNICODE_SINGLEBYTE', 0);

/**
 * Status code: Unicode handling is fully supported through the PHP mbstring
 * extension.
 */
define('UNICODE_MULTIBYTE', 1);
19
/**
 * Runs _unicode_check() and stores the resulting status code globally.
 *
 * Only the first element of the result (one of the UNICODE_* constants) is
 * kept, in $GLOBALS['multibyte']; the accompanying message is discarded.
 */
function unicode_check() {
  $result = _unicode_check();
  $GLOBALS['multibyte'] = $result[0];
}
26
/**
 * Checks the Unicode support available in PHP and applies the right settings.
 *
 * Because Drupal needs to be able to handle text in various encodings,
 * mbstring function overloading is not supported. HTTP input/output conversion
 * must be disabled for similar reasons.
 *
 * When mbstring is usable, its internal encoding is set to UTF-8 and its
 * language to 'uni'.
 *
 * @return
 *   An array with two elements: one of the UNICODE_ERROR, UNICODE_SINGLEBYTE
 *   or UNICODE_MULTIBYTE constants, followed by a translated message
 *   describing the problem (an empty string when mbstring is fully usable).
 */
function _unicode_check() {
  // Fetch the translation function through get_t() so that this also works
  // at install time.
  $t = get_t();

  // Use the standard C locale so that string handling is consistent and
  // ASCII-only.
  setlocale(LC_CTYPE, 'C');

  // Every message links to the same documentation page.
  $link = array('@url' => 'http://www.php.net/mbstring');

  // Without the mbstring extension, fall back to emulation.
  if (!function_exists('mb_strlen')) {
    $message = $t('Operations on Unicode strings are emulated on a best-effort basis. Install the <a href="@url">PHP mbstring extension</a> for improved Unicode support.', $link);
    return array(UNICODE_SINGLEBYTE, $message);
  }

  // The extension is present: reject configurations that alter strings
  // behind Drupal's back.
  if (ini_get('mbstring.func_overload') != 0) {
    $message = $t('Multibyte string function overloading in PHP is active and must be disabled. Check the php.ini <em>mbstring.func_overload</em> setting. Please refer to the <a href="@url">PHP mbstring documentation</a> for more information.', $link);
    return array(UNICODE_ERROR, $message);
  }
  if (ini_get('mbstring.encoding_translation') != 0) {
    $message = $t('Multibyte string input conversion in PHP is active and must be disabled. Check the php.ini <em>mbstring.encoding_translation</em> setting. Please refer to the <a href="@url">PHP mbstring documentation</a> for more information.', $link);
    return array(UNICODE_ERROR, $message);
  }
  if (ini_get('mbstring.http_input') != 'pass') {
    $message = $t('Multibyte string input conversion in PHP is active and must be disabled. Check the php.ini <em>mbstring.http_input</em> setting. Please refer to the <a href="@url">PHP mbstring documentation</a> for more information.', $link);
    return array(UNICODE_ERROR, $message);
  }
  if (ini_get('mbstring.http_output') != 'pass') {
    $message = $t('Multibyte string output conversion in PHP is active and must be disabled. Check the php.ini <em>mbstring.http_output</em> setting. Please refer to the <a href="@url">PHP mbstring documentation</a> for more information.', $link);
    return array(UNICODE_ERROR, $message);
  }

  // Everything is fine: configure mbstring for UTF-8.
  mb_internal_encoding('utf-8');
  mb_language('uni');
  return array(UNICODE_MULTIBYTE, '');
}
69
/**
 * Builds the status report entry for the Unicode library.
 *
 * @return
 *   An array with a single 'unicode' key holding the requirement's 'title',
 *   'value', optional 'description' and 'severity', all derived from the
 *   result of _unicode_check().
 */
function unicode_requirements() {
  // Fetch the translation function through get_t() so that this also works
  // at install time.
  $t = get_t();

  // Human-readable name and severity for each possible status code.
  $names = array(
    UNICODE_SINGLEBYTE => $t('Standard PHP'),
    UNICODE_MULTIBYTE => $t('PHP Mbstring Extension'),
    UNICODE_ERROR => $t('Error'),
  );
  $levels = array(
    UNICODE_SINGLEBYTE => REQUIREMENT_WARNING,
    UNICODE_MULTIBYTE => REQUIREMENT_OK,
    UNICODE_ERROR => REQUIREMENT_ERROR,
  );

  $check = _unicode_check();
  $status = $check[0];
  $message = $check[1];

  $entry = array(
    'title' => $t('Unicode library'),
    'value' => $names[$status],
  );
  // Only report a description when the check produced one.
  if ($message) {
    $entry['description'] = $message;
  }
  $entry['severity'] = $levels[$status];

  return array('unicode' => $entry);
}
101
/**
 * Prepare a new XML parser.
 *
 * This is a wrapper around xml_parser_create() which extracts the encoding
 * from the XML data first and sets the output encoding to UTF-8. This function
 * should be used instead of xml_parser_create(), because PHP 4's XML parser
 * doesn't check the input encoding itself. "Starting from PHP 5, the input
 * encoding is automatically detected, so that the encoding parameter specifies
 * only the output encoding."
 *
 * This is also where unsupported encodings will be converted. Callers should
 * take this into account: $data might have been changed after the call (a
 * leading UTF-8 byte order mark is removed, and data in an encoding other than
 * UTF-8, ISO-8859-1 or US-ASCII is converted to UTF-8 with its encoding
 * declaration rewritten).
 *
 * @param &$data
 *   The XML data which will be parsed later.
 * @return
 *   An XML parser object or FALSE on error.
 */
function drupal_xml_parser_create(&$data) {
  // Default XML encoding is UTF-8.
  $encoding = 'utf-8';
  $bom = FALSE;

  // Check for UTF-8 byte order mark (PHP5's XML parser doesn't handle it).
  if (!strncmp($data, "\xEF\xBB\xBF", 3)) {
    $bom = TRUE;
    $data = substr($data, 3);
  }

  // The XML specification allows the encoding value to be wrapped in either
  // double or single quotes, with optional whitespace around the equals sign.
  // Group 1 is the declaration up to and including "encoding", group 2 the
  // quote character and group 3 the encoding name.
  $declaration = '/^(<\?xml[^>]+encoding)\s*=\s*(["\'])(.+?)\2/';

  // Check for an encoding declaration in the XML prolog if no BOM was found.
  if (!$bom && preg_match($declaration, $data, $match)) {
    $encoding = $match[3];
  }

  // Unsupported encodings are converted here into UTF-8.
  $php_supported = array('utf-8', 'iso-8859-1', 'us-ascii');
  if (!in_array(strtolower($encoding), $php_supported)) {
    $out = drupal_convert_to_utf8($data, $encoding);
    if ($out !== FALSE) {
      $encoding = 'utf-8';
      // Keep the declaration in sync with the converted data.
      $data = preg_replace($declaration, '\\1="utf-8"', $out);
    }
    else {
      watchdog('php', 'Could not convert XML encoding %s to UTF-8.', array('%s' => $encoding), WATCHDOG_WARNING);
      return FALSE;
    }
  }

  $xml_parser = xml_parser_create($encoding);
  xml_parser_set_option($xml_parser, XML_OPTION_TARGET_ENCODING, 'utf-8');
  return $xml_parser;
}
154
/**
 * Converts data from a given encoding to UTF-8.
 *
 * The first available of iconv, mbstring and GNU recode is used. Warnings
 * raised by the conversion function are suppressed. If none of them is
 * installed, an error is logged through watchdog().
 *
 * @param $data
 *   The data to be converted.
 * @param $encoding
 *   The encoding that the data is in.
 * @return
 *   The result of the conversion function, or FALSE when no conversion
 *   library is available.
 */
function drupal_convert_to_utf8($data, $encoding) {
  if (function_exists('iconv')) {
    return @iconv($encoding, 'utf-8', $data);
  }

  if (function_exists('mb_convert_encoding')) {
    return @mb_convert_encoding($data, 'utf-8', $encoding);
  }

  if (function_exists('recode_string')) {
    return @recode_string($encoding . '..utf-8', $data);
  }

  // No conversion library at all: log it and report failure.
  watchdog('php', 'Unsupported encoding %s. Please install iconv, GNU recode or mbstring for PHP.', array('%s' => $encoding), WATCHDOG_ERROR);
  return FALSE;
}
184
/**
 * Truncate a UTF-8-encoded string safely to a number of bytes.
 *
 * If the end position is in the middle of a UTF-8 sequence, it scans backwards
 * until the beginning of the byte sequence.
 *
 * Use this function whenever you want to chop off a string at an unsure
 * location. On the other hand, if you're sure that you're splitting on a
 * character boundary (e.g. after using strpos() or similar), you can safely
 * use substr() instead.
 *
 * Malformed input that contains only continuation bytes before the cut
 * position has no character boundary to fall back to; such input is cut at the
 * byte limit so the documented upper limit is still respected.
 *
 * @param $string
 *   The string to truncate.
 * @param $len
 *   An upper limit on the returned string length, in bytes. Zero or a negative
 *   value yields an empty string, unless $string itself is empty.
 * @return
 *   The truncated string.
 */
function drupal_truncate_bytes($string, $len) {
  if (strlen($string) <= $len) {
    return $string;
  }
  // Nothing fits in a non-positive number of bytes.
  if ($len <= 0) {
    return '';
  }

  // Walk backwards from the cut position while it points at a UTF-8
  // continuation byte (10xxxxxx); the first other byte starts a character.
  $boundary = $len;
  while ($boundary >= 0 && (ord($string[$boundary]) & 0xC0) == 0x80) {
    $boundary--;
  }

  // No character start was found at all (malformed data). A negative length
  // passed to substr() would return nearly the whole string, so cut at the
  // byte limit instead.
  if ($boundary < 0) {
    return substr($string, 0, $len);
  }
  return substr($string, 0, $boundary);
}
213
/**
 * Truncate a UTF-8-encoded string safely to a number of characters.
 *
 * @param $string
 *   The string to truncate.
 * @param $len
 *   An upper limit on the returned string length, in characters. Negative
 *   values are treated as zero.
 * @param $wordsafe
 *   Flag to truncate at last space within the upper limit. Defaults to FALSE.
 * @param $dots
 *   Flag to add trailing dots. Defaults to FALSE. The ' ...' suffix counts
 *   towards $len; when $len is smaller than the suffix, the suffix itself is
 *   shortened so that the limit is still respected.
 * @return
 *   The truncated string.
 */
function truncate_utf8($string, $len, $wordsafe = FALSE, $dots = FALSE) {

  if (drupal_strlen($string) <= $len) {
    return $string;
  }

  // A negative limit would make drupal_substr() cut characters off the end
  // instead of limiting the length, so clamp it.
  $len = max(0, $len);

  $suffix = '';
  if ($dots) {
    // Reserve room for the suffix. For limits below four characters the suffix
    // is shortened rather than letting the remaining length go negative.
    $suffix = substr(' ...', 0, $len);
    $len -= strlen($suffix);
  }

  if ($wordsafe) {
    // Keep one extra character so that a space right after the limit still
    // counts as a word boundary.
    $string = drupal_substr($string, 0, $len + 1);
    // A space at position 0 is deliberately ignored (strrpos() returns 0).
    if ($last_space = strrpos($string, ' ')) {
      // Cutting at a space is a safe byte-level operation.
      $string = substr($string, 0, $last_space);
    }
    else {
      $string = drupal_substr($string, 0, $len);
    }
  }
  else {
    $string = drupal_substr($string, 0, $len);
  }

  return $string . $suffix;
}
257
/**
 * Encodes MIME/HTTP header values that contain non-ASCII, UTF-8 encoded
 * characters.
 *
 * For example, mime_header_encode('tést.txt') returns "=?UTF-8?B?dMOpc3QudHh0?=".
 *
 * See http://www.rfc-editor.org/rfc/rfc2047.txt for more information.
 *
 * Notes:
 * - Only encode strings that contain non-ASCII characters.
 * - We progressively cut-off a chunk with drupal_truncate_bytes(). This is to
 *   ensure each chunk starts and ends on a character boundary.
 * - Using \n as the chunk separator may cause problems on some systems and may
 *   have to be changed to \r\n or \r.
 *
 * @param $string
 *   The header value to encode.
 * @return
 *   The encoded value, or $string unchanged when it only contains printable
 *   ASCII characters.
 */
function mime_header_encode($string) {
  if (preg_match('/[^\x20-\x7E]/', $string)) {
    $chunk_size = 47; // floor((75 - strlen("=?UTF-8?B??=")) * 0.75);
    $len = strlen($string);
    $output = '';
    while ($len > 0) {
      $chunk = drupal_truncate_bytes($string, $chunk_size);
      $c = strlen($chunk);
      if ($c == 0) {
        // Malformed UTF-8 (a lead byte followed by too many continuation
        // bytes) leaves no character boundary inside the chunk. Fall back to
        // a plain byte cut so that the loop always makes progress.
        $chunk = substr($string, 0, $chunk_size);
        $c = strlen($chunk);
      }
      $output .= ' =?UTF-8?B?' . base64_encode($chunk) . "?=\n";
      $string = substr($string, $c);
      $len -= $c;
    }
    return trim($output);
  }
  return $string;
}
289
/**
 * Complement to mime_header_encode().
 *
 * Decodes RFC 2047 encoded words ("=?charset?encoding?data?=") in a header
 * value. The encoding marker (Q or B) is matched case-insensitively, as the
 * RFC requires.
 *
 * @param $header
 *   The header value to decode.
 * @return
 *   The header with all recognized encoded words replaced by UTF-8 text.
 */
function mime_header_decode($header) {
  // First step: encoded chunks followed by other encoded chunks (need to collapse whitespace)
  $header = preg_replace_callback('/=\?([^?]+)\?(Q|B)\?([^?]+|\?(?!=))\?=\s+(?==\?)/i', '_mime_header_decode', $header);
  // Second step: remaining chunks (do not collapse whitespace)
  return preg_replace_callback('/=\?([^?]+)\?(Q|B)\?([^?]+|\?(?!=))\?=/i', '_mime_header_decode', $header);
}

/**
 * Helper function to mime_header_decode().
 *
 * @param $matches
 *   Regular expression match with these groups:
 *   - 1: Character set name.
 *   - 2: Escaping method (Q or B, in either case).
 *   - 3: Encoded data.
 * @return
 *   The decoded data as UTF-8, or the untouched encoded word when its
 *   character set could not be converted.
 */
function _mime_header_decode($matches) {
  if (strtoupper($matches[2]) == 'B') {
    $data = base64_decode($matches[3]);
  }
  else {
    // In Q encoding "_" stands for a space, while a literal underscore is
    // written as "=5F". The underscores therefore have to be translated
    // before the quoted-printable escapes are resolved.
    $data = quoted_printable_decode(str_replace('_', ' ', $matches[3]));
  }
  if (strtolower($matches[1]) != 'utf-8') {
    $converted = drupal_convert_to_utf8($data, $matches[1]);
    if ($converted === FALSE) {
      // Leave the encoded word in place rather than silently dropping it.
      return $matches[0];
    }
    $data = $converted;
  }
  return $data;
}
314
/**
 * Decode all HTML entities (including numerical ones) to regular UTF-8 bytes.
 * Double-escaped entities will only be decoded once ("&amp;lt;" becomes "&lt;", not "<").
 *
 * @param $text
 *   The text to decode entities in.
 * @param $exclude
 *   An array of characters which should not be decoded. For example,
 *   array('<', '&', '"'). This affects both named and numerical entities.
 * @return
 *   The text with entities decoded.
 */
function decode_entities($text, $exclude = array()) {
  static $html_entities;
  if (!isset($html_entities)) {
    include DRUPAL_ROOT . '/includes/unicode.entities.inc';
  }

  // Flip the exclude list so that we can do quick lookups later.
  $exclude = array_flip($exclude);

  // Collect every entity in the text. The PREG_REPLACE_EVAL modifier 'e' that
  // was used here before is no longer available in current PHP versions, so
  // the decoding is done in two explicit steps instead.
  if (!preg_match_all('/&(#x?)?([A-Za-z0-9]+);/', $text, $matches, PREG_SET_ORDER)) {
    return (string) $text;
  }

  // Decode each distinct entity once.
  $replacements = array();
  foreach ($matches as $match) {
    if (!isset($replacements[$match[0]])) {
      $replacements[$match[0]] = _decode_entities($match[1], $match[2], $match[0], $html_entities, $exclude);
    }
  }

  // strtr() replaces all entities in a single pass and never rescans its own
  // output, so double-escaped entities are only decoded once.
  return strtr($text, $replacements);
}
340
/**
 * Helper function for decode_entities().
 *
 * @param $prefix
 *   '' for a named entity, '#' for a decimal and '#x' for a hexadecimal
 *   numerical entity.
 * @param $codepoint
 *   The entity name, or the digits of a numerical entity.
 * @param $original
 *   The complete entity as found in the text, e.g. '&amp;'.
 * @param &$html_entities
 *   Map of complete named entities to their UTF-8 replacement.
 * @param &$exclude
 *   Array whose keys are the characters that must not be decoded.
 * @return
 *   The decoded character, or $original when the entity is unknown, excluded,
 *   malformed, or does not denote a valid Unicode code point.
 */
function _decode_entities($prefix, $codepoint, $original, &$html_entities, &$exclude) {
  // Named entity
  if (!$prefix) {
    // A named entity not in the exclude list.
    if (isset($html_entities[$original]) && !isset($exclude[$html_entities[$original]])) {
      return $html_entities[$original];
    }
    return $original;
  }

  // Hexadecimal numerical entity
  if ($prefix == '#x') {
    if (!preg_match('/^[0-9A-Fa-f]+\z/', $codepoint)) {
      // Not a hexadecimal number, e.g. "&#xyz;".
      return $original;
    }
    $codepoint = hexdec($codepoint);
  }
  // Decimal numerical entity
  else {
    if (!preg_match('/^[0-9]+\z/', $codepoint)) {
      // Not a decimal number, e.g. "&#abc;".
      return $original;
    }
    // Casting (rather than evaluating) the digits ignores leading zeros
    // without any risk of octal interpretation.
    $codepoint = (float) $codepoint;
  }

  // Values beyond U+10FFFF and the UTF-16 surrogate range cannot be encoded
  // as valid UTF-8.
  if ($codepoint > 0x10FFFF || ($codepoint >= 0xD800 && $codepoint <= 0xDFFF)) {
    return $original;
  }
  $codepoint = (int) $codepoint;

  // Encode codepoint as UTF-8 bytes
  if ($codepoint < 0x80) {
    $str = chr($codepoint);
  }
  elseif ($codepoint < 0x800) {
    $str = chr(0xC0 | ($codepoint >> 6))
         . chr(0x80 | ($codepoint & 0x3F));
  }
  elseif ($codepoint < 0x10000) {
    $str = chr(0xE0 | ( $codepoint >> 12))
         . chr(0x80 | (($codepoint >> 6) & 0x3F))
         . chr(0x80 | ( $codepoint       & 0x3F));
  }
  else {
    $str = chr(0xF0 | ( $codepoint >> 18))
         . chr(0x80 | (($codepoint >> 12) & 0x3F))
         . chr(0x80 | (($codepoint >> 6)  & 0x3F))
         . chr(0x80 | ( $codepoint        & 0x3F));
  }

  // Check for excluded characters
  if (isset($exclude[$str])) {
    return $original;
  }
  return $str;
}
390
/**
 * Counts the characters in a UTF-8 string.
 *
 * The result is less than or equal to the number of bytes in the string.
 *
 * @param $text
 *   The string to measure.
 * @return
 *   The number of characters in $text.
 */
function drupal_strlen($text) {
  global $multibyte;
  if ($multibyte != UNICODE_MULTIBYTE) {
    // Emulation: drop every UTF-8 continuation byte and count what is left.
    $without_continuation = preg_replace("/[\x80-\xBF]/", '', $text);
    return strlen($without_continuation);
  }
  return mb_strlen($text);
}
405
/**
 * Converts a UTF-8 string to uppercase.
 *
 * Without mbstring, only ASCII letters and the lowercase Latin-1 accented
 * letters matched by the pattern below are converted.
 *
 * @param $text
 *   The string to convert.
 * @return
 *   The uppercased string.
 */
function drupal_strtoupper($text) {
  global $multibyte;
  if ($multibyte != UNICODE_MULTIBYTE) {
    // Uppercase the ASCII letters first (C locale).
    $ascii_upper = strtoupper($text);
    // Then flip the case of the lowercase Latin-1 accented letters.
    return preg_replace_callback('/\xC3[\xA0-\xB6\xB8-\xBE]/', '_unicode_caseflip', $ascii_upper);
  }
  return mb_strtoupper($text);
}
422
/**
 * Converts a UTF-8 string to lowercase.
 *
 * Without mbstring, only ASCII letters and the uppercase Latin-1 accented
 * letters matched by the pattern below are converted.
 *
 * @param $text
 *   The string to convert.
 * @return
 *   The lowercased string.
 */
function drupal_strtolower($text) {
  global $multibyte;
  if ($multibyte != UNICODE_MULTIBYTE) {
    // Lowercase the ASCII letters first (C locale).
    $ascii_lower = strtolower($text);
    // Then flip the case of the uppercase Latin-1 accented letters.
    return preg_replace_callback('/\xC3[\x80-\x96\x98-\x9E]/', '_unicode_caseflip', $ascii_lower);
  }
  return mb_strtolower($text);
}
439
/**
 * Helper function for case conversion of Latin-1.
 *
 * Toggles bit 0x20 of the second byte of the matched two-byte sequence, which
 * is used for flipping U+C0-U+DE to U+E0-U+FD and back.
 *
 * @param $matches
 *   Regular expression match; element 0 is the two-byte sequence.
 * @return
 *   The sequence with its case flipped.
 */
function _unicode_caseflip($matches) {
  $sequence = $matches[0];
  $flipped = chr(ord($sequence[1]) ^ 32);
  return $sequence[0] . $flipped;
}
447
/**
 * Capitalizes the first character of a UTF-8 string.
 *
 * @param $text
 *   The string to convert.
 * @return
 *   The string with its first character uppercased.
 */
function drupal_ucfirst($text) {
  // Note: no mbstring equivalent!
  $first = drupal_substr($text, 0, 1);
  $rest = drupal_substr($text, 1);
  return drupal_strtoupper($first) . $rest;
}
455
/**
 * Cut off a piece of a string based on character indices and counts. Follows
 * the same behavior as PHP's own substr() function.
 *
 * Note that for cutting off a string at a known character/substring
 * location, the usage of PHP's normal strpos/substr is safe and
 * much faster.
 *
 * @param $text
 *   The UTF-8 string to cut a piece from.
 * @param $start
 *   Index of the first character to return. A negative value counts from the
 *   end of the string.
 * @param $length
 *   Number of characters to return. A negative value leaves that many
 *   characters off the end; NULL returns everything up to the end.
 * @return
 *   The requested piece of $text. In emulated mode this is an empty string
 *   when the requested range contains no characters.
 */
function drupal_substr($text, $start, $length = NULL) {
  global $multibyte;
  if ($multibyte == UNICODE_MULTIBYTE) {
    return $length === NULL ? mb_substr($text, $start) : mb_substr($text, $start, $length);
  }

  // Emulation: translate character positions into byte offsets. A byte starts
  // a character unless it is a UTF-8 continuation byte (0x80-0xBF).
  $strlen = strlen($text);

  // Find the starting byte offset.
  $istart = 0;
  if ($start > 0) {
    // Walk forward to the byte that starts character number $start. If the
    // string has fewer characters, the piece starts at the end of the string
    // and is therefore empty.
    $istart = $strlen;
    $chars = -1;
    for ($i = 0; $i < $strlen; $i++) {
      $c = ord($text[$i]);
      if (($c < 0x80 || $c >= 0xC0) && ++$chars == $start) {
        $istart = $i;
        break;
      }
    }
  }
  elseif ($start < 0) {
    // Walk backwards until abs($start) characters have been passed or the
    // beginning of the string is reached.
    $istart = $strlen;
    $chars = 0;
    while ($istart > 0 && $chars < -$start) {
      $istart--;
      $c = ord($text[$istart]);
      if ($c < 0x80 || $c >= 0xC0) {
        $chars++;
      }
    }
  }

  // Find the ending byte offset (exclusive).
  $iend = $strlen;
  if ($length !== NULL) {
    if ($length > 0) {
      // Walk forward from the start; the piece ends right before the byte
      // that starts the first character beyond the requested $length. If
      // there is no such character the piece runs to the end of the string.
      $chars = -1;
      for ($i = $istart; $i < $strlen; $i++) {
        $c = ord($text[$i]);
        if (($c < 0x80 || $c >= 0xC0) && ++$chars == $length) {
          $iend = $i;
          break;
        }
      }
    }
    elseif ($length < 0) {
      // Walk backwards from the end past abs($length) characters; the piece
      // ends right before the last character passed.
      $chars = 0;
      while ($iend > 0 && $chars < -$length) {
        $iend--;
        $c = ord($text[$iend]);
        if ($c < 0x80 || $c >= 0xC0) {
          $chars++;
        }
      }
    }
    else {
      // $length == 0, return an empty string.
      $iend = $istart;
    }
  }

  return $iend > $istart ? substr($text, $istart, $iend - $istart) : '';
}
546
547
548

Legend

Missed
Lines of code that were not exercised during program execution.
Covered
Lines of code that were exercised during program execution.
Comment/non executable
Comment or non-executable line of code.
Dead
Lines of code that, according to xdebug, could not be executed. This is counted as covered code because in almost all cases it is code that is runnable.