root/trunk/system/core/utf8.php

Revision 2712, 19.2 kB (checked in by Geert, 1 month ago)

--

  • Property svn:eol-style set to LF
  • Property copyright set to Copyright (c) 2007 Kohana Team
  • Property svn:keywords set to Id
Line 
1 <?php defined('SYSPATH') or die('No direct script access.');
2 /**
3  * A port of phputf8 to a unified file/class. Checks PHP status to ensure that
4  * UTF-8 support is available and normalize global variables to UTF-8. It also
5  * provides multi-byte aware replacement string functions.
6  *
7  * This file is licensed differently from the rest of Kohana. As a port of
8  * phputf8, which is LGPL software, this file is released under the LGPL.
9  *
10  * PCRE needs to be compiled with UTF-8 support (--enable-utf8).
11  * Support for Unicode properties is highly recommended (--enable-unicode-properties).
12  * @see http://php.net/manual/reference.pcre.pattern.modifiers.php
13  *
14  * UTF-8 conversion will be much more reliable if the iconv extension is loaded.
15  * @see http://php.net/iconv
16  *
17  * The mbstring extension is highly recommended, but must not be overloading
18  * string functions.
19  * @see http://php.net/mbstring
20  *
21  * $Id$
22  *
23  * @package    Core
24  * @author     Kohana Team
25  * @copyright  (c) 2007 Kohana Team
26  * @copyright  (c) 2005 Harry Fuecks
27  * @license    http://www.gnu.org/licenses/old-licenses/lgpl-2.1.txt
28  */
29
// PCRE must understand the /u modifier: a single multi-byte character has to
// match the one-character pattern below.
if ( ! preg_match('/^.$/u', 'ñ'))
{
    trigger_error
    (
        '<a href="http://php.net/pcre">PCRE</a> has not been compiled with UTF-8 support. '.
        'See <a href="http://php.net/manual/reference.pcre.pattern.modifiers.php">PCRE Pattern Modifiers</a> '.
        'for more information. This application cannot be run without UTF-8 support.',
        E_USER_ERROR
    );
}

// iconv is required; utf8::clean() uses it to convert input to UTF-8.
if ( ! extension_loaded('iconv'))
{
    trigger_error
    (
        'The <a href="http://php.net/iconv">iconv</a> extension is not loaded. '.
        'Without iconv, strings cannot be properly translated to UTF-8 from user input. '.
        'This application cannot be run without UTF-8 support.',
        E_USER_ERROR
    );
}

// mbstring may be loaded, but it must not overload the native string functions.
// ini_get() returns a string, so it is cast to an integer before the bit test.
if (extension_loaded('mbstring') AND ((int) ini_get('mbstring.func_overload') & MB_OVERLOAD_STRING))
{
    trigger_error
    (
        'The <a href="http://php.net/mbstring">mbstring</a> extension is overloading PHP\'s native string functions. '.
        // Trailing space added: the two sentences used to run together as "file.This"
        'Disable this by setting mbstring.func_overload to 0, 1, 4 or 5 in php.ini or a .htaccess file. '.
        'This application cannot be run without UTF-8 support.',
        E_USER_ERROR
    );
}
62
// Probe PCRE for Unicode property escapes (\pL and friends). Error reporting
// is switched off for the probe and restored right after it.
$ER = error_reporting(0);
define('PCRE_UNICODE_PROPERTIES', preg_match('/^\pL$/u', 'ñ') ? TRUE : FALSE);
error_reporting($ER);

// SERVER_UTF8 is TRUE exactly when the mbstring extension is loaded; in that
// case mbstring's internal encoding is switched to UTF-8 as well.
// NOTE(review): presumably consulted elsewhere to choose between mb_* and the
// non-native replacements -- confirm against the included helper files.
if ( ! extension_loaded('mbstring'))
{
    define('SERVER_UTF8', FALSE);
}
else
{
    mb_internal_encoding('UTF-8');
    define('SERVER_UTF8', TRUE);
}
78
// Replace each request superglobal with the result of utf8::clean().
// The four assignments are independent of one another.
$_SERVER = utf8::clean($_SERVER);
$_COOKIE = utf8::clean($_COOKIE);
$_POST   = utf8::clean($_POST);
$_GET    = utf8::clean($_GET);

if ('cli' == PHP_SAPI)
{
    // Command line run: give the argument vector its own explicit pass
    $_SERVER['argv'] = utf8::clean($_SERVER['argv']);
}
90
91 final class utf8 {
92
93     // Called methods
94     static $called = array();
95
/**
 * Recursively cleans arrays, objects, and strings. Removes ASCII control
 * codes and converts to UTF-8 while silently discarding incompatible
 * UTF-8 characters. Array keys are cleaned as well as values; values of
 * any other type are returned untouched.
 *
 * @param   mixed  string, array or object to clean
 * @return  mixed  the cleaned value, of the same kind as the input
 */
public static function clean($str)
{
    if (is_array($str))
    {
        // Collect into a fresh array so that an entry whose key had to be
        // cleaned does not also survive under its original, dirty key.
        $cleaned = array();

        foreach ($str as $key => $val)
        {
            // Recursion!
            $cleaned[self::clean($key)] = self::clean($val);
        }

        $str = $cleaned;
    }
    elseif (is_object($str))
    {
        // Array-style writes are only legal on ArrayAccess implementations.
        // On any other object they are a fatal error, so the cleaned value
        // is written back to the property instead.
        $array_like = ($str instanceof ArrayAccess);

        foreach ($str as $key => $val)
        {
            if ($array_like)
            {
                $str[self::clean($key)] = self::clean($val);
            }
            else
            {
                $str->$key = self::clean($val);
            }
        }
    }
    elseif (is_string($str) AND $str !== '')
    {
        // Remove control characters
        $str = self::strip_ascii_ctrl($str);

        if ( ! self::is_ascii($str))
        {
            // Mask E_NOTICE on top of the current level. Passing ~E_NOTICE
            // on its own would switch every other level on for this call.
            $ER = error_reporting(error_reporting() & ~E_NOTICE);

            // iconv is expensive, so it is only used when needed
            // NOTE(review): iconv() returns FALSE on failure, in which case
            // FALSE is returned to the caller -- confirm whether a fallback
            // is wanted here.
            $str = iconv('UTF-8', 'UTF-8//IGNORE', $str);

            // Restore the previous error reporting level
            error_reporting($ER);
        }
    }

    return $str;
}
134
/**
 * Reports whether every byte of a string lies in the 7bit ASCII range.
 * This is used to determine when to use native functions or UTF-8
 * functions.
 *
 * @param   string  string to check
 * @return  bool    TRUE when no byte outside \x00-\x7F is present
 */
public static function is_ascii($str)
{
    // Non-zero as soon as a single byte above \x7F is found
    $high_bytes = preg_match('/[^\x00-\x7F]/S', $str);

    return ! $high_bytes;
}
146
/**
 * Removes device control codes in the ASCII range from a string. Tab
 * (\x09), line feed (\x0A) and carriage return (\x0D) are not part of
 * the pattern and are therefore kept.
 *
 * @param   string  string to clean
 * @return  string
 */
public static function strip_ascii_ctrl($str)
{
    $control_codes = '/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]+/S';

    return preg_replace($control_codes, '', $str);
}
157
/**
 * Removes every byte that is not 7bit ASCII from a string.
 *
 * @param   string  string to clean
 * @return  string
 */
public static function strip_non_ascii($str)
{
    $non_ascii = '/[^\x00-\x7F]+/S';

    return preg_replace($non_ascii, '', $str);
}
168
/**
 * Substitutes ASCII-7 'equivalents' for special/accented UTF-8 characters.
 *
 * @author  Andreas Gohr <andi@splitbrain.org>
 *
 * @param   string   string to transliterate
 * @param   integer  -1 lowercase only, +1 uppercase only, 0 both cases
 * @return  string
 */
public static function transliterate_to_ascii($str, $case = 0)
{
    // Lazy-load: the file is required only on the first call to this method
    if (isset(self::$called[__FUNCTION__]) === FALSE)
    {
        require SYSPATH.'core/utf8/'.__FUNCTION__.EXT;

        // Remember the load so later calls skip the require
        self::$called[__FUNCTION__] = TRUE;
    }

    // Hand off to _transliterate_to_ascii()
    $ascii = _transliterate_to_ascii($str, $case);

    return $ascii;
}
190
/**
 * Measures the length of the given string.
 * @see http://php.net/strlen
 *
 * @param   string   string being measured for length
 * @return  integer
 */
public static function strlen($str)
{
    // Lazy-load: the file is required only on the first call to this method
    if (isset(self::$called[__FUNCTION__]) === FALSE)
    {
        require SYSPATH.'core/utf8/'.__FUNCTION__.EXT;

        // Remember the load so later calls skip the require
        self::$called[__FUNCTION__] = TRUE;
    }

    // Hand off to _strlen()
    $length = _strlen($str);

    return $length;
}
210
/**
 * Locates the first occurrence of a UTF-8 string inside another.
 * @see http://php.net/strpos
 *
 * @author  Harry Fuecks <hfuecks@gmail.com>
 *
 * @param   string   haystack
 * @param   string   needle
 * @param   integer  offset from which character in haystack to start searching
 * @return  integer  position of needle
 * @return  boolean  FALSE if the needle is not found
 */
public static function strpos($str, $search, $offset = 0)
{
    // Lazy-load: the file is required only on the first call to this method
    if (isset(self::$called[__FUNCTION__]) === FALSE)
    {
        require SYSPATH.'core/utf8/'.__FUNCTION__.EXT;

        // Remember the load so later calls skip the require
        self::$called[__FUNCTION__] = TRUE;
    }

    // Hand off to _strpos()
    $position = _strpos($str, $search, $offset);

    return $position;
}
235
/**
 * Locates the last occurrence of a char in a UTF-8 string.
 * @see http://php.net/strrpos
 *
 * @author  Harry Fuecks <hfuecks@gmail.com>
 *
 * @param   string   haystack
 * @param   string   needle
 * @param   integer  offset from which character in haystack to start searching
 * @return  integer  position of needle
 * @return  boolean  FALSE if the needle is not found
 */
public static function strrpos($str, $search, $offset = 0)
{
    // Lazy-load: the file is required only on the first call to this method
    if (isset(self::$called[__FUNCTION__]) === FALSE)
    {
        require SYSPATH.'core/utf8/'.__FUNCTION__.EXT;

        // Remember the load so later calls skip the require
        self::$called[__FUNCTION__] = TRUE;
    }

    // Hand off to _strrpos()
    $position = _strrpos($str, $search, $offset);

    return $position;
}
260
/**
 * Extracts part of a UTF-8 string.
 * @see http://php.net/substr
 *
 * @author  Chris Smith <chris@jalakai.co.uk>
 *
 * @param   string   input string
 * @param   integer  offset
 * @param   integer  length limit
 * @return  string
 */
public static function substr($str, $offset, $length = NULL)
{
    // Lazy-load: the file is required only on the first call to this method
    if (isset(self::$called[__FUNCTION__]) === FALSE)
    {
        require SYSPATH.'core/utf8/'.__FUNCTION__.EXT;

        // Remember the load so later calls skip the require
        self::$called[__FUNCTION__] = TRUE;
    }

    // Hand off to _substr()
    $part = _substr($str, $offset, $length);

    return $part;
}
284
/**
 * Swaps out the text within a portion of a UTF-8 string.
 * @see http://php.net/substr_replace
 *
 * @author  Harry Fuecks <hfuecks@gmail.com>
 *
 * @param   string   input string
 * @param   string   replacement string
 * @param   integer  offset
 * @param   integer  length, passed through unchanged (NULL by default)
 * @return  string
 */
public static function substr_replace($str, $replacement, $offset, $length = NULL)
{
    // Lazy-load: the file is required only on the first call to this method
    if (isset(self::$called[__FUNCTION__]) === FALSE)
    {
        require SYSPATH.'core/utf8/'.__FUNCTION__.EXT;

        // Remember the load so later calls skip the require
        self::$called[__FUNCTION__] = TRUE;
    }

    // Hand off to _substr_replace()
    $replaced = _substr_replace($str, $replacement, $offset, $length);

    return $replaced;
}
308
/**
 * Converts a UTF-8 string to lowercase.
 * @see http://php.net/strtolower
 *
 * @author  Andreas Gohr <andi@splitbrain.org>
 *
 * @param   string   mixed case string
 * @return  string
 */
public static function strtolower($str)
{
    // Lazy-load: the file is required only on the first call to this method
    if (isset(self::$called[__FUNCTION__]) === FALSE)
    {
        require SYSPATH.'core/utf8/'.__FUNCTION__.EXT;

        // Remember the load so later calls skip the require
        self::$called[__FUNCTION__] = TRUE;
    }

    // Hand off to _strtolower()
    $lowered = _strtolower($str);

    return $lowered;
}
330
/**
 * Converts a UTF-8 string to uppercase.
 * @see http://php.net/strtoupper
 *
 * @author  Andreas Gohr <andi@splitbrain.org>
 *
 * @param   string   mixed case string
 * @return  string
 */
public static function strtoupper($str)
{
    // Lazy-load: the file is required only on the first call to this method
    if (isset(self::$called[__FUNCTION__]) === FALSE)
    {
        require SYSPATH.'core/utf8/'.__FUNCTION__.EXT;

        // Remember the load so later calls skip the require
        self::$called[__FUNCTION__] = TRUE;
    }

    // Hand off to _strtoupper()
    $raised = _strtoupper($str);

    return $raised;
}
352
/**
 * Uppercases the first character of a UTF-8 string.
 * @see http://php.net/ucfirst
 *
 * @author  Harry Fuecks <hfuecks@gmail.com>
 *
 * @param   string   mixed case string
 * @return  string
 */
public static function ucfirst($str)
{
    // Lazy-load: the file is required only on the first call to this method
    if (isset(self::$called[__FUNCTION__]) === FALSE)
    {
        require SYSPATH.'core/utf8/'.__FUNCTION__.EXT;

        // Remember the load so later calls skip the require
        self::$called[__FUNCTION__] = TRUE;
    }

    // Hand off to _ucfirst()
    $capitalized = _ucfirst($str);

    return $capitalized;
}
374
/**
 * Uppercases the first character of every word in a UTF-8 string.
 * @see http://php.net/ucwords
 *
 * @author  Harry Fuecks <hfuecks@gmail.com>
 *
 * @param   string   mixed case string
 * @return  string
 */
public static function ucwords($str)
{
    // Lazy-load: the file is required only on the first call to this method
    if (isset(self::$called[__FUNCTION__]) === FALSE)
    {
        require SYSPATH.'core/utf8/'.__FUNCTION__.EXT;

        // Remember the load so later calls skip the require
        self::$called[__FUNCTION__] = TRUE;
    }

    // Hand off to _ucwords()
    $capitalized = _ucwords($str);

    return $capitalized;
}
396
/**
 * Compares two UTF-8 strings without regard to case.
 * @see http://php.net/strcasecmp
 *
 * @author  Harry Fuecks <hfuecks@gmail.com>
 *
 * @param   string   string to compare
 * @param   string   string to compare
 * @return  integer  less than 0 if str1 is less than str2
 * @return  integer  greater than 0 if str1 is greater than str2
 * @return  integer  0 if they are equal
 */
public static function strcasecmp($str1, $str2)
{
    // Lazy-load: the file is required only on the first call to this method
    if (isset(self::$called[__FUNCTION__]) === FALSE)
    {
        require SYSPATH.'core/utf8/'.__FUNCTION__.EXT;

        // Remember the load so later calls skip the require
        self::$called[__FUNCTION__] = TRUE;
    }

    // Hand off to _strcasecmp()
    $order = _strcasecmp($str1, $str2);

    return $order;
}
421
/**
 * Returns a string or an array in which all occurrences of search in
 * subject (ignoring case) are replaced with the given replace value.
 * @see     http://php.net/str_ireplace
 *
 * @note    It's not fast and gets slower if $search and/or $replace are arrays.
 * @author  Harry Fuecks <hfuecks@gmail.com>
 *
 * @param   string|array  text to replace
 * @param   string|array  replacement text
 * @param   string|array  subject text
 * @param   integer       number of matched and replaced needles will be returned via this parameter which is passed by reference
 * @return  string        if the input was a string
 * @return  array         if the input was an array
 */
public static function str_ireplace($search, $replace, $str, & $count = NULL)
{
    // Lazy-load: the file is required only on the first call to this method
    if (isset(self::$called[__FUNCTION__]) === FALSE)
    {
        require SYSPATH.'core/utf8/'.__FUNCTION__.EXT;

        // Remember the load so later calls skip the require
        self::$called[__FUNCTION__] = TRUE;
    }

    // Hand off to _str_ireplace(); $count is forwarded as received
    $replaced = _str_ireplace($search, $replace, $str, $count);

    return $replaced;
}
449
/**
 * Case-insensitive UTF-8 version of strstr. Returns all of input string
 * from the first occurrence of needle to the end.
 * @see http://php.net/stristr
 *
 * @author  Harry Fuecks <hfuecks@gmail.com>
 *
 * @param   string   input string
 * @param   string   needle
 * @return  string   matched substring if found
 * @return  boolean  FALSE if the substring was not found
 */
public static function stristr($str, $search)
{
    // Lazy-load: the file is required only on the first call to this method
    if (isset(self::$called[__FUNCTION__]) === FALSE)
    {
        require SYSPATH.'core/utf8/'.__FUNCTION__.EXT;

        // Remember the load so later calls skip the require
        self::$called[__FUNCTION__] = TRUE;
    }

    // Hand off to _stristr()
    $match = _stristr($str, $search);

    return $match;
}
474
/**
 * Measures the length of the initial segment matching mask.
 * @see http://php.net/strspn
 *
 * @author  Harry Fuecks <hfuecks@gmail.com>
 *
 * @param   string   input string
 * @param   string   mask for search
 * @param   integer  start position of the string to examine
 * @param   integer  length of the string to examine
 * @return  integer  length of the initial segment that contains characters in the mask
 */
public static function strspn($str, $mask, $offset = NULL, $length = NULL)
{
    // Lazy-load: the file is required only on the first call to this method
    if (isset(self::$called[__FUNCTION__]) === FALSE)
    {
        require SYSPATH.'core/utf8/'.__FUNCTION__.EXT;

        // Remember the load so later calls skip the require
        self::$called[__FUNCTION__] = TRUE;
    }

    // Hand off to _strspn()
    $span = _strspn($str, $mask, $offset, $length);

    return $span;
}
499
/**
 * Measures the length of the initial segment not matching mask.
 * @see http://php.net/strcspn
 *
 * @author  Harry Fuecks <hfuecks@gmail.com>
 *
 * @param   string   input string
 * @param   string   mask for search
 * @param   integer  start position of the string to examine
 * @param   integer  length of the string to examine
 * @return  integer  length of the initial segment that contains characters not in the mask
 */
public static function strcspn($str, $mask, $offset = NULL, $length = NULL)
{
    // Lazy-load: the file is required only on the first call to this method
    if (isset(self::$called[__FUNCTION__]) === FALSE)
    {
        require SYSPATH.'core/utf8/'.__FUNCTION__.EXT;

        // Remember the load so later calls skip the require
        self::$called[__FUNCTION__] = TRUE;
    }

    // Hand off to _strcspn()
    $span = _strcspn($str, $mask, $offset, $length);

    return $span;
}
524
/**
 * Pads a UTF-8 string with another string until it reaches a certain length.
 * @see http://php.net/str_pad
 *
 * @author  Harry Fuecks <hfuecks@gmail.com>
 *
 * @param   string   input string
 * @param   integer  desired string length after padding
 * @param   string   string to use as padding
 * @param   string   padding type: STR_PAD_RIGHT, STR_PAD_LEFT, or STR_PAD_BOTH
 * @return  string
 */
public static function str_pad($str, $final_str_length, $pad_str = ' ', $pad_type = STR_PAD_RIGHT)
{
    // Lazy-load: the file is required only on the first call to this method
    if (isset(self::$called[__FUNCTION__]) === FALSE)
    {
        require SYSPATH.'core/utf8/'.__FUNCTION__.EXT;

        // Remember the load so later calls skip the require
        self::$called[__FUNCTION__] = TRUE;
    }

    // Hand off to _str_pad()
    $padded = _str_pad($str, $final_str_length, $pad_str, $pad_type);

    return $padded;
}
549
/**
 * Splits a UTF-8 string into an array.
 * @see http://php.net/str_split
 *
 * @author  Harry Fuecks <hfuecks@gmail.com>
 *
 * @param   string   input string
 * @param   integer  maximum length of each chunk
 * @return  array
 */
public static function str_split($str, $split_length = 1)
{
    // Lazy-load: the file is required only on the first call to this method
    if (isset(self::$called[__FUNCTION__]) === FALSE)
    {
        require SYSPATH.'core/utf8/'.__FUNCTION__.EXT;

        // Remember the load so later calls skip the require
        self::$called[__FUNCTION__] = TRUE;
    }

    // Hand off to _str_split()
    $chunks = _str_split($str, $split_length);

    return $chunks;
}
572
/**
 * Reverses the order of a UTF-8 string.
 * @see http://php.net/strrev
 *
 * @author  Harry Fuecks <hfuecks@gmail.com>
 *
 * @param   string   string to be reversed
 * @return  string
 */
public static function strrev($str)
{
    // Lazy-load: the file is required only on the first call to this method
    if (isset(self::$called[__FUNCTION__]) === FALSE)
    {
        require SYSPATH.'core/utf8/'.__FUNCTION__.EXT;

        // Remember the load so later calls skip the require
        self::$called[__FUNCTION__] = TRUE;
    }

    // Hand off to _strrev()
    $reversed = _strrev($str);

    return $reversed;
}
594
/**
 * Removes whitespace (or other UTF-8 characters) from both the beginning
 * and the end of a string.
 * @see http://php.net/trim
 *
 * @author  Andreas Gohr <andi@splitbrain.org>
 *
 * @param   string   input string
 * @param   string   string of characters to remove
 * @return  string
 */
public static function trim($str, $charlist = NULL)
{
    // Lazy-load: the file is required only on the first call to this method
    if (isset(self::$called[__FUNCTION__]) === FALSE)
    {
        require SYSPATH.'core/utf8/'.__FUNCTION__.EXT;

        // Remember the load so later calls skip the require
        self::$called[__FUNCTION__] = TRUE;
    }

    // Hand off to _trim()
    $trimmed = _trim($str, $charlist);

    return $trimmed;
}
618
/**
 * Removes whitespace (or other UTF-8 characters) from the beginning of a string.
 * @see http://php.net/ltrim
 *
 * @author  Andreas Gohr <andi@splitbrain.org>
 *
 * @param   string   input string
 * @param   string   string of characters to remove
 * @return  string
 */
public static function ltrim($str, $charlist = NULL)
{
    // Lazy-load: the file is required only on the first call to this method
    if (isset(self::$called[__FUNCTION__]) === FALSE)
    {
        require SYSPATH.'core/utf8/'.__FUNCTION__.EXT;

        // Remember the load so later calls skip the require
        self::$called[__FUNCTION__] = TRUE;
    }

    // Hand off to _ltrim()
    $trimmed = _ltrim($str, $charlist);

    return $trimmed;
}
641
642     /**
643      * Strips whitespace (or other UTF-8 characters) from the end of a string.
644      * @see http://php.net/rtrim
645      *
646      * @author  Andreas Gohr <andi@splitbrain.org>
647      *