# phpman > perldoc > Unicode::String

## NAME
    [Unicode::String](https://www.chedong.com/phpMan.php/perldoc/Unicode%3A%3AString/markdown) - String of Unicode characters (UTF-16BE)

## SYNOPSIS
     use [Unicode::String](https://www.chedong.com/phpMan.php/perldoc/Unicode%3A%3AString/markdown) qw(utf8 latin1 utf16be);

     $u = utf8("string");
     $u = latin1("string");
     $u = utf16be("\0s\0t\0r\0i\0n\0g");

     print $u->utf32be;   # 4 byte characters
     print $u->utf16le;   # 2 byte characters + surrogates
     print $u->utf8;      # 1-4 byte characters

## DESCRIPTION
    A "[Unicode::String](https://www.chedong.com/phpMan.php/perldoc/Unicode%3A%3AString/markdown)" object represents a sequence of Unicode characters. Methods are provided to
    convert between various external formats (encodings) and "[Unicode::String](https://www.chedong.com/phpMan.php/perldoc/Unicode%3A%3AString/markdown)" objects, and methods
    are provided for common string manipulations.

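    The functions utf32be(), utf32le(), utf16be(), utf16le(), utf8(), utf7(), latin1(), uhex(),
    uchr() can be imported from the "[Unicode::String](https://www.chedong.com/phpMan.php/perldoc/Unicode%3A%3AString/markdown)" module and will work as constructors
    initializing strings of the corresponding encoding.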

    The "[Unicode::String](https://www.chedong.com/phpMan.php/perldoc/Unicode%3A%3AString/markdown)" objects overload various operators, which means that they in most cases
    can be treated like plain strings.

    Internally a "[Unicode::String](https://www.chedong.com/phpMan.php/perldoc/Unicode%3A%3AString/markdown)" object is represented by a string of 2 byte numbers in network
    byte order (big-endian). This representation is not visible by the API provided, but it might be
    useful to know in order to predict the efficiency of the provided methods.

  METHODS
### Class methods
    The following class methods are available:

    [Unicode::String](https://www.chedong.com/phpMan.php/perldoc/Unicode%3A%3AString/markdown)->stringify_as
    [Unicode::String](https://www.chedong.com/phpMan.php/perldoc/Unicode%3A%3AString/markdown)->stringify_as( $enc )
        This method is used to specify which encoding will be used when "[Unicode::String](https://www.chedong.com/phpMan.php/perldoc/Unicode%3A%3AString/markdown)" objects
        are implicitly converted to and from plain strings.

        If an argument is provided it sets the current encoding. The argument should have one of the
        following: "ucs4", "utf32", "utf32be", "utf32le", "ucs2", "utf16", "utf16be", "utf16le",
        "utf8", "utf7", "latin1" or "hex". The default is "utf8".

        The stringify_as() method returns a reference to the current encoding function.

    $us = [Unicode::String](https://www.chedong.com/phpMan.php/perldoc/Unicode%3A%3AString/markdown)->new
    $us = [Unicode::String](https://www.chedong.com/phpMan.php/perldoc/Unicode%3A%3AString/markdown)->new( $initial_value )
        This is the object constructor. Without argument, it creates an empty "[Unicode::String](https://www.chedong.com/phpMan.php/perldoc/Unicode%3A%3AString/markdown)"
        object. If an $initial_value argument is given, it is decoded according to the specified
        stringify_as() encoding, UTF-8 by default.

        In general it is recommended to import and use one of the encoding specific constructor
        functions instead of invoking this method.

### Encoding methods
        These methods get or set the value of the "Unicode::String" object by passing strings in the
        corresponding encoding. If a new value is passed as argument it will set the value of the
        "Unicode::String", and the previous value is returned. If no argument is passed then the current
        value is returned.
 
        To illustrate the encodings we show how the 2 character sample string of "µm" (micro meter) is encoded for each one.
 
        =over 4
 
        =item $us->utf32be

        =item $us->utf32be( $newval )

        The string passed should be in the UTF-32 encoding with bytes in big endian order.  The sample "µm" is "\0\0\0\xB5\0\0\0m" in this encoding.

        Alternative names for this method are utf32() and ucs4().
 
        =item $us->utf32le

        =item $us->utf32le( $newval )

        The string passed should be in the UTF-32 encoding with bytes in little endian order.  The sample "µm" is "\xB5\0\0\0m\0\0\0" in this encoding.
 
        =item $us->utf16be

        =item $us->utf16be( $newval )

        The string passed should be in the UTF-16 encoding with bytes in big endian order. The sample "µm" is "\0\xB5\0m" in this encoding.

        Alternative names for this method are utf16() and ucs2().

        If the string passed to utf16be() starts with the Unicode byte order mark in little endian order, the result is as if utf16le() was called instead.
 
        =item $us->utf16le

        =item $us->utf16le( $newval )

        The string passed should be in the UTF-16 encoding with bytes in little endian order.  The sample "µm" is "\xB5\0m\0" in this
        encoding.  This is the encoding used by the Microsoft Windows API.

        If the string passed to utf16le() starts with the Unicode byte order mark in big endian order, the result is as if utf16be() was called instead.
 
        =item $us->utf8

        =item $us->utf8( $newval )

        The string passed should be in the UTF-8 encoding. The sample "µm" is "\xC2\xB5m" in this encoding.
 
        =item $us->utf7

        =item $us->utf7( $newval )

        The string passed should be in the UTF-7 encoding. The sample "µm" is "+ALU-m" in this encoding.

        The UTF-7 encoding only uses plain US-ASCII characters for the encoding.  This makes it safe for transport through 8-bit stripping
        protocols.  Characters outside the US-ASCII range are base64-encoded and '+' is used as an escape character.  The UTF-7 encoding is described in RFC 1642.

        If the (global) variable $Unicode::String::UTF7_OPTIONAL_DIRECT_CHARS is TRUE, then a wider range of characters are encoded as themselves.
        It is even TRUE by default.  The characters affected by this are:
 
           ! " # $ % & * ; < = > @ [ ] ^ _ ` { | } 
 
        =item $us->latin1

        =item $us->latin1( $newval )

        The string passed should be in the ISO-8859-1 encoding. The sample "µm" is "\xB5m" in this encoding.

        Characters outside the "\x00" .. "\xFF" range are simply removed from the return value of the latin1() method.  If you want more control
        over the mapping from Unicode to ISO-8859-1, use the C<[Unicode::Map8](https://www.chedong.com/phpMan.php/perldoc/Unicode%3A%3AMap8/markdown)> class.  This is also the way to deal with other 8-bit character sets.
 
        =item $us->hex

        =item $us->hex( $newval )

        The string passed should be plain ASCII where each Unicode character is represented by the "U+XXXX" string and separated by a single space
        character.  The "U+" prefix is optional when setting the value.  The sample "µm" is "U+00b5 U+006d" in this encoding.
 
        =back

        =head2 String Operations

        The following methods are available:

        =over 4
 
        =item $us->as_string

        Converts a C<[Unicode::String](https://www.chedong.com/phpMan.php/perldoc/Unicode%3A%3AString/markdown)> to a plain string according to the setting of stringify_as().  The default stringify_as() encoding is "utf8".
 
        =item $us->as_num

        Converts a C<[Unicode::String](https://www.chedong.com/phpMan.php/perldoc/Unicode%3A%3AString/markdown)> to a number.  Currently only the digits in the range 0x30 .. 0x39 are recognized.  The plan is to eventually support all Unicode digit characters.
 
        =item $us->as_bool

        Converts a C<[Unicode::String](https://www.chedong.com/phpMan.php/perldoc/Unicode%3A%3AString/markdown)> to a boolean value.  Only the empty string is FALSE.  A string consisting of only the character U+0030 is considered TRUE, even if Perl considers "0" to be FALSE.
 
        =item $us->repeat( $count )

        Returns a new C<[Unicode::String](https://www.chedong.com/phpMan.php/perldoc/Unicode%3A%3AString/markdown)> where the content of $us is repeated $count times.  This operation is also overloaded as:

          $us x $count
 
        =item $us->concat( $other_string )

        Concatenates the string $us and the string $other_string.  If $other_string is not an C<[Unicode::String](https://www.chedong.com/phpMan.php/perldoc/Unicode%3A%3AString/markdown)> object, then it is first passed to the [Unicode::String](https://www.chedong.com/phpMan.php/perldoc/Unicode%3A%3AString/markdown)->new constructor function.  This
        operation is also overloaded as:

          $us . $other_string
 
        =item $us->append( $other_string )

        Appends the string $other_string to the value of $us.  If $other_string is not an C<[Unicode::String](https://www.chedong.com/phpMan.php/perldoc/Unicode%3A%3AString/markdown)> object, then it is first passed to the [Unicode::String](https://www.chedong.com/phpMan.php/perldoc/Unicode%3A%3AString/markdown)->new constructor function.  This
        operation is also overloaded as:

          $us .= $other_string
 
        =item $us->copy

        Returns a copy of the current C<[Unicode::String](https://www.chedong.com/phpMan.php/perldoc/Unicode%3A%3AString/markdown)> object.  This operation is overloaded as the assignment operator.
 
        =item $us->length

        Returns the length of the C<[Unicode::String](https://www.chedong.com/phpMan.php/perldoc/Unicode%3A%3AString/markdown)>.  Surrogate pairs are still counted as 2.
 
        =item $us->byteswap

        This method will swap the bytes in the internal representation of the C<[Unicode::String](https://www.chedong.com/phpMan.php/perldoc/Unicode%3A%3AString/markdown)> object.

        Unicode reserves the character U+FEFF as a byte order mark.  This works because the swapped character, U+FFFE, is reserved to not
        be valid.  For strings that have the byte order mark as the first character, we can guarantee to get the byte order right with the following code:

           $ustr->byteswap if $ustr->ord == 0xFFFE;
 
        =item $us->unpack

        Returns a list of integers each representing an UCS-2 character code.
 
        =item $us->pack( @uchr )

        Sets the value of $us as a sequence of UCS-2 characters with the character codes given as parameter.
 
        =item $us->ord

        Returns the character code of the first character in $us.  The ord() method deals with surrogate pairs, which gives us a result-range of
        0x0 .. 0x10FFFF.  If the $us string is empty, undef is returned.
 
        =item $us->chr( $code )

        Sets the value of $us to be a string containing the character assigned code $code.  The argument $code must be an integer in the range 0x0
        .. 0x10FFFF.  If the code is greater than 0xFFFF then a surrogate pair is created.
 
        =item $us->name

        In scalar context returns the official Unicode name of the first character in $us.  In array context returns the name of all characters in $us.  Also see L<[Unicode::CharName](https://www.chedong.com/phpMan.php/perldoc/Unicode%3A%3ACharName/markdown)>.
 
        =item $us->substr( $offset )

        =item $us->substr( $offset, $length )

        =item $us->substr( $offset, $length, $subst )

        Returns a sub-string of $us.  Works similar to the builtin substr() function.
 
        =item $us->index( $other )

        =item $us->index( $other, $pos )

        Locates the position of $other within $us, possibly starting the search at position $pos.
 
        =item $us->chop

        Chops off the last character of $us and returns it (as a C<[Unicode::String](https://www.chedong.com/phpMan.php/perldoc/Unicode%3A%3AString/markdown)> object).
 
        =back

        =head1 FUNCTIONS

        The following functions are provided.  None of these are exported by default.

        =over 4
 
        =item byteswap2( $str, ... )

        This function will swap 2 and 2 bytes in the strings passed as arguments.  If this function is called in void context, then it will modify its arguments in-place.  Otherwise, the swapped
        strings are returned.
 
        =item byteswap4( $str, ... )

        The byteswap4 function works similar to byteswap2, but will reverse the order of 4 and 4 bytes.
 
        =item latin1( $str )

        =item utf7( $str )

        =item utf8( $str )

        =item utf16le( $str )

        =item utf16be( $str )

        =item utf32le( $str )

        =item utf32be( $str )

        Constructor functions for the various Unicode encodings.  These return new C<[Unicode::String](https://www.chedong.com/phpMan.php/perldoc/Unicode%3A%3AString/markdown)> objects.  The provided argument should be encoded correspondingly.
 
        =item uhex( $str )

        Constructs a new C<[Unicode::String](https://www.chedong.com/phpMan.php/perldoc/Unicode%3A%3AString/markdown)> object from a string of hex values.  See hex() method above for description of the format.
 
        =item uchr( $num )

        Constructs a new one character C<[Unicode::String](https://www.chedong.com/phpMan.php/perldoc/Unicode%3A%3AString/markdown)> object from a Unicode character code.  This works similar to perl's builtin chr() function.
 
        =back

        =head1 SEE ALSO
 
        L<[Unicode::CharName](https://www.chedong.com/phpMan.php/perldoc/Unicode%3A%3ACharName/markdown)>  L<[Unicode::Map8](https://www.chedong.com/phpMan.php/perldoc/Unicode%3A%3AMap8/markdown)> 
 
        L<<http://www.unicode.org/>> 
 
        L<perlunicode> 
 
        =head1 COPYRIGHT 
 
        Copyright 1997-2000,2005 Gisle Aas 
 
        This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself.
 
        =cut 
 
        # Some old code that is not used any more (because the methods are
        # now implemented as XS) and which I did not want to throw away yet.
 
        sub ucs4_inper 
 
            my $self = shift      unless (ref $self) {  	my $u = new [Unicode::String](https://www.chedong.com/phpMan.php/perldoc/Unicode%3A%3AString/markdown)  	$u->ucs4($self)  	return $u            my $old = pack("N*", $self->ord)      if (@_)   	$$self = "" 
        	for (unpack("N*", shift)) {  	    $self->append(uchr($_))  	}            $old 
 
        sub utf8_inper 
 
            my $self = shift      unless (ref $self) {  	# act as ctor  	my $u = new [Unicode::String](https://www.chedong.com/phpMan.php/perldoc/Unicode%3A%3AString/markdown)  	$u->utf8($self)  	return $u       
 
            my $old;      if (defined($$self) && defined wantarray)   	# encode UTF-  	my $uc;  	for $uc (unpack("n*", $$self))   	    if ($uc < 0x80) {  		# 1 byte representatio  		$old .= chr($uc) 
        	    } elsif ($uc < 0x800) {  		# 2 byte representatio  		$old .= chr(0xC0 | ($uc >> 6)) .                          chr(0x80 | ($uc & 0x3F))  	    } else   		# 3 byte representatio 
        		$old .= chr(0xE0 | ($uc >> 12))   		        chr(0x80 | (($uc >> 6) & 0x3F))   			chr(0x80 | ($uc & 0x3F));  	    }  	}       
 
            if (@_)   	if (defined $_[0])   	    $$self = ""  	    my $bytes = shift  	    $bytes =~ s/^[\200-\277]+//;  # can't start with 10xxxxxx  	    while (length $bytes) { 
        		if ($bytes =~ s/^([\000-\177]+)//) {  		    $$self .= pack("n*", unpack("C*", $1))  		} elsif ($bytes =~ s/^([\300-\337])([\200-\277])//)   		    my($b1,$b2) = (ord($1), ord($2)) 
        		    $$self .= pack("n", (($b1 & 0x1F) << 6) | ($b2 & 0x3F));  		} elsif ($bytes =~ s/^([\340-\357])([\200-\277])([\200-\277])//) {  		    my($b1,$b2,$b3) = (ord($1), ord($2), ord($3)); 
        		    $$self .= pack("n", (($b1 & 0x0F) << 12) |                                          (($b2 & 0x3F) <<  6) |  				         ($b3 & 0x3F))  		} else {  		    croak "Bad UTF-8 data"  		 
        	    }  	} else   	    $$self = undef;  	}       
 
            $old 
 
        sub latin1_inper 
 
            my $self = shift      unless (ref $self) {  	# act as ctor  	my $u = new [Unicode::String](https://www.chedong.com/phpMan.php/perldoc/Unicode%3A%3AString/markdown)  	$u->latin1($self)  	return $u       
 
            my $old;      # XXX: should really check that none of the chars > 25      $old = pack("C*", unpack("n*", $$self)) if defined $$self; 
 
            if (@_)   	# set the value  	if (defined $_[0])   	    $$self = pack("n*", unpack("C*", $_[0]));  	} else   	    $$self = undef;  	}            $old 
 
