This is the mail archive of the
libstdc++@gcc.gnu.org
mailing list for the libstdc++ project.
Re: UTF-8 support - char or wchar_t?
Ole Laursen wrote:
Hi,
Paolo Carlini <pcarlini@suse.de> writes:
Carlo Wood wrote:
Can you please explain why one needs wchar_t (wcout)
at all when using UTF-8? I'd expect that UTF-8 fits
in a stream of 8-bit octets and no wchar_t should
be needed at all.
UTF-8 fits in a stream of 8-bit octets of the *external* encoding!
How do you produce this *external* UTF-8 encoded stream?
The standard way is using an *internal* wchar_t representation
(basically, on GNU systems it is UCS4; see the glibc docs), then
exploiting the specialization codecvt<wchar_t, char, mbstate_t>.
So how do you do that? I currently use an ostringstream, how would I
go about getting a std::string in the encoding of the current locale?
You can't. stringbuf doesn't use codecvt, only filebuf does. It's
kind of a pain. One way to do it is to write the data to a file
(or a socket) using an ofstream (or wofstream) with your locale
imbued in it, then read it back in using a basic_ifstream<char>
with the "C" locale imbued in it. The more efficient but more
cumbersome way to do it (w/o file I/O) is to use the codecvt
facet directly. The most portable way to do this, though, is
to forget about codecvt and use iconv directly. Attached is
a simple wrapper function I once wrote for someone who didn't
like the clunky iconv interface. Read the pages below for more
info on iconv:
http://www.opengroup.org/onlinepubs/009695399/functions/iconv.html
Martin
#include <iconv.h>
#include <langinfo.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <fstream>
#include <iostream>
#include <string>
/**
 * Convert the bytes of `from` with the conversion descriptor `cd` and
 * store the converted bytes in `to`.
 *
 * The output buffer is grown and the conversion resumed whenever iconv()
 * reports E2BIG.  Any other iconv() failure stops the conversion; errno is
 * left as set by iconv() so the caller can report it.
 *
 * @param cd    descriptor obtained from iconv_open()
 * @param from  input bytes to convert
 * @param to    receives the converted output (whatever was produced before
 *              an error, if one occurred); always overwritten
 * @return      number of bytes of `from` that were consumed; equal to
 *              from.size() when the whole input was converted
 */
size_t my_iconv (iconv_t cd, const std::string &from, std::string &to)
{
    to.clear ();

    size_t inbytesleft = from.size ();
    if (!inbytesleft)
        return 0;

    // Start with an output buffer as large as the input; it is grown on
    // demand below.
    std::string res;
    res.resize (from.size ());

    // POSIX declares the input argument as `char **` even though iconv()
    // does not modify the input bytes, hence the const_cast.
    char *inbuf = const_cast<char *> (from.data ());
    char *outbuf = &res [0];
    // The space available is that of the scratch buffer, not of `to`
    // (which was just cleared and would always yield 0).
    size_t outbytesleft = res.size ();
    size_t outsize = 0;

    for ( ; ; ) {
        const size_t ret =
            iconv (cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);

        outsize = outbuf - res.data ();

        // Success, or an error other than "output buffer full": stop.
        if (size_t (-1) != ret || E2BIG != errno)
            break;

        // Grow relative to the current capacity so that the buffer strictly
        // increases on every retry.  Growing relative to `outsize` can get
        // stuck when nothing has been written yet and a single output
        // character needs more room than the buffer offers.
        res.resize (res.size () * 2 + 16);

        // resize() may have reallocated: re-derive the write position.
        outbuf = &res [0] + outsize;
        outbytesleft = res.size () - outsize;
        errno = 0;
    }

    res.resize (outsize);
    to.swap (res);

    return inbuf - from.data ();
}
/**
 * Line-by-line character set converter built on my_iconv().
 *
 * Usage: prog [from_code [to_code [input_file]]]
 *
 *   from_code   source encoding; defaults to nl_langinfo (CODESET)
 *   to_code     target encoding; defaults to "" (passed to iconv_open()
 *               unchanged)
 *   input_file  file to read; defaults to standard input
 *
 * Each input line is converted and written to standard output.  On the
 * first line that cannot be fully converted, the part that was converted
 * is still printed, a diagnostic naming the offending byte is written to
 * stderr, and processing stops.
 *
 * Exit status: 2 if the input file cannot be opened, 1 if iconv_open()
 * fails, 0 otherwise.
 *
 * NOTE(review): the exit status stays 0 after a conversion error, as
 * before; consider returning a non-zero status there.
 * NOTE(review): setlocale() is never called here, so nl_langinfo (CODESET)
 * presumably reports the "C" locale's codeset -- confirm whether the
 * user's locale was intended.
 */
int main (int argc, char *argv[])
{
    const char *from_code = argc > 1 ? argv [1] : nl_langinfo (CODESET);
    const char *to_code = argc > 2 ? argv [2] : "";

    // The file buffer lives on the stack so that it is closed and released
    // on every return path (it used to be heap-allocated and never freed).
    std::filebuf file_buf;
    std::istream istrm (std::cin.rdbuf ());

    if (argc > 3) {
        if (!file_buf.open (argv [3], std::ios::in)) {
            fprintf (stderr, "fopen (\"%s\", \"r\") failed: %s\n",
                     argv [3], strerror (errno));
            return 2;
        }
        istrm.rdbuf (&file_buf);
    }

    // iconv_open() takes the *target* encoding first, then the source.
    iconv_t cd = iconv_open (to_code, from_code);
    if (iconv_t (-1) == cd) {
        fprintf (stderr, "iconv_open (\"%s\", \"%s\") failed: %s\n",
                 to_code, from_code, strerror (errno));
        return 1;
    }

    std::string from;
    std::string to;

    while (std::getline (istrm, from)) {
        const size_t ret = my_iconv (cd, from, to);

        std::cout << to << '\n';

        if (ret != from.size ()) {
            // my_iconv() stopped early: `ret` indexes the first byte that
            // was not converted.  Show it literally if it is printable
            // ASCII, as a backslash-octal escape otherwise.
            char str [5];   // backslash + 3 octal digits + NUL
            if (from [ret] < ' ' || from [ret] > '~')
                snprintf (str, sizeof str, "\\%03o",
                          static_cast<unsigned char> (from [ret]));
            else {
                str [0] = from [ret];
                str [1] = '\0';
            }

            // size_t has no portable pre-C99 printf conversion; print it
            // as unsigned long rather than with the mismatched %d.
            fprintf (stderr, "conversion error at offset %lu ('%s'): %s\n",
                     static_cast<unsigned long> (ret), str,
                     strerror (errno));
            break;
        }
    }

    iconv_close (cd);

    return 0;
}