This is the mail archive of the libstdc++@gcc.gnu.org mailing list for the libstdc++ project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: UTF-8 support - char or wchar_t?


Ole Laursen wrote:

Hi,

Paolo Carlini <pcarlini@suse.de> writes:


Carlo Wood wrote:


Can you please explain why one needs wchar_t (wcout)
at all when using UTF-8?  I'd expect that UTF-8 fits
in a stream of 8-bit octets and no wchar_t should
be needed at all.


UTF-8 fits in a stream of 8-bit octets of the *external* encoding! How do you produce this *external* UTF-8 encoded stream?

The standard way is to use an *internal* wchar_t representation
(on GNU systems this is basically UCS4; see the glibc docs) and then
exploit the specialization codecvt<wchar_t, char, mbstate_t>.


So how do you do that? I currently use an ostringstream, how would I
go about getting a std::string in the encoding of the current locale?

You can't. stringbuf doesn't use codecvt, only filebuf does. It's kind of a pain. One way to do it is to write the data to a file (or a socket) using an ofstream (or wofstream) with your locale imbued in it, then read it back in using a basic_ifstream<char> with the "C" locale imbued in it. The more efficient but more cumbersome way to do it (w/o file I/O) is to use the codecvt facet directly. The most portable way to do this, though, is to forget about codecvt and use iconv directly. Attached is a simple wrapper function I once wrote for someone who didn't like the clunky iconv interface. Read the pages below for more info on iconv: http://www.opengroup.org/onlinepubs/009695399/functions/iconv.html

Martin

#include <iconv.h>
#include <langinfo.h>

#include <errno.h>
#include <locale.h>
#include <stdio.h>
#include <string.h>

#include <fstream>
#include <iostream>
#include <string>


namespace {

// Adapter for the second argument of iconv().  POSIX and glibc declare that
// parameter as `char **`, while some other implementations declare it as
// `const char **`.  Being implicitly convertible to both lets the single call
// site below compile against either prototype.
struct iconv_inbuf_arg
{
    const char **ptr;

    explicit iconv_inbuf_arg (const char **p): ptr (p) { }

    operator const char** () const { return ptr; }
    operator char** () const { return const_cast<char**>(ptr); }
};

}   // unnamed namespace


// Converts the bytes of `from` with the conversion descriptor `cd` and
// stores the converted bytes in `to` (any previous contents are discarded).
//
// Returns the number of bytes of `from` that were consumed.  A return value
// equal to from.size () means the whole input was converted; a smaller value
// is the offset of the first byte that could not be converted, in which case
// `to` holds the output produced up to that point and errno holds the error
// reported by iconv () (e.g. EILSEQ or EINVAL).
//
// The output buffer is grown automatically, so E2BIG is never reported to
// the caller.  The function does not emit a final shift sequence or reset
// the state of `cd`.
size_t my_iconv (iconv_t cd, const std::string &from, std::string &to)
{
    to.clear ();

    size_t inbytesleft = from.size ();

    if (!inbytesleft)
        return 0;

    // Start with as much room as the input occupies and grow on demand.
    std::string res (from.size (), '\0');

    const char *inbuf = from.data ();
    char *outbuf = &res [0];

    // Space available in the working buffer (not in `to`, which is empty).
    size_t outbytesleft = res.size ();

    size_t outsize = 0;

    // Error of the last iconv () call, or 0 if it succeeded.
    int saved_errno = 0;

    while (inbytesleft) {

        const size_t ret = iconv (cd, iconv_inbuf_arg (&inbuf), &inbytesleft,
                                  &outbuf, &outbytesleft);

        saved_errno = size_t (-1) == ret ? errno : 0;

        outsize = static_cast<size_t>(outbuf - res.data ());

        if (E2BIG != saved_errno)
            break;   // either success or an error the caller must see

        // Out of room: enlarge the buffer and resume where iconv () stopped.
        // resize () may reallocate, so outbuf has to be recomputed.
        res.resize ((outsize + 1) * 2);
        outbuf = &res [0] + outsize;
        outbytesleft = res.size () - outsize;
        saved_errno = 0;
    }

    res.resize (outsize);
    to.swap (res);

    // The string operations above may have modified errno; make sure the
    // caller sees the value set by iconv ().
    if (saved_errno)
        errno = saved_errno;

    return static_cast<size_t>(inbuf - from.data ());
}


// Converts text line by line and writes the result to standard output.
//
// Usage: prog [from_code [to_code [file]]]
//
//   from_code  encoding of the input; defaults to the codeset of the
//              current locale
//   to_code    encoding of the output; defaults to "" (which iconv_open ()
//              implementations such as glibc treat as the locale's codeset)
//   file       input file; defaults to standard input
//
// Exit status: 0 on success, 1 if the conversion descriptor cannot be
// opened, 2 if the input file cannot be opened, 3 if a line could not be
// converted completely.
int main (int argc, char *argv[])
{
    // Adopt the character set of the environment; without this call the
    // program runs in the "C" locale and the defaults below ignore the
    // user's settings.
    setlocale (LC_CTYPE, "");

    const char *from_code = argc > 1 ? argv [1] : nl_langinfo (CODESET);
    const char *to_code = argc > 2 ? argv [2] : "";

    // Declared before the stream that may use it so that it outlives the
    // stream and is closed automatically on every return path.
    std::filebuf fb;

    std::istream istrm (std::cin.rdbuf ());

    if (argc > 3) {
        if (!fb.open (argv [3], std::ios::in)) {
            fprintf (stderr, "fopen (\"%s\", \"r\") failed: %s\n",
                     argv [3], strerror (errno));
            return 2;
        }

        istrm.rdbuf (&fb);
    }

    // Note the argument order: iconv_open () takes the target encoding
    // first and the source encoding second.
    iconv_t cd = iconv_open (to_code, from_code);

    if (iconv_t (-1) == cd) {
        fprintf (stderr, "iconv_open (\"%s\", \"%s\") failed: %s\n",
                 to_code, from_code, strerror (errno));
        return 1;
    }

    int status = 0;

    std::string from;
    std::string to;

    while (std::getline (istrm, from)) {

        const size_t ret = my_iconv (cd, from, to);

        // Remember the conversion error before any I/O can overwrite errno.
        const int conv_errno = errno;

        // Whatever was converted is printed even if the line is incomplete.
        std::cout << to << '\n';

        if (ret != from.size ()) {

            // Show the offending byte literally if it is printable ASCII,
            // otherwise as a three-digit octal escape (backslash + 3 digits
            // + NUL fit exactly into the buffer).
            const unsigned char bad = static_cast<unsigned char>(from [ret]);

            char str [5];
            if (bad < ' ' || bad > '~')
                snprintf (str, sizeof str, "\\%03o", static_cast<unsigned>(bad));
            else {
                str [0] = static_cast<char>(bad);
                str [1] = '\0';
            }

            fprintf (stderr, "conversion error at offset %lu ('%s'): %s\n",
                     static_cast<unsigned long>(ret), str,
                     strerror (conv_errno));

            status = 3;
            break;
        }
    }

    iconv_close (cd);

    return status;
}

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]