]>
Commit | Line | Data |
---|---|---|
56f48ce9 DB |
1 | /* Multibyte Character Functions. |
2 | Copyright (C) 1998 Free Software Foundation, Inc. | |
3 | ||
4 | This file is part of GNU CC. | |
5 | ||
6 | GNU CC is free software; you can redistribute it and/or modify | |
7 | it under the terms of the GNU General Public License as published by | |
8 | the Free Software Foundation; either version 2, or (at your option) | |
9 | any later version. | |
10 | ||
11 | GNU CC is distributed in the hope that it will be useful, | |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | GNU General Public License for more details. | |
15 | ||
16 | You should have received a copy of the GNU General Public License | |
17 | along with GNU CC; see the file COPYING. If not, write to | |
18 | the Free Software Foundation, 59 Temple Place - Suite 330, | |
19 | Boston, MA 02111-1307, USA. */ | |
20 | ||
21 | /* These functions are used to manipulate multibyte characters. */ | |
22 | ||
23 | /* Note regarding cross compilation: | |
24 | ||
25 | In general translation of multibyte characters to wide characters can | |
26 | only work in a native compiler since the translation function (mbtowc) | |
27 | needs to know about both the source and target character encoding. However, | |
28 | this particular implementation for JIS, SJIS and EUCJP source characters | |
29 | will work for any compiler with a newlib target. Other targets may also | |
30 | work provided that their wchar_t implementation is 2 bytes and the encoding | |
31 | leaves the source character values unchanged (except for removing the | |
32 | state shifting markers). */ | |
33 | ||
34 | #ifdef MULTIBYTE_CHARS | |
35 | #include "config.h" | |
36 | #include "system.h" | |
37 | #include "gansidecl.h" | |
38 | #include "mbchar.h" | |
39 | #include <locale.h> | |
40 | ||
/* Classification of one input byte while decoding "C-JIS" text; used as
   the column index of JIS_state_table and JIS_action_table.  */
typedef enum
{
  ESCAPE,       /* JIS_ESC_CHAR */
  DOLLAR,       /* '$' */
  BRACKET,      /* '(' */
  AT,           /* '@' */
  B,            /* 'B' */
  J,            /* 'J' */
  NUL,          /* '\0' */
  JIS_CHAR,     /* any other byte for which ISJIS is true */
  OTHER,        /* everything else */
  JIS_C_NUM     /* number of classes; not a class itself */
} JIS_CHAR_TYPE;
45 | ||
/* Decoder states for "C-JIS" text; used as the row index of
   JIS_state_table and JIS_action_table.  */
typedef enum
{
  ASCII,        /* initial state: single-byte characters */
  A_ESC,        /* in ASCII, ESC seen */
  A_ESC_DL,     /* in ASCII, ESC '$' seen */
  JIS,          /* two-byte mode, at a character boundary */
  JIS_1,        /* two-byte mode, first byte of a character seen */
  JIS_2,        /* two-byte mode, both bytes of a character seen */
  J_ESC,        /* in JIS, ESC seen */
  J_ESC_BR,     /* in JIS, ESC '(' seen */
  J2_ESC,       /* in JIS_2, ESC seen */
  J2_ESC_BR,    /* in JIS_2, ESC '(' seen */
  INV,          /* invalid input */
  JIS_S_NUM     /* number of states; not a state itself */
} JIS_STATE;
51 | ||
/* What the "C-JIS" decoder does after classifying a byte; these are the
   entries of JIS_action_table.  */
typedef enum
{
  COPYA,        /* deliver one single-byte character */
  COPYJ,        /* deliver a two-byte character, consuming through the
                   current byte */
  COPYJ2,       /* deliver a two-byte character, leaving the current byte
                   unconsumed */
  MAKE_A,       /* restart the pending character after the current byte */
  MAKE_J,       /* likewise; handled identically to MAKE_A */
  NOOP,         /* just advance to the next byte */
  EMPTY,        /* deliver the null wide character */
  ERROR         /* invalid sequence */
} JIS_ACTION;
56 | ||
57 | /***************************************************************************** | |
58 | * state/action tables for processing JIS encoding | |
59 | * Where possible, switches to JIS are grouped with proceding JIS characters | |
60 | * and switches to ASCII are grouped with preceding JIS characters. | |
61 | * Thus, maximum returned length is: | |
62 | * 2 (switch to JIS) + 2 (JIS characters) + 2 (switch back to ASCII) = 6. | |
63 | *****************************************************************************/ | |
64 | static JIS_STATE JIS_state_table[JIS_S_NUM][JIS_C_NUM] = { | |
65 | /* ESCAPE DOLLAR BRACKET AT B J NUL JIS_CHAR OTHER*/ | |
66 | /*ASCII*/ { A_ESC, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII,ASCII,ASCII}, | |
67 | /*A_ESC*/ { ASCII, A_ESC_DL,ASCII, ASCII, ASCII, ASCII, ASCII,ASCII,ASCII}, | |
68 | /*A_ESC_DL*/{ ASCII, ASCII, ASCII, JIS, JIS, ASCII, ASCII,ASCII,ASCII}, | |
69 | /*JIS*/ { J_ESC, JIS_1, JIS_1, JIS_1, JIS_1, JIS_1, INV, JIS_1,INV }, | |
70 | /*JIS_1*/ { INV, JIS_2, JIS_2, JIS_2, JIS_2, JIS_2, INV, JIS_2,INV }, | |
71 | /*JIS_2*/ { J2_ESC,JIS, JIS, JIS, JIS, JIS, INV, JIS, JIS }, | |
72 | /*J_ESC*/ { INV, INV, J_ESC_BR, INV, INV, INV, INV, INV, INV }, | |
73 | /*J_ESC_BR*/{ INV, INV, INV, INV, ASCII, ASCII, INV, INV, INV }, | |
74 | /*J2_ESC*/ { INV, INV, J2_ESC_BR,INV, INV, INV, INV, INV, INV }, | |
75 | /*J2_ESC_BR*/{INV, INV, INV, INV, ASCII, ASCII, INV, INV, INV }, | |
76 | }; | |
77 | ||
78 | static JIS_ACTION JIS_action_table[JIS_S_NUM][JIS_C_NUM] = { | |
79 | /* ESCAPE DOLLAR BRACKET AT B J NUL JIS_CHAR OTHER */ | |
80 | /*ASCII */ {NOOP, COPYA, COPYA, COPYA, COPYA, COPYA, EMPTY, COPYA, COPYA}, | |
81 | /*A_ESC */ {COPYA, NOOP, COPYA, COPYA, COPYA, COPYA, COPYA, COPYA, COPYA}, | |
82 | /*A_ESC_DL */{COPYA, COPYA, COPYA, MAKE_J, MAKE_J, COPYA, COPYA, COPYA, COPYA}, | |
83 | /*JIS */ {NOOP, NOOP, NOOP, NOOP, NOOP, NOOP, ERROR, NOOP, ERROR }, | |
84 | /*JIS_1 */ {ERROR, NOOP, NOOP, NOOP, NOOP, NOOP, ERROR, NOOP, ERROR }, | |
85 | /*JIS_2 */ {NOOP, COPYJ2,COPYJ2,COPYJ2, COPYJ2, COPYJ2,ERROR, COPYJ2,COPYJ2}, | |
86 | /*J_ESC */ {ERROR, ERROR, NOOP, ERROR, ERROR, ERROR, ERROR, ERROR, ERROR }, | |
87 | /*J_ESC_BR */{ERROR, ERROR, ERROR, ERROR, NOOP, NOOP, ERROR, ERROR, ERROR }, | |
88 | /*J2_ESC */ {ERROR, ERROR, NOOP, ERROR, ERROR, ERROR, ERROR, ERROR, ERROR }, | |
89 | /*J2_ESC_BR*/{ERROR, ERROR, ERROR, ERROR, COPYJ, COPYJ, ERROR, ERROR, ERROR }, | |
90 | }; | |
91 | ||
92 | ||
/* Name of the encoding used to interpret multibyte characters.  The names
   recognized in this file are "C-SJIS", "C-EUCJP" and "C-JIS"; NULL, a name
   of at most one character, or any other name selects the default
   handling.  */
char *literal_codeset = NULL;
94 | ||
95 | int | |
96 | local_mbtowc (pwc, s, n) | |
97 | wchar_t *pwc; | |
98 | const char *s; | |
99 | size_t n; | |
100 | { | |
101 | static JIS_STATE save_state = ASCII; | |
102 | JIS_STATE curr_state = save_state; | |
103 | unsigned char *t = (unsigned char *)s; | |
104 | ||
105 | if (s != NULL && n == 0) | |
106 | return -1; | |
107 | ||
108 | if (literal_codeset == NULL || strlen (literal_codeset) <= 1) | |
109 | { | |
110 | /* This must be the "C" locale or unknown locale -- fall thru */ | |
111 | } | |
112 | else if (! strcmp (literal_codeset, "C-SJIS")) | |
113 | { | |
114 | int char1; | |
115 | if (s == NULL) | |
116 | return 0; /* not state-dependent */ | |
117 | char1 = *t; | |
118 | if (ISSJIS1 (char1)) | |
119 | { | |
120 | int char2 = t[1]; | |
121 | if (n <= 1) | |
122 | return -1; | |
123 | if (ISSJIS2 (char2)) | |
124 | { | |
125 | if (pwc != NULL) | |
126 | *pwc = (((wchar_t)*t) << 8) + (wchar_t)(*(t+1)); | |
127 | return 2; | |
128 | } | |
129 | return -1; | |
130 | } | |
131 | if (pwc != NULL) | |
132 | *pwc = (wchar_t)*t; | |
133 | if (*t == '\0') | |
134 | return 0; | |
135 | return 1; | |
136 | } | |
137 | else if (! strcmp (literal_codeset, "C-EUCJP")) | |
138 | { | |
139 | int char1; | |
140 | if (s == NULL) | |
141 | return 0; /* not state-dependent */ | |
142 | char1 = *t; | |
143 | if (ISEUCJP (char1)) | |
144 | { | |
145 | int char2 = t[1]; | |
146 | if (n <= 1) | |
147 | return -1; | |
148 | if (ISEUCJP (char2)) | |
149 | { | |
150 | if (pwc != NULL) | |
151 | *pwc = (((wchar_t)*t) << 8) + (wchar_t)(*(t+1)); | |
152 | return 2; | |
153 | } | |
154 | return -1; | |
155 | } | |
156 | if (pwc != NULL) | |
157 | *pwc = (wchar_t)*t; | |
158 | if (*t == '\0') | |
159 | return 0; | |
160 | return 1; | |
161 | } | |
162 | else if (! strcmp (literal_codeset, "C-JIS")) | |
163 | { | |
164 | JIS_ACTION action; | |
165 | JIS_CHAR_TYPE ch; | |
166 | unsigned char *ptr; | |
167 | int i, curr_ch; | |
168 | ||
169 | if (s == NULL) | |
170 | { | |
171 | save_state = ASCII; | |
172 | return 1; /* state-dependent */ | |
173 | } | |
174 | ||
175 | ptr = t; | |
176 | ||
177 | for (i = 0; i < n; ++i) | |
178 | { | |
179 | curr_ch = t[i]; | |
180 | switch (curr_ch) | |
181 | { | |
182 | case JIS_ESC_CHAR: | |
183 | ch = ESCAPE; | |
184 | break; | |
185 | case '$': | |
186 | ch = DOLLAR; | |
187 | break; | |
188 | case '@': | |
189 | ch = AT; | |
190 | break; | |
191 | case '(': | |
192 | ch = BRACKET; | |
193 | break; | |
194 | case 'B': | |
195 | ch = B; | |
196 | break; | |
197 | case 'J': | |
198 | ch = J; | |
199 | break; | |
200 | case '\0': | |
201 | ch = NUL; | |
202 | break; | |
203 | default: | |
204 | if (ISJIS (curr_ch)) | |
205 | ch = JIS_CHAR; | |
206 | else | |
207 | ch = OTHER; | |
208 | } | |
209 | ||
210 | action = JIS_action_table[curr_state][ch]; | |
211 | curr_state = JIS_state_table[curr_state][ch]; | |
212 | ||
213 | switch (action) | |
214 | { | |
215 | case NOOP: | |
216 | break; | |
217 | case EMPTY: | |
218 | if (pwc != NULL) | |
219 | *pwc = (wchar_t)0; | |
220 | save_state = curr_state; | |
221 | return i; | |
222 | case COPYA: | |
223 | if (pwc != NULL) | |
224 | *pwc = (wchar_t)*ptr; | |
225 | save_state = curr_state; | |
226 | return (i + 1); | |
227 | case COPYJ: | |
228 | if (pwc != NULL) | |
229 | *pwc = (((wchar_t)*ptr) << 8) + (wchar_t)(*(ptr+1)); | |
230 | save_state = curr_state; | |
231 | return (i + 1); | |
232 | case COPYJ2: | |
233 | if (pwc != NULL) | |
234 | *pwc = (((wchar_t)*ptr) << 8) + (wchar_t)(*(ptr+1)); | |
235 | save_state = curr_state; | |
236 | return (ptr - t) + 2; | |
237 | case MAKE_A: | |
238 | case MAKE_J: | |
239 | ptr = (char *)(t + i + 1); | |
240 | break; | |
241 | case ERROR: | |
242 | default: | |
243 | return -1; | |
244 | } | |
245 | } | |
246 | ||
247 | return -1; /* n < bytes needed */ | |
248 | } | |
249 | ||
250 | #ifdef CROSS_COMPILE | |
251 | if (s == NULL) | |
252 | return 0; /* not state-dependent */ | |
253 | if (pwc != NULL) | |
254 | *pwc = *s; | |
255 | return 1; | |
256 | #else | |
257 | /* This must be the "C" locale or unknown locale. */ | |
258 | return mbtowc (pwc, s, n); | |
259 | #endif | |
260 | } | |
261 | ||
/* Return the length in bytes of the multibyte character at S, looking at
   no more than N bytes.  This is local_mbtowc with the converted wide
   character discarded, so the return value follows the same conventions
   (including the S == NULL query).  */
int
local_mblen (s, n)
     const char *s;
     size_t n;
{
  int length;

  length = local_mbtowc (NULL, s, n);
  return length;
}
269 | ||
270 | int | |
271 | local_mb_cur_max () | |
272 | { | |
273 | if (literal_codeset == NULL || strlen (literal_codeset) <= 1) | |
274 | ; | |
275 | else if (! strcmp (literal_codeset, "C-SJIS")) | |
276 | return 2; | |
277 | else if (! strcmp (literal_codeset, "C-EUCJP")) | |
278 | return 2; | |
279 | else if (! strcmp (literal_codeset, "C-JIS")) | |
280 | return 8; /* 3 + 2 + 3 */ | |
281 | ||
282 | #ifdef CROSS_COMPILE | |
283 | return 1; | |
284 | #else | |
285 | return MB_CUR_MAX; | |
286 | #endif | |
287 | } | |
288 | #endif /* MULTIBYTE_CHARS */ |