]>
Commit | Line | Data |
---|---|---|
56f48ce9 DB |
1 | /* Multibyte Character Functions. |
2 | Copyright (C) 1998 Free Software Foundation, Inc. | |
3 | ||
4 | This file is part of GNU CC. | |
5 | ||
6 | GNU CC is free software; you can redistribute it and/or modify | |
7 | it under the terms of the GNU General Public License as published by | |
8 | the Free Software Foundation; either version 2, or (at your option) | |
9 | any later version. | |
10 | ||
11 | GNU CC is distributed in the hope that it will be useful, | |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | GNU General Public License for more details. | |
15 | ||
16 | You should have received a copy of the GNU General Public License | |
17 | along with GNU CC; see the file COPYING. If not, write to | |
18 | the Free Software Foundation, 59 Temple Place - Suite 330, | |
19 | Boston, MA 02111-1307, USA. */ | |
20 | ||
21 | /* These functions are used to manipulate multibyte characters. */ | |
22 | ||
23 | /* Note regarding cross compilation: | |
24 | ||
25 | In general translation of multibyte characters to wide characters can | |
26 | only work in a native compiler since the translation function (mbtowc) | |
27 | needs to know about both the source and target character encoding. However, | |
28 | this particular implementation for JIS, SJIS and EUCJP source characters | |
29 | will work for any compiler with a newlib target. Other targets may also | |
30 | work provided that their wchar_t implementation is 2 bytes and the encoding | |
31 | leaves the source character values unchanged (except for removing the | |
32 | state shifting markers). */ | |
33 | ||
34 | #ifdef MULTIBYTE_CHARS | |
35 | #include "config.h" | |
36 | #include "system.h" | |
56f48ce9 DB |
37 | #include "mbchar.h" |
38 | #include <locale.h> | |
39 | ||
/* Classification of a single input byte while decoding the "C-JIS"
   codeset.  The values index the columns of JIS_state_table and
   JIS_action_table, so their order must match the table layout.  */
typedef enum
{
  ESCAPE,	/* JIS_ESC_CHAR */
  DOLLAR,	/* '$' */
  BRACKET,	/* '(' */
  AT,		/* '@' */
  B,		/* 'B' */
  J,		/* 'J' */
  NUL,		/* '\0' */
  JIS_CHAR,	/* any other byte for which ISJIS is true */
  OTHER,	/* everything else */
  JIS_C_NUM	/* number of character classes (table width) */
} JIS_CHAR_TYPE;
44 | ||
/* States of the "C-JIS" decoder.  The values index the rows of
   JIS_state_table and JIS_action_table, so their order must match the
   table layout.  */
typedef enum
{
  ASCII,	/* initial state; single-byte characters */
  A_ESC,	/* ASCII, escape seen */
  A_ESC_DL,	/* ASCII, escape and '$' seen */
  JIS,		/* two-byte mode, no byte of the character read yet */
  JIS_1,	/* two-byte mode, first byte read */
  JIS_2,	/* two-byte mode, both bytes read */
  J_ESC,	/* JIS, escape seen */
  J_ESC_BR,	/* JIS, escape and '(' seen */
  J2_ESC,	/* JIS_2, escape seen */
  J2_ESC_BR,	/* JIS_2, escape and '(' seen */
  INV,		/* invalid sequence */
  JIS_S_NUM	/* number of states (table height) */
} JIS_STATE;
50 | ||
/* Actions the "C-JIS" decoder performs after classifying a byte; these
   are the entries of JIS_action_table.  */
typedef enum
{
  COPYA,	/* store one byte as the result; consume through this byte */
  COPYJ,	/* store a two-byte character; consume through this byte */
  COPYJ2,	/* store a two-byte character; do not consume this byte */
  MAKE_A,	/* mark the next byte as the start of the character */
  MAKE_J,	/* mark the next byte as the start of the character */
  NOOP,		/* nothing to do, keep scanning */
  EMPTY,	/* store a null wide character */
  ERROR		/* invalid sequence */
} JIS_ACTION;
55 | ||
56 | /***************************************************************************** | |
57 | * state/action tables for processing JIS encoding | |
58 | * Where possible, switches to JIS are grouped with following JIS characters | |
59 | * and switches to ASCII are grouped with preceding JIS characters. | |
60 | * Thus, maximum returned length is: | |
61 | * 3 (switch to JIS) + 2 (JIS characters) + 3 (switch back to ASCII) = 8. | |
62 | *****************************************************************************/ | |
63 | static JIS_STATE JIS_state_table[JIS_S_NUM][JIS_C_NUM] = { | |
64 | /* ESCAPE DOLLAR BRACKET AT B J NUL JIS_CHAR OTHER*/ | |
65 | /*ASCII*/ { A_ESC, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII,ASCII,ASCII}, | |
66 | /*A_ESC*/ { ASCII, A_ESC_DL,ASCII, ASCII, ASCII, ASCII, ASCII,ASCII,ASCII}, | |
67 | /*A_ESC_DL*/{ ASCII, ASCII, ASCII, JIS, JIS, ASCII, ASCII,ASCII,ASCII}, | |
68 | /*JIS*/ { J_ESC, JIS_1, JIS_1, JIS_1, JIS_1, JIS_1, INV, JIS_1,INV }, | |
69 | /*JIS_1*/ { INV, JIS_2, JIS_2, JIS_2, JIS_2, JIS_2, INV, JIS_2,INV }, | |
70 | /*JIS_2*/ { J2_ESC,JIS, JIS, JIS, JIS, JIS, INV, JIS, JIS }, | |
71 | /*J_ESC*/ { INV, INV, J_ESC_BR, INV, INV, INV, INV, INV, INV }, | |
72 | /*J_ESC_BR*/{ INV, INV, INV, INV, ASCII, ASCII, INV, INV, INV }, | |
73 | /*J2_ESC*/ { INV, INV, J2_ESC_BR,INV, INV, INV, INV, INV, INV }, | |
74 | /*J2_ESC_BR*/{INV, INV, INV, INV, ASCII, ASCII, INV, INV, INV }, | |
75 | }; | |
76 | ||
77 | static JIS_ACTION JIS_action_table[JIS_S_NUM][JIS_C_NUM] = { | |
78 | /* ESCAPE DOLLAR BRACKET AT B J NUL JIS_CHAR OTHER */ | |
79 | /*ASCII */ {NOOP, COPYA, COPYA, COPYA, COPYA, COPYA, EMPTY, COPYA, COPYA}, | |
80 | /*A_ESC */ {COPYA, NOOP, COPYA, COPYA, COPYA, COPYA, COPYA, COPYA, COPYA}, | |
81 | /*A_ESC_DL */{COPYA, COPYA, COPYA, MAKE_J, MAKE_J, COPYA, COPYA, COPYA, COPYA}, | |
82 | /*JIS */ {NOOP, NOOP, NOOP, NOOP, NOOP, NOOP, ERROR, NOOP, ERROR }, | |
83 | /*JIS_1 */ {ERROR, NOOP, NOOP, NOOP, NOOP, NOOP, ERROR, NOOP, ERROR }, | |
84 | /*JIS_2 */ {NOOP, COPYJ2,COPYJ2,COPYJ2, COPYJ2, COPYJ2,ERROR, COPYJ2,COPYJ2}, | |
85 | /*J_ESC */ {ERROR, ERROR, NOOP, ERROR, ERROR, ERROR, ERROR, ERROR, ERROR }, | |
86 | /*J_ESC_BR */{ERROR, ERROR, ERROR, ERROR, NOOP, NOOP, ERROR, ERROR, ERROR }, | |
87 | /*J2_ESC */ {ERROR, ERROR, NOOP, ERROR, ERROR, ERROR, ERROR, ERROR, ERROR }, | |
88 | /*J2_ESC_BR*/{ERROR, ERROR, ERROR, ERROR, COPYJ, COPYJ, ERROR, ERROR, ERROR }, | |
89 | }; | |
90 | ||
91 | ||
/* Name of the codeset used to interpret multibyte characters.  The
   functions in this file recognize "C-SJIS", "C-EUCJP" and "C-JIS"; a
   null pointer or a name of at most one character is treated as the
   "C" locale or an unknown locale.  */
char *literal_codeset = NULL;
93 | ||
94 | int | |
95 | local_mbtowc (pwc, s, n) | |
96 | wchar_t *pwc; | |
97 | const char *s; | |
98 | size_t n; | |
99 | { | |
100 | static JIS_STATE save_state = ASCII; | |
101 | JIS_STATE curr_state = save_state; | |
102 | unsigned char *t = (unsigned char *)s; | |
103 | ||
104 | if (s != NULL && n == 0) | |
105 | return -1; | |
106 | ||
107 | if (literal_codeset == NULL || strlen (literal_codeset) <= 1) | |
108 | { | |
109 | /* This must be the "C" locale or unknown locale -- fall thru */ | |
110 | } | |
111 | else if (! strcmp (literal_codeset, "C-SJIS")) | |
112 | { | |
113 | int char1; | |
114 | if (s == NULL) | |
115 | return 0; /* not state-dependent */ | |
116 | char1 = *t; | |
117 | if (ISSJIS1 (char1)) | |
118 | { | |
119 | int char2 = t[1]; | |
120 | if (n <= 1) | |
121 | return -1; | |
122 | if (ISSJIS2 (char2)) | |
123 | { | |
124 | if (pwc != NULL) | |
125 | *pwc = (((wchar_t)*t) << 8) + (wchar_t)(*(t+1)); | |
126 | return 2; | |
127 | } | |
128 | return -1; | |
129 | } | |
130 | if (pwc != NULL) | |
131 | *pwc = (wchar_t)*t; | |
132 | if (*t == '\0') | |
133 | return 0; | |
134 | return 1; | |
135 | } | |
136 | else if (! strcmp (literal_codeset, "C-EUCJP")) | |
137 | { | |
138 | int char1; | |
139 | if (s == NULL) | |
140 | return 0; /* not state-dependent */ | |
141 | char1 = *t; | |
142 | if (ISEUCJP (char1)) | |
143 | { | |
144 | int char2 = t[1]; | |
145 | if (n <= 1) | |
146 | return -1; | |
147 | if (ISEUCJP (char2)) | |
148 | { | |
149 | if (pwc != NULL) | |
150 | *pwc = (((wchar_t)*t) << 8) + (wchar_t)(*(t+1)); | |
151 | return 2; | |
152 | } | |
153 | return -1; | |
154 | } | |
155 | if (pwc != NULL) | |
156 | *pwc = (wchar_t)*t; | |
157 | if (*t == '\0') | |
158 | return 0; | |
159 | return 1; | |
160 | } | |
161 | else if (! strcmp (literal_codeset, "C-JIS")) | |
162 | { | |
163 | JIS_ACTION action; | |
164 | JIS_CHAR_TYPE ch; | |
165 | unsigned char *ptr; | |
166 | int i, curr_ch; | |
167 | ||
168 | if (s == NULL) | |
169 | { | |
170 | save_state = ASCII; | |
171 | return 1; /* state-dependent */ | |
172 | } | |
173 | ||
174 | ptr = t; | |
175 | ||
176 | for (i = 0; i < n; ++i) | |
177 | { | |
178 | curr_ch = t[i]; | |
179 | switch (curr_ch) | |
180 | { | |
181 | case JIS_ESC_CHAR: | |
182 | ch = ESCAPE; | |
183 | break; | |
184 | case '$': | |
185 | ch = DOLLAR; | |
186 | break; | |
187 | case '@': | |
188 | ch = AT; | |
189 | break; | |
190 | case '(': | |
191 | ch = BRACKET; | |
192 | break; | |
193 | case 'B': | |
194 | ch = B; | |
195 | break; | |
196 | case 'J': | |
197 | ch = J; | |
198 | break; | |
199 | case '\0': | |
200 | ch = NUL; | |
201 | break; | |
202 | default: | |
203 | if (ISJIS (curr_ch)) | |
204 | ch = JIS_CHAR; | |
205 | else | |
206 | ch = OTHER; | |
207 | } | |
208 | ||
209 | action = JIS_action_table[curr_state][ch]; | |
210 | curr_state = JIS_state_table[curr_state][ch]; | |
211 | ||
212 | switch (action) | |
213 | { | |
214 | case NOOP: | |
215 | break; | |
216 | case EMPTY: | |
217 | if (pwc != NULL) | |
218 | *pwc = (wchar_t)0; | |
219 | save_state = curr_state; | |
220 | return i; | |
221 | case COPYA: | |
222 | if (pwc != NULL) | |
223 | *pwc = (wchar_t)*ptr; | |
224 | save_state = curr_state; | |
225 | return (i + 1); | |
226 | case COPYJ: | |
227 | if (pwc != NULL) | |
228 | *pwc = (((wchar_t)*ptr) << 8) + (wchar_t)(*(ptr+1)); | |
229 | save_state = curr_state; | |
230 | return (i + 1); | |
231 | case COPYJ2: | |
232 | if (pwc != NULL) | |
233 | *pwc = (((wchar_t)*ptr) << 8) + (wchar_t)(*(ptr+1)); | |
234 | save_state = curr_state; | |
235 | return (ptr - t) + 2; | |
236 | case MAKE_A: | |
237 | case MAKE_J: | |
238 | ptr = (char *)(t + i + 1); | |
239 | break; | |
240 | case ERROR: | |
241 | default: | |
242 | return -1; | |
243 | } | |
244 | } | |
245 | ||
246 | return -1; /* n < bytes needed */ | |
247 | } | |
248 | ||
249 | #ifdef CROSS_COMPILE | |
250 | if (s == NULL) | |
251 | return 0; /* not state-dependent */ | |
252 | if (pwc != NULL) | |
253 | *pwc = *s; | |
254 | return 1; | |
255 | #else | |
256 | /* This must be the "C" locale or unknown locale. */ | |
257 | return mbtowc (pwc, s, n); | |
258 | #endif | |
259 | } | |
260 | ||
/* Return what local_mbtowc returns for the multibyte character at S
   (at most N bytes), without storing the converted wide character.  */
int
local_mblen (s, n)
     const char *s;
     size_t n;
{
  int length;

  /* A null destination tells local_mbtowc not to store a result.  */
  length = local_mbtowc (NULL, s, n);
  return length;
}
268 | ||
269 | int | |
270 | local_mb_cur_max () | |
271 | { | |
272 | if (literal_codeset == NULL || strlen (literal_codeset) <= 1) | |
273 | ; | |
274 | else if (! strcmp (literal_codeset, "C-SJIS")) | |
275 | return 2; | |
276 | else if (! strcmp (literal_codeset, "C-EUCJP")) | |
277 | return 2; | |
278 | else if (! strcmp (literal_codeset, "C-JIS")) | |
279 | return 8; /* 3 + 2 + 3 */ | |
280 | ||
281 | #ifdef CROSS_COMPILE | |
282 | return 1; | |
283 | #else | |
4d2a3f76 DB |
284 | if (MB_CUR_MAX > 0) |
285 | return MB_CUR_MAX; | |
286 | ||
287 | return 1; /* default */ | |
56f48ce9 DB |
288 | #endif |
289 | } | |
290 | #endif /* MULTIBYTE_CHARS */ |