Ruby  2.5.0dev(2017-10-22revision60238)
nkf.c
Go to the documentation of this file.
1 /*
2  * NKF - Ruby extension for Network Kanji Filter
3  *
4  * original nkf2.x is maintained at http://sourceforge.jp/projects/nkf/
5  *
6  * $Id$
7  *
8  */
9 
10 #define RUBY_NKF_REVISION "$Revision$"
11 #define RUBY_NKF_VERSION NKF_VERSION " (" NKF_RELEASE_DATE ")"
12 
13 #include "ruby/ruby.h"
14 #include "ruby/encoding.h"
15 
16 /* Redirect nkf's character I/O: the macros below make the included nkf code read from the in-memory `input` buffer and write through rb_nkf_putchar(). */
17 /* Any earlier getc/ungetc definitions are dropped first; the replacements ignore their stream argument. */
18 
19 #undef getc
20 #undef ungetc
21 #define getc(f) (input_ctr>=i_len?-1:input[input_ctr++]) /* next input byte, or -1 once input_ctr has reached i_len */
22 #define ungetc(c,f) input_ctr-- /* steps the read position back by one; `c` itself is not stored anywhere */
23 
24 #define INCSIZE 32 /* starting value of `incsize` (assigned in rb_nkf_convert) */
25 #undef putchar
26 #undef TRUE /* presumably cleared so the nkf headers included below can define their own -- confirm */
27 #undef FALSE
28 #define putchar(c) rb_nkf_putchar(c) /* every putchar() in the included code appends to the result buffer */
29 
30 /* Shared conversion state: buffer pointers, read/write positions and sizes */
31 
32 static unsigned char *output; /* write buffer; points into the storage of `result` */
33 static unsigned char *input; /* read buffer; set from the source string by the entry points */
34 static int input_ctr; /* index of the next byte getc() will return */
35 static int i_len; /* number of bytes available in `input` */
36 static int output_ctr; /* index at which rb_nkf_putchar() stores the next byte */
37 static int o_len; /* current capacity of `output` */
38 static int incsize; /* amount added to o_len on the next growth; doubled after each growth */
39 
40 static VALUE result; /* string being filled while a conversion runs; Qnil otherwise */
41 
42 static int
43 rb_nkf_putchar(unsigned int c)
44 {
45  if (output_ctr >= o_len) {
46  o_len += incsize;
47  rb_str_resize(result, o_len);
48  incsize *= 2;
49  output = (unsigned char *)RSTRING_PTR(result);
50  }
51  output[output_ctr++] = c;
52 
53  return c;
54 }
55 
56 /* Include kanji filter main part */
57 /* getchar and putchar will be replaced during inclusion */
58 
59 #define PERL_XS 1
60 #include "nkf-utf8/config.h"
61 #include "nkf-utf8/utf8tbl.c"
62 #include "nkf-utf8/nkf.c"
63 
65 {
66  int idx = rb_enc_find_index(name);
67  if (idx < 0) {
68  nkf_encoding *nkf_enc = nkf_enc_find(name);
70  if (idx < 0) {
71  idx = rb_define_dummy_encoding(name);
72  }
73  }
74  return rb_enc_from_index(idx);
75 }
76 
77 int nkf_split_options(const char *arg)
78 {
79  int count = 0;
80  unsigned char option[256];
81  int i = 0, j = 0;
82  int is_escaped = FALSE;
83  int is_single_quoted = FALSE;
84  int is_double_quoted = FALSE;
85  for(i = 0; arg[i]; i++){
86  if(j == 255){
87  return -1;
88  }else if(is_single_quoted){
89  if(arg[i] == '\''){
90  is_single_quoted = FALSE;
91  }else{
92  option[j++] = arg[i];
93  }
94  }else if(is_escaped){
95  is_escaped = FALSE;
96  option[j++] = arg[i];
97  }else if(arg[i] == '\\'){
98  is_escaped = TRUE;
99  }else if(is_double_quoted){
100  if(arg[i] == '"'){
101  is_double_quoted = FALSE;
102  }else{
103  option[j++] = arg[i];
104  }
105  }else if(arg[i] == '\''){
106  is_single_quoted = TRUE;
107  }else if(arg[i] == '"'){
108  is_double_quoted = TRUE;
109  }else if(arg[i] == ' '){
110  option[j] = '\0';
111  options(option);
112  j = 0;
113  }else{
114  option[j++] = arg[i];
115  }
116  }
117  if(j){
118  option[j] = '\0';
119  options(option);
120  }
121  return count;
122 }
123 
124 /*
125  * call-seq:
126  * NKF.nkf(opt, str) => string
127  *
128  * Convert _str_ and return converted result.
129  * Conversion details are specified by _opt_ as String.
130  *
131  * require 'nkf'
132  * output = NKF.nkf("-s", input)
133  */
134 
135 static VALUE
136 rb_nkf_convert(VALUE obj, VALUE opt, VALUE src)
137 {
138  VALUE tmp;
139  reinit();
141  if (!output_encoding) rb_raise(rb_eArgError, "no output encoding given");
142 
143  switch (nkf_enc_to_index(output_encoding)) {
144  case UTF_8_BOM: output_encoding = nkf_enc_from_index(UTF_8); break;
145  case UTF_16BE_BOM: output_encoding = nkf_enc_from_index(UTF_16BE); break;
146  case UTF_16LE_BOM: output_encoding = nkf_enc_from_index(UTF_16LE); break;
147  case UTF_32BE_BOM: output_encoding = nkf_enc_from_index(UTF_32BE); break;
148  case UTF_32LE_BOM: output_encoding = nkf_enc_from_index(UTF_32LE); break;
149  }
150  output_bom_f = FALSE;
151 
152  incsize = INCSIZE;
153 
154  input_ctr = 0;
155  input = (unsigned char *)StringValuePtr(src);
156  i_len = RSTRING_LENINT(src);
157  tmp = rb_str_new(0, i_len*3 + 10);
158 
159  output_ctr = 0;
160  output = (unsigned char *)RSTRING_PTR(tmp);
161  o_len = RSTRING_LENINT(tmp);
162  *output = '\0';
163 
164  /* use _result_ begin*/
165  result = tmp;
166  kanji_convert(NULL);
167  result = Qnil;
168  /* use _result_ end */
169 
170  rb_str_set_len(tmp, output_ctr);
171  OBJ_INFECT(tmp, src);
172 
173  if (mimeout_f)
175  else
176  rb_enc_associate(tmp, rb_nkf_enc_get(nkf_enc_name(output_encoding)));
177 
178  return tmp;
179 }
180 
181 
182 /*
183  * call-seq:
184  * NKF.guess(str) => encoding
185  *
186  * Returns guessed encoding of _str_ by nkf routine.
187  *
188  */
189 
190 static VALUE
191 rb_nkf_guess(VALUE obj, VALUE src)
192 {
193  reinit();
194 
195  input_ctr = 0;
196  input = (unsigned char *)StringValuePtr(src);
197  i_len = RSTRING_LENINT(src);
198 
199  guess_f = TRUE;
200  kanji_convert( NULL );
201  guess_f = FALSE;
202 
203  return rb_enc_from_encoding(rb_nkf_enc_get(get_guessed_code()));
204 }
205 
206 
207 /*
208  * NKF - Ruby extension for Network Kanji Filter
209  *
210  * == Description
211  *
212  * This is a Ruby Extension version of nkf (Network Kanji Filter).
213  * It converts the first argument and returns converted result. Conversion
214  * details are specified by flags as the first argument.
215  *
216  * *Nkf* is yet another kanji code converter for use among networks, hosts and terminals.
217  * It converts input kanji code to designated kanji code
218  * such as ISO-2022-JP, Shift_JIS, EUC-JP, UTF-8 or UTF-16.
219  *
220  * One of the most distinctive features of *nkf* is that it guesses the input kanji encoding.
221  * It currently recognizes ISO-2022-JP, Shift_JIS, EUC-JP, UTF-8 and UTF-16.
222  * So users needn't set the input kanji code explicitly.
223  *
224  * By default, X0201 kana is converted into X0208 kana.
225  * For X0201 kana, SO/SI, SSO and ESC-(-I methods are supported.
226  * For automatic code detection, nkf assumes no X0201 kana in Shift_JIS.
227  * To accept X0201 in Shift_JIS, use <b>-X</b>, <b>-x</b> or <b>-S</b>.
228  *
229  * == Flags
230  *
231  * === -b -u
232  *
233  * Output is buffered (DEFAULT), Output is unbuffered.
234  *
235  * === -j -s -e -w -w16 -w32
236  *
237  * Output code is ISO-2022-JP (7bit JIS), Shift_JIS, EUC-JP,
238  * UTF-8N, UTF-16BE, UTF-32BE.
239  * Without this option and compile option, ISO-2022-JP is assumed.
240  *
241  * === -J -S -E -W -W16 -W32
242  *
243  * Input assumption is JIS 7 bit, Shift_JIS, EUC-JP,
244  * UTF-8, UTF-16, UTF-32.
245  *
246  * ==== -J
247  *
248  * Assume JIS input. It also accepts EUC-JP.
249  * This is the default. This flag does not exclude Shift_JIS.
250  *
251  * ==== -S
252  *
253  * Assume Shift_JIS and X0201 kana input. It also accepts JIS.
254  * EUC-JP is recognized as X0201 kana. Without <b>-x</b> flag,
255  * X0201 kana (halfwidth kana) is converted into X0208.
256  *
257  * ==== -E
258  *
259  * Assume EUC-JP input. It also accepts JIS.
260  * Same as -J.
261  *
262  * === -t
263  *
264  * No conversion.
265  *
266  * === -i_
267  *
268  * Output sequence to designate JIS-kanji. (DEFAULT B)
269  *
270  * === -o_
271  *
272  * Output sequence to designate ASCII. (DEFAULT B)
273  *
274  * === -r
275  *
276  * {de/en}crypt ROT13/47
277  *
278  * === -h[123] --hiragana --katakana --katakana-hiragana
279  *
280  * [-h1 --hiragana] Katakana to Hiragana conversion.
281  *
282  * [-h2 --katakana] Hiragana to Katakana conversion.
283  *
284  * [-h3 --katakana-hiragana] Katakana to Hiragana and Hiragana to Katakana conversion.
285  *
286  * === -T
287  *
288  * Text mode output (MS-DOS)
289  *
290  * === -l
291  *
292  * ISO8859-1 (Latin-1) support
293  *
294  * === -f[<code>m</code> [- <code>n</code>]]
295  *
296  * Folding on <code>m</code> length with <code>n</code> margin in a line.
297  * Without this option, fold length is 60 and fold margin is 10.
298  *
299  * === -F
300  *
301  * New line preserving line folding.
302  *
303  * === -Z[0-3]
304  *
305  * Convert X0208 alphabet (Fullwidth Alphabets) to ASCII.
306  *
307  * [-Z -Z0] Convert X0208 alphabet to ASCII.
308  *
309  * [-Z1] Converts X0208 kankaku to single ASCII space.
310  *
311  * [-Z2] Converts X0208 kankaku to double ASCII spaces.
312  *
313  * [-Z3] Replacing Fullwidth >, <, ", & into '&gt;', '&lt;', '&quot;', '&amp;' as in HTML.
314  *
315  * === -X -x
316  *
317  * Assume X0201 kana in MS-Kanji.
318  * With <b>-X</b> or without this option, X0201 is converted into X0208 Kana.
319  * With <b>-x</b>, try to preserve X0208 kana and do not convert X0201 kana to X0208.
320  * In JIS output, ESC-(-I is used. In EUC output, SSO is used.
321  *
322  * === -B[0-2]
323  *
324  * Assume broken JIS-Kanji input, which lost ESC.
325  * Useful when your site is using old B-News Nihongo patch.
326  *
327  * [-B1] allows any char after ESC-( or ESC-$.
328  *
329  * [-B2] forces ASCII after NL.
330  *
331  * === -I
332  *
333  * Replacing non iso-2022-jp char into a geta character
334  * (substitute character in Japanese).
335  *
336  * === -d -c
337  *
338  * Delete \r in line feed, Add \r in line feed.
339  *
340  * === -m[BQN0]
341  *
342  * MIME ISO-2022-JP/ISO8859-1 decode. (DEFAULT)
343  * To see ISO8859-1 (Latin-1) -l is necessary.
344  *
345  * [-mB] Decode MIME base64 encoded stream. Remove header or other part before
346  * conversion.
347  *
348  * [-mQ] Decode MIME quoted stream. '_' in quoted stream is converted to space.
349  *
350  * [-mN] Non-strict decoding.
351  * It allows line break in the middle of the base64 encoding.
352  *
353  * [-m0] No MIME decode.
354  *
355  * === -M
356  *
357  * MIME encode. Header style. All ASCII code and control characters are intact.
358  * Kanji conversion is performed before encoding, so this cannot be used as a picture encoder.
359  *
360  * [-MB] MIME encode Base64 stream.
361  *
362  * [-MQ] Perform quoted encoding.
363  *
364  * === -l
365  *
366  * Input and output code is ISO8859-1 (Latin-1) and ISO-2022-JP.
367  * <b>-s</b>, <b>-e</b> and <b>-x</b> are not compatible with this option.
368  *
369  * === -L[uwm]
370  *
371  * new line mode
372  * Without this option, nkf doesn't convert line breaks.
373  *
374  * [-Lu] unix (LF)
375  *
376  * [-Lw] windows (CRLF)
377  *
378  * [-Lm] mac (CR)
379  *
380  * === --fj --unix --mac --msdos --windows
381  *
382  * Convert for these systems.
383  *
384  * === --jis --euc --sjis --mime --base64
385  *
386  * convert for named code
387  *
388  * === --jis-input --euc-input --sjis-input --mime-input --base64-input
389  *
390  * assume input system
391  *
392  * === --ic=<code>input codeset</code> --oc=<code>output codeset</code>
393  *
394  * Set the input or output codeset.
395  * NKF supports the following codesets; codeset names are case insensitive.
396  *
397  * [ISO-2022-JP] a.k.a. RFC1468, 7bit JIS, JUNET
398  *
399  * [EUC-JP (eucJP-nkf)] a.k.a. AT&T JIS, Japanese EUC, UJIS
400  *
401  * [eucJP-ascii] a.k.a. x-eucjp-open-19970715-ascii
402  *
403  * [eucJP-ms] a.k.a. x-eucjp-open-19970715-ms
404  *
405  * [CP51932] Microsoft Version of EUC-JP.
406  *
407  * [Shift_JIS] SJIS, MS-Kanji
408  *
409  * [Windows-31J] a.k.a. CP932
410  *
411  * [UTF-8] same as UTF-8N
412  *
413  * [UTF-8N] UTF-8 without BOM
414  *
415  * [UTF-8-BOM] UTF-8 with BOM
416  *
417  * [UTF-16] same as UTF-16BE
418  *
419  * [UTF-16BE] UTF-16 Big Endian without BOM
420  *
421  * [UTF-16BE-BOM] UTF-16 Big Endian with BOM
422  *
423  * [UTF-16LE] UTF-16 Little Endian without BOM
424  *
425  * [UTF-16LE-BOM] UTF-16 Little Endian with BOM
426  *
427  * [UTF-32] same as UTF-32BE
428  *
429  * [UTF-32BE] UTF-32 Big Endian without BOM
430  *
431  * [UTF-32BE-BOM] UTF-32 Big Endian with BOM
432  *
433  * [UTF-32LE] UTF-32 Little Endian without BOM
434  *
435  * [UTF-32LE-BOM] UTF-32 Little Endian with BOM
436  *
437  * [UTF8-MAC] NFDed UTF-8, a.k.a. UTF8-NFD (input only)
438  *
439  * === --fb-{skip, html, xml, perl, java, subchar}
440  *
441  * Specify the way that nkf handles unassigned characters.
442  * Without this option, --fb-skip is assumed.
443  *
444  * === --prefix= <code>escape character</code> <code>target character</code> ..
445  *
446  * When nkf converts to Shift_JIS,
447  * nkf adds a specified escape character to specified 2nd byte of Shift_JIS characters.
448  * 1st byte of argument is the escape character and following bytes are target characters.
449  *
450  * === --no-cp932ext
451  *
452  * Handle the characters extended in CP932 as unassigned characters.
453  *
454  * === --no-best-fit-chars
455  *
456  * When converting from Unicode to an encoded byte sequence,
457  * don't convert characters which are not round-trip safe.
458  * When converting from Unicode to Unicode,
459  * with this and the -x option, nkf can be used as a UTF converter.
460  * (In other words, without this and the -x option, nkf doesn't preserve some characters.)
461  *
462  * When nkf converts a string which is related to a path, you should use this option.
463  *
464  * === --cap-input
465  *
466  * Decode hex encoded characters.
467  *
468  * === --url-input
469  *
470  * Unescape percent escaped characters.
471  *
472  * === --
473  *
474  * Ignore rest of -option.
475  */
476 
477 void
478 Init_nkf(void)
479 {
480  VALUE mNKF = rb_define_module("NKF");
481 
482  rb_define_module_function(mNKF, "nkf", rb_nkf_convert, 2);
483  rb_define_module_function(mNKF, "guess", rb_nkf_guess, 1);
484  rb_define_alias(rb_singleton_class(mNKF), "guess", "guess");
485 
486  rb_define_const(mNKF, "AUTO", Qnil);
487  rb_define_const(mNKF, "NOCONV", Qnil);
488  rb_define_const(mNKF, "UNKNOWN", Qnil);
489  rb_define_const(mNKF, "BINARY", rb_enc_from_encoding(rb_nkf_enc_get("BINARY")));
490  rb_define_const(mNKF, "ASCII", rb_enc_from_encoding(rb_nkf_enc_get("US-ASCII")));
491  rb_define_const(mNKF, "JIS", rb_enc_from_encoding(rb_nkf_enc_get("ISO-2022-JP")));
492  rb_define_const(mNKF, "EUC", rb_enc_from_encoding(rb_nkf_enc_get("EUC-JP")));
493  rb_define_const(mNKF, "SJIS", rb_enc_from_encoding(rb_nkf_enc_get("Shift_JIS")));
495  rb_define_const(mNKF, "UTF16", rb_enc_from_encoding(rb_nkf_enc_get("UTF-16BE")));
496  rb_define_const(mNKF, "UTF32", rb_enc_from_encoding(rb_nkf_enc_get("UTF-32BE")));
497 
498  /* Full version string of nkf */
499  rb_define_const(mNKF, "VERSION", rb_str_new2(RUBY_NKF_VERSION));
500  /* Version of nkf */
501  rb_define_const(mNKF, "NKF_VERSION", rb_str_new2(NKF_VERSION));
502  /* Release date of nkf */
503  rb_define_const(mNKF, "NKF_RELEASE_DATE", rb_str_new2(NKF_RELEASE_DATE));
504 }
#define FALSE
Definition: nkf.h:174
rb_encoding * rb_nkf_enc_get(const char *name)
Definition: nkf.c:64
#define NKF_RELEASE_DATE
Definition: nkf.c:24
void Init_nkf(void)
Definition: nkf.c:478
void rb_raise(VALUE exc, const char *fmt,...)
Definition: error.c:2284
#define nkf_enc_name(enc)
Definition: nkf.c:758
VALUE rb_enc_from_encoding(rb_encoding *encoding)
Definition: encoding.c:117
Definition: nkf.c:115
void rb_str_set_len(VALUE, long)
Definition: string.c:2627
VALUE rb_enc_associate(VALUE obj, rb_encoding *enc)
Definition: encoding.c:854
rb_encoding * rb_utf8_encoding(void)
Definition: encoding.c:1320
#define RUBY_NKF_VERSION
Definition: nkf.c:11
VALUE rb_eArgError
Definition: error.c:802
VALUE rb_singleton_class(VALUE obj)
Returns the singleton class of obj.
Definition: class.c:1689
#define NKF_VERSION
Definition: nkf.c:23
unsigned int input
Definition: nkf.c:4312
void rb_define_const(VALUE, const char *, VALUE)
Definition: variable.c:2691
#define INCSIZE
Definition: nkf.c:24
#define rb_str_new2
Definition: intern.h:835
VALUE rb_str_resize(VALUE, long)
Definition: string.c:2644
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
Definition: class.c:1758
void rb_define_module_function(VALUE module, const char *name, VALUE(*func)(ANYARGS), int argc)
Defines a module function for module.
Definition: class.c:1731
#define TRUE
Definition: nkf.h:175
#define nkf_enc_to_base_encoding(enc)
Definition: nkf.c:760
rb_encoding * rb_usascii_encoding(void)
Definition: encoding.c:1335
#define nkf_enc_to_index(enc)
Definition: nkf.c:759
#define Qnil
Definition: ruby.h:438
int rb_define_dummy_encoding(const char *name)
Definition: encoding.c:466
unsigned long VALUE
Definition: ruby.h:85
#define StringValueCStr(v)
Definition: ruby.h:571
int count
Definition: nkf.c:5042
#define RSTRING_PTR(str)
Definition: ruby.h:975
#define OBJ_INFECT(x, s)
Definition: ruby.h:1302
Definition: nkf.c:110
Definition: nkf.c:113
const char * name
Definition: nkf.c:208
#define StringValuePtr(v)
Definition: ruby.h:570
int rb_enc_find_index(const char *name)
Definition: encoding.c:704
#define RSTRING_LENINT(str)
Definition: ruby.h:983
Definition: nkf.c:108
VALUE rb_define_module(const char *name)
Definition: class.c:768
int nkf_split_options(const char *arg)
Definition: nkf.c:77
#define NULL
Definition: _sdbm.c:102
Definition: nkf.c:118
rb_encoding * rb_enc_from_index(int index)
Definition: encoding.c:616
Definition: nkf.c:120
VALUE rb_str_new(const char *, long)
Definition: string.c:737