Ruby 2.5.0dev (2017-10-22, revision 60238)
transcode.c
Go to the documentation of this file.
1 /**********************************************************************
2 
3  transcode.c -
4 
5  $Author$
6  created at: Tue Oct 30 16:10:22 JST 2007
7 
8  Copyright (C) 2007 Martin Duerst
9 
10 **********************************************************************/
11 
12 #include "internal.h"
13 #include "transcode_data.h"
14 #include <ctype.h>
15 
16 #define ENABLE_ECONV_NEWLINE_OPTION 1
17 
18 /* VALUE rb_cEncoding = rb_define_class("Encoding", rb_cObject); */
22 
24 
25 static VALUE sym_invalid, sym_undef, sym_replace, sym_fallback, sym_aref;
26 static VALUE sym_xml, sym_text, sym_attr;
27 static VALUE sym_universal_newline;
28 static VALUE sym_crlf_newline;
29 static VALUE sym_cr_newline;
30 #ifdef ENABLE_ECONV_NEWLINE_OPTION
31 static VALUE sym_newline, sym_universal, sym_crlf, sym_cr, sym_lf;
32 #endif
33 static VALUE sym_partial_input;
34 
35 static VALUE sym_invalid_byte_sequence;
36 static VALUE sym_undefined_conversion;
37 static VALUE sym_destination_buffer_full;
38 static VALUE sym_source_buffer_empty;
39 static VALUE sym_finished;
40 static VALUE sym_after_output;
41 static VALUE sym_incomplete_input;
42 
43 static unsigned char *
44 allocate_converted_string(const char *sname, const char *dname,
45  const unsigned char *str, size_t len,
46  unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
47  size_t *dst_len_ptr);
48 
49 /* dynamic structure, one per conversion (similar to iconv_t) */
50 /* may carry conversion state (e.g. for iso-2022-jp) */
51 typedef struct rb_transcoding {
53 
54  int flags;
55 
57  unsigned int next_table;
59  unsigned char next_byte;
60  unsigned int output_index;
61 
62  ssize_t recognized_len; /* already interpreted */
63  ssize_t readagain_len; /* not yet interpreted */
64  union {
65  unsigned char ary[8]; /* max_input <= sizeof(ary) */
66  unsigned char *ptr; /* length: max_input */
67  } readbuf; /* recognized_len + readagain_len used */
68 
69  ssize_t writebuf_off;
70  ssize_t writebuf_len;
71  union {
72  unsigned char ary[8]; /* max_output <= sizeof(ary) */
73  unsigned char *ptr; /* length: max_output */
74  } writebuf;
75 
76  union rb_transcoding_state_t { /* opaque data for stateful encoding */
77  void *ptr;
78  char ary[sizeof(double) > sizeof(void*) ? sizeof(double) : sizeof(void*)];
80  } state;
82 #define TRANSCODING_READBUF(tc) \
83  ((tc)->transcoder->max_input <= (int)sizeof((tc)->readbuf.ary) ? \
84  (tc)->readbuf.ary : \
85  (tc)->readbuf.ptr)
86 #define TRANSCODING_WRITEBUF(tc) \
87  ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \
88  (tc)->writebuf.ary : \
89  (tc)->writebuf.ptr)
90 #define TRANSCODING_WRITEBUF_SIZE(tc) \
91  ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \
92  sizeof((tc)->writebuf.ary) : \
93  (size_t)(tc)->transcoder->max_output)
94 #define TRANSCODING_STATE_EMBED_MAX ((int)sizeof(union rb_transcoding_state_t))
95 #define TRANSCODING_STATE(tc) \
96  ((tc)->transcoder->state_size <= (int)sizeof((tc)->state) ? \
97  (tc)->state.ary : \
98  (tc)->state.ptr)
99 
100 typedef struct {
102  unsigned char *out_buf_start;
103  unsigned char *out_data_start;
104  unsigned char *out_data_end;
105  unsigned char *out_buf_end;
108 
109 struct rb_econv_t {
110  int flags;
111  int started; /* bool */
112 
113  const char *source_encoding_name;
115 
116  const unsigned char *replacement_str;
118  const char *replacement_enc;
119 
120  unsigned char *in_buf_start;
121  unsigned char *in_data_start;
122  unsigned char *in_data_end;
123  unsigned char *in_buf_end;
125  int replacement_allocated; /* bool */
130 
131  /* last error */
132  struct {
135  const char *source_encoding;
136  const char *destination_encoding;
137  const unsigned char *error_bytes_start;
140  } last_error;
141 
142  /* The following fields are only for Encoding::Converter.
143  * rb_econv_open set them NULL. */
146 };
147 
148 /*
149  * Dispatch data and logic
150  */
151 
152 #define DECORATOR_P(sname, dname) (*(sname) == '\0')
153 
154 typedef struct {
155  const char *sname;
156  const char *dname;
157  const char *lib; /* null means no need to load a library */
160 
161 static st_table *transcoder_table;
162 
163 static transcoder_entry_t *
164 make_transcoder_entry(const char *sname, const char *dname)
165 {
166  st_data_t val;
167  st_table *table2;
168 
169  if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
171  st_add_direct(transcoder_table, (st_data_t)sname, val);
172  }
173  table2 = (st_table *)val;
174  if (!st_lookup(table2, (st_data_t)dname, &val)) {
176  entry->sname = sname;
177  entry->dname = dname;
178  entry->lib = NULL;
179  entry->transcoder = NULL;
180  val = (st_data_t)entry;
181  st_add_direct(table2, (st_data_t)dname, val);
182  }
183  return (transcoder_entry_t *)val;
184 }
185 
186 static transcoder_entry_t *
187 get_transcoder_entry(const char *sname, const char *dname)
188 {
189  st_data_t val;
190  st_table *table2;
191 
192  if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
193  return NULL;
194  }
195  table2 = (st_table *)val;
196  if (!st_lookup(table2, (st_data_t)dname, &val)) {
197  return NULL;
198  }
199  return (transcoder_entry_t *)val;
200 }
201 
202 void
204 {
205  const char *const sname = tr->src_encoding;
206  const char *const dname = tr->dst_encoding;
207 
208  transcoder_entry_t *entry;
209 
210  entry = make_transcoder_entry(sname, dname);
211  if (entry->transcoder) {
212  rb_raise(rb_eArgError, "transcoder from %s to %s has been already registered",
213  sname, dname);
214  }
215 
216  entry->transcoder = tr;
217 }
218 
219 static void
220 declare_transcoder(const char *sname, const char *dname, const char *lib)
221 {
222  transcoder_entry_t *entry;
223 
224  entry = make_transcoder_entry(sname, dname);
225  entry->lib = lib;
226 }
227 
228 static const char transcoder_lib_prefix[] = "enc/trans/";
229 
230 void
231 rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib)
232 {
233  if (!lib) {
234  rb_raise(rb_eArgError, "invalid library name - (null)");
235  }
236  declare_transcoder(enc1, enc2, lib);
237 }
238 
239 #define encoding_equal(enc1, enc2) (STRCASECMP((enc1), (enc2)) == 0)
240 
241 typedef struct search_path_queue_tag {
243  const char *enc;
245 
246 typedef struct {
250  const char *base_enc;
252 
253 static int
254 transcode_search_path_i(st_data_t key, st_data_t val, st_data_t arg)
255 {
256  const char *dname = (const char *)key;
257  search_path_bfs_t *bfs = (search_path_bfs_t *)arg;
259 
260  if (st_lookup(bfs->visited, (st_data_t)dname, &val)) {
261  return ST_CONTINUE;
262  }
263 
265  q->enc = dname;
266  q->next = NULL;
267  *bfs->queue_last_ptr = q;
268  bfs->queue_last_ptr = &q->next;
269 
270  st_add_direct(bfs->visited, (st_data_t)dname, (st_data_t)bfs->base_enc);
271  return ST_CONTINUE;
272 }
273 
274 static int
275 transcode_search_path(const char *sname, const char *dname,
276  void (*callback)(const char *sname, const char *dname, int depth, void *arg),
277  void *arg)
278 {
279  search_path_bfs_t bfs;
281  st_data_t val;
282  st_table *table2;
283  int found;
284  int pathlen = -1;
285 
286  if (encoding_equal(sname, dname))
287  return -1;
288 
290  q->enc = sname;
291  q->next = NULL;
292  bfs.queue_last_ptr = &q->next;
293  bfs.queue = q;
294 
297 
298  while (bfs.queue) {
299  q = bfs.queue;
300  bfs.queue = q->next;
301  if (!bfs.queue)
302  bfs.queue_last_ptr = &bfs.queue;
303 
304  if (!st_lookup(transcoder_table, (st_data_t)q->enc, &val)) {
305  xfree(q);
306  continue;
307  }
308  table2 = (st_table *)val;
309 
310  if (st_lookup(table2, (st_data_t)dname, &val)) {
311  st_add_direct(bfs.visited, (st_data_t)dname, (st_data_t)q->enc);
312  xfree(q);
313  found = 1;
314  goto cleanup;
315  }
316 
317  bfs.base_enc = q->enc;
318  st_foreach(table2, transcode_search_path_i, (st_data_t)&bfs);
319  bfs.base_enc = NULL;
320 
321  xfree(q);
322  }
323  found = 0;
324 
325  cleanup:
326  while (bfs.queue) {
327  q = bfs.queue;
328  bfs.queue = q->next;
329  xfree(q);
330  }
331 
332  if (found) {
333  const char *enc = dname;
334  int depth;
335  pathlen = 0;
336  while (1) {
337  st_lookup(bfs.visited, (st_data_t)enc, &val);
338  if (!val)
339  break;
340  pathlen++;
341  enc = (const char *)val;
342  }
343  depth = pathlen;
344  enc = dname;
345  while (1) {
346  st_lookup(bfs.visited, (st_data_t)enc, &val);
347  if (!val)
348  break;
349  callback((const char *)val, enc, --depth, arg);
350  enc = (const char *)val;
351  }
352  }
353 
354  st_free_table(bfs.visited);
355 
356  return pathlen; /* is -1 if not found */
357 }
358 
359 static const rb_transcoder *
360 load_transcoder_entry(transcoder_entry_t *entry)
361 {
362  if (entry->transcoder)
363  return entry->transcoder;
364 
365  if (entry->lib) {
366  const char *const lib = entry->lib;
367  const size_t len = strlen(lib);
368  const size_t total_len = sizeof(transcoder_lib_prefix) - 1 + len;
369  const VALUE fn = rb_str_new(0, total_len);
370  char *const path = RSTRING_PTR(fn);
371  const int safe = rb_safe_level();
372 
373  memcpy(path, transcoder_lib_prefix, sizeof(transcoder_lib_prefix) - 1);
374  memcpy(path + sizeof(transcoder_lib_prefix) - 1, lib, len);
375  rb_str_set_len(fn, total_len);
376  FL_UNSET(fn, FL_TAINT);
377  OBJ_FREEZE(fn);
378  rb_require_safe(fn, safe > 3 ? 3 : safe);
379  }
380 
381  if (entry->transcoder)
382  return entry->transcoder;
383 
384  return NULL;
385 }
386 
/*
 * Pick the default replacement character for the encoding `encname`.
 *
 * For "UTF-8" (compared with encoding_equal) the 3-byte sequence
 * EF BF BD is returned and reported as UTF-8; for every other encoding the
 * 1-byte "?" is returned and reported as US-ASCII.  The byte length is
 * stored in *len_ret and the encoding name of the replacement in
 * *repl_encname_ptr.
 */
static const char*
get_replacement_character(const char *encname, size_t *len_ret, const char **repl_encname_ptr)
{
    const int is_utf8 = encoding_equal(encname, "UTF-8");

    *len_ret = is_utf8 ? 3 : 1;
    *repl_encname_ptr = is_utf8 ? "UTF-8" : "US-ASCII";
    return is_utf8 ? "\xEF\xBF\xBD" : "?";
}
401 
402 /*
403  * Transcoding engine logic
404  */
405 
406 static const unsigned char *
407 transcode_char_start(rb_transcoding *tc,
408  const unsigned char *in_start,
409  const unsigned char *inchar_start,
410  const unsigned char *in_p,
411  size_t *char_len_ptr)
412 {
413  const unsigned char *ptr;
414  if (inchar_start - in_start < tc->recognized_len) {
416  inchar_start, unsigned char, in_p - inchar_start);
417  ptr = TRANSCODING_READBUF(tc);
418  }
419  else {
420  ptr = inchar_start - tc->recognized_len;
421  }
422  *char_len_ptr = tc->recognized_len + (in_p - inchar_start);
423  return ptr;
424 }
425 
426 static rb_econv_result_t
427 transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos,
428  const unsigned char *in_stop, unsigned char *out_stop,
429  rb_transcoding *tc,
430  const int opt)
431 {
432  const rb_transcoder *tr = tc->transcoder;
433  int unitlen = tr->input_unit_length;
434  ssize_t readagain_len = 0;
435 
436  const unsigned char *inchar_start;
437  const unsigned char *in_p;
438 
439  unsigned char *out_p;
440 
441  in_p = inchar_start = *in_pos;
442 
443  out_p = *out_pos;
444 
445 #define SUSPEND(ret, num) \
446  do { \
447  tc->resume_position = (num); \
448  if (0 < in_p - inchar_start) \
449  MEMMOVE(TRANSCODING_READBUF(tc)+tc->recognized_len, \
450  inchar_start, unsigned char, in_p - inchar_start); \
451  *in_pos = in_p; \
452  *out_pos = out_p; \
453  tc->recognized_len += in_p - inchar_start; \
454  if (readagain_len) { \
455  tc->recognized_len -= readagain_len; \
456  tc->readagain_len = readagain_len; \
457  } \
458  return (ret); \
459  resume_label ## num:; \
460  } while (0)
461 #define SUSPEND_OBUF(num) \
462  do { \
463  while (out_stop - out_p < 1) { SUSPEND(econv_destination_buffer_full, num); } \
464  } while (0)
465 
466 #define SUSPEND_AFTER_OUTPUT(num) \
467  if ((opt & ECONV_AFTER_OUTPUT) && *out_pos != out_p) { \
468  SUSPEND(econv_after_output, num); \
469  }
470 
471 #define next_table (tc->next_table)
472 #define next_info (tc->next_info)
473 #define next_byte (tc->next_byte)
474 #define writebuf_len (tc->writebuf_len)
475 #define writebuf_off (tc->writebuf_off)
476 
477  switch (tc->resume_position) {
478  case 0: break;
479  case 1: goto resume_label1;
480  case 2: goto resume_label2;
481  case 3: goto resume_label3;
482  case 4: goto resume_label4;
483  case 5: goto resume_label5;
484  case 6: goto resume_label6;
485  case 7: goto resume_label7;
486  case 8: goto resume_label8;
487  case 9: goto resume_label9;
488  case 10: goto resume_label10;
489  case 11: goto resume_label11;
490  case 12: goto resume_label12;
491  case 13: goto resume_label13;
492  case 14: goto resume_label14;
493  case 15: goto resume_label15;
494  case 16: goto resume_label16;
495  case 17: goto resume_label17;
496  case 18: goto resume_label18;
497  case 19: goto resume_label19;
498  case 20: goto resume_label20;
499  case 21: goto resume_label21;
500  case 22: goto resume_label22;
501  case 23: goto resume_label23;
502  case 24: goto resume_label24;
503  case 25: goto resume_label25;
504  case 26: goto resume_label26;
505  case 27: goto resume_label27;
506  case 28: goto resume_label28;
507  case 29: goto resume_label29;
508  case 30: goto resume_label30;
509  case 31: goto resume_label31;
510  case 32: goto resume_label32;
511  case 33: goto resume_label33;
512  case 34: goto resume_label34;
513  }
514 
515  while (1) {
516  inchar_start = in_p;
517  tc->recognized_len = 0;
518  next_table = tr->conv_tree_start;
519 
521 
522  if (in_stop <= in_p) {
523  if (!(opt & ECONV_PARTIAL_INPUT))
524  break;
526  continue;
527  }
528 
529 #define BYTE_ADDR(index) (tr->byte_array + (index))
530 #define WORD_ADDR(index) (tr->word_array + INFO2WORDINDEX(index))
531 #define BL_BASE BYTE_ADDR(BYTE_LOOKUP_BASE(WORD_ADDR(next_table)))
532 #define BL_INFO WORD_ADDR(BYTE_LOOKUP_INFO(WORD_ADDR(next_table)))
533 #define BL_MIN_BYTE (BL_BASE[0])
534 #define BL_MAX_BYTE (BL_BASE[1])
535 #define BL_OFFSET(byte) (BL_BASE[2+(byte)-BL_MIN_BYTE])
536 #define BL_ACTION(byte) (BL_INFO[BL_OFFSET((byte))])
537 
538  next_byte = (unsigned char)*in_p++;
539  follow_byte:
540  if (next_byte < BL_MIN_BYTE || BL_MAX_BYTE < next_byte)
541  next_info = INVALID;
542  else {
543  next_info = (VALUE)BL_ACTION(next_byte);
544  }
545  follow_info:
546  switch (next_info & 0x1F) {
547  case NOMAP:
548  {
549  const unsigned char *p = inchar_start;
550  writebuf_off = 0;
551  while (p < in_p) {
552  TRANSCODING_WRITEBUF(tc)[writebuf_off++] = (unsigned char)*p++;
553  }
554  writebuf_len = writebuf_off;
555  writebuf_off = 0;
556  while (writebuf_off < writebuf_len) {
557  SUSPEND_OBUF(3);
558  *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
559  }
560  }
561  continue;
562  case 0x00: case 0x04: case 0x08: case 0x0C:
563  case 0x10: case 0x14: case 0x18: case 0x1C:
565  while (in_p >= in_stop) {
566  if (!(opt & ECONV_PARTIAL_INPUT))
567  goto incomplete;
569  }
570  next_byte = (unsigned char)*in_p++;
571  next_table = (unsigned int)next_info;
572  goto follow_byte;
573  case ZERObt: /* drop input */
574  continue;
575  case ONEbt:
576  SUSPEND_OBUF(9); *out_p++ = getBT1(next_info);
577  continue;
578  case TWObt:
579  SUSPEND_OBUF(10); *out_p++ = getBT1(next_info);
580  SUSPEND_OBUF(21); *out_p++ = getBT2(next_info);
581  continue;
582  case THREEbt:
583  SUSPEND_OBUF(11); *out_p++ = getBT1(next_info);
584  SUSPEND_OBUF(15); *out_p++ = getBT2(next_info);
585  SUSPEND_OBUF(16); *out_p++ = getBT3(next_info);
586  continue;
587  case FOURbt:
588  SUSPEND_OBUF(12); *out_p++ = getBT0(next_info);
589  SUSPEND_OBUF(17); *out_p++ = getBT1(next_info);
590  SUSPEND_OBUF(18); *out_p++ = getBT2(next_info);
591  SUSPEND_OBUF(19); *out_p++ = getBT3(next_info);
592  continue;
593  case GB4bt:
594  SUSPEND_OBUF(29); *out_p++ = getGB4bt0(next_info);
595  SUSPEND_OBUF(30); *out_p++ = getGB4bt1(next_info);
596  SUSPEND_OBUF(31); *out_p++ = getGB4bt2(next_info);
597  SUSPEND_OBUF(32); *out_p++ = getGB4bt3(next_info);
598  continue;
599  case STR1:
600  tc->output_index = 0;
601  while (tc->output_index < STR1_LENGTH(BYTE_ADDR(STR1_BYTEINDEX(next_info)))) {
602  SUSPEND_OBUF(28); *out_p++ = BYTE_ADDR(STR1_BYTEINDEX(next_info))[1+tc->output_index];
603  tc->output_index++;
604  }
605  continue;
606  case FUNii:
607  next_info = (VALUE)(*tr->func_ii)(TRANSCODING_STATE(tc), next_info);
608  goto follow_info;
609  case FUNsi:
610  {
611  const unsigned char *char_start;
612  size_t char_len;
613  char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
614  next_info = (VALUE)(*tr->func_si)(TRANSCODING_STATE(tc), char_start, (size_t)char_len);
615  goto follow_info;
616  }
617  case FUNio:
618  SUSPEND_OBUF(13);
619  if (tr->max_output <= out_stop - out_p)
620  out_p += tr->func_io(TRANSCODING_STATE(tc),
621  next_info, out_p, out_stop - out_p);
622  else {
623  writebuf_len = tr->func_io(TRANSCODING_STATE(tc),
624  next_info,
626  writebuf_off = 0;
627  while (writebuf_off < writebuf_len) {
628  SUSPEND_OBUF(20);
629  *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
630  }
631  }
632  break;
633  case FUNso:
634  {
635  const unsigned char *char_start;
636  size_t char_len;
637  SUSPEND_OBUF(14);
638  if (tr->max_output <= out_stop - out_p) {
639  char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
640  out_p += tr->func_so(TRANSCODING_STATE(tc),
641  char_start, (size_t)char_len,
642  out_p, out_stop - out_p);
643  }
644  else {
645  char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
646  writebuf_len = tr->func_so(TRANSCODING_STATE(tc),
647  char_start, (size_t)char_len,
649  writebuf_off = 0;
650  while (writebuf_off < writebuf_len) {
651  SUSPEND_OBUF(22);
652  *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
653  }
654  }
655  break;
656  }
657  case FUNsio:
658  {
659  const unsigned char *char_start;
660  size_t char_len;
661  SUSPEND_OBUF(33);
662  if (tr->max_output <= out_stop - out_p) {
663  char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
664  out_p += tr->func_sio(TRANSCODING_STATE(tc),
665  char_start, (size_t)char_len, next_info,
666  out_p, out_stop - out_p);
667  }
668  else {
669  char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
670  writebuf_len = tr->func_sio(TRANSCODING_STATE(tc),
671  char_start, (size_t)char_len, next_info,
673  writebuf_off = 0;
674  while (writebuf_off < writebuf_len) {
675  SUSPEND_OBUF(34);
676  *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
677  }
678  }
679  break;
680  }
681  case INVALID:
682  if (tc->recognized_len + (in_p - inchar_start) <= unitlen) {
683  if (tc->recognized_len + (in_p - inchar_start) < unitlen)
685  while ((opt & ECONV_PARTIAL_INPUT) && tc->recognized_len + (in_stop - inchar_start) < unitlen) {
686  in_p = in_stop;
688  }
689  if (tc->recognized_len + (in_stop - inchar_start) <= unitlen) {
690  in_p = in_stop;
691  }
692  else {
693  in_p = inchar_start + (unitlen - tc->recognized_len);
694  }
695  }
696  else {
697  ssize_t invalid_len; /* including the last byte which causes invalid */
698  ssize_t discard_len;
699  invalid_len = tc->recognized_len + (in_p - inchar_start);
700  discard_len = ((invalid_len - 1) / unitlen) * unitlen;
701  readagain_len = invalid_len - discard_len;
702  }
703  goto invalid;
704  case UNDEF:
705  goto undef;
706  default:
707  rb_raise(rb_eRuntimeError, "unknown transcoding instruction");
708  }
709  continue;
710 
711  invalid:
713  continue;
714 
715  incomplete:
717  continue;
718 
719  undef:
721  continue;
722  }
723 
724  /* cleanup */
725  if (tr->finish_func) {
726  SUSPEND_OBUF(4);
727  if (tr->max_output <= out_stop - out_p) {
728  out_p += tr->finish_func(TRANSCODING_STATE(tc),
729  out_p, out_stop - out_p);
730  }
731  else {
732  writebuf_len = tr->finish_func(TRANSCODING_STATE(tc),
734  writebuf_off = 0;
735  while (writebuf_off < writebuf_len) {
736  SUSPEND_OBUF(23);
737  *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
738  }
739  }
740  }
741  while (1)
743 #undef SUSPEND
744 #undef next_table
745 #undef next_info
746 #undef next_byte
747 #undef writebuf_len
748 #undef writebuf_off
749 }
750 
751 static rb_econv_result_t
752 transcode_restartable(const unsigned char **in_pos, unsigned char **out_pos,
753  const unsigned char *in_stop, unsigned char *out_stop,
754  rb_transcoding *tc,
755  const int opt)
756 {
757  if (tc->readagain_len) {
758  unsigned char *readagain_buf = ALLOCA_N(unsigned char, tc->readagain_len);
759  const unsigned char *readagain_pos = readagain_buf;
760  const unsigned char *readagain_stop = readagain_buf + tc->readagain_len;
761  rb_econv_result_t res;
762 
763  MEMCPY(readagain_buf, TRANSCODING_READBUF(tc) + tc->recognized_len,
764  unsigned char, tc->readagain_len);
765  tc->readagain_len = 0;
766  res = transcode_restartable0(&readagain_pos, out_pos, readagain_stop, out_stop, tc, opt|ECONV_PARTIAL_INPUT);
767  if (res != econv_source_buffer_empty) {
769  readagain_pos, unsigned char, readagain_stop - readagain_pos);
770  tc->readagain_len += readagain_stop - readagain_pos;
771  return res;
772  }
773  }
774  return transcode_restartable0(in_pos, out_pos, in_stop, out_stop, tc, opt);
775 }
776 
777 static rb_transcoding *
778 rb_transcoding_open_by_transcoder(const rb_transcoder *tr, int flags)
779 {
780  rb_transcoding *tc;
781 
782  tc = ALLOC(rb_transcoding);
783  tc->transcoder = tr;
784  tc->flags = flags;
785  if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
786  tc->state.ptr = xmalloc(tr->state_size);
787  if (tr->state_init_func) {
788  (tr->state_init_func)(TRANSCODING_STATE(tc)); /* xxx: check return value */
789  }
790  tc->resume_position = 0;
791  tc->recognized_len = 0;
792  tc->readagain_len = 0;
793  tc->writebuf_len = 0;
794  tc->writebuf_off = 0;
795  if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
796  tc->readbuf.ptr = xmalloc(tr->max_input);
797  }
798  if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
799  tc->writebuf.ptr = xmalloc(tr->max_output);
800  }
801  return tc;
802 }
803 
804 static rb_econv_result_t
805 rb_transcoding_convert(rb_transcoding *tc,
806  const unsigned char **input_ptr, const unsigned char *input_stop,
807  unsigned char **output_ptr, unsigned char *output_stop,
808  int flags)
809 {
810  return transcode_restartable(
811  input_ptr, output_ptr,
812  input_stop, output_stop,
813  tc, flags);
814 }
815 
816 static void
817 rb_transcoding_close(rb_transcoding *tc)
818 {
819  const rb_transcoder *tr = tc->transcoder;
820  if (tr->state_fini_func) {
821  (tr->state_fini_func)(TRANSCODING_STATE(tc)); /* check return value? */
822  }
823  if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
824  xfree(tc->state.ptr);
825  if ((int)sizeof(tc->readbuf.ary) < tr->max_input)
826  xfree(tc->readbuf.ptr);
827  if ((int)sizeof(tc->writebuf.ary) < tr->max_output)
828  xfree(tc->writebuf.ptr);
829  xfree(tc);
830 }
831 
832 static size_t
833 rb_transcoding_memsize(rb_transcoding *tc)
834 {
835  size_t size = sizeof(rb_transcoding);
836  const rb_transcoder *tr = tc->transcoder;
837 
838  if (TRANSCODING_STATE_EMBED_MAX < tr->state_size) {
839  size += tr->state_size;
840  }
841  if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
842  size += tr->max_input;
843  }
844  if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
845  size += tr->max_output;
846  }
847  return size;
848 }
849 
850 static rb_econv_t *
851 rb_econv_alloc(int n_hint)
852 {
853  rb_econv_t *ec;
854 
855  if (n_hint <= 0)
856  n_hint = 1;
857 
858  ec = ALLOC(rb_econv_t);
859  ec->flags = 0;
862  ec->started = 0;
863  ec->replacement_str = NULL;
864  ec->replacement_len = 0;
865  ec->replacement_enc = NULL;
866  ec->replacement_allocated = 0;
867  ec->in_buf_start = NULL;
868  ec->in_data_start = NULL;
869  ec->in_data_end = NULL;
870  ec->in_buf_end = NULL;
871  ec->num_allocated = n_hint;
872  ec->num_trans = 0;
874  ec->num_finished = 0;
875  ec->last_tc = NULL;
877  ec->last_error.error_tc = NULL;
881  ec->last_error.error_bytes_len = 0;
882  ec->last_error.readagain_len = 0;
883  ec->source_encoding = NULL;
885  return ec;
886 }
887 
888 static int
889 rb_econv_add_transcoder_at(rb_econv_t *ec, const rb_transcoder *tr, int i)
890 {
891  int n, j;
892  int bufsize = 4096;
893  unsigned char *p;
894 
895  if (ec->num_trans == ec->num_allocated) {
896  n = ec->num_allocated * 2;
898  ec->num_allocated = n;
899  }
900 
901  p = xmalloc(bufsize);
902 
903  MEMMOVE(ec->elems+i+1, ec->elems+i, rb_econv_elem_t, ec->num_trans-i);
904 
905  ec->elems[i].tc = rb_transcoding_open_by_transcoder(tr, 0);
906  ec->elems[i].out_buf_start = p;
907  ec->elems[i].out_buf_end = p + bufsize;
908  ec->elems[i].out_data_start = p;
909  ec->elems[i].out_data_end = p;
911 
912  ec->num_trans++;
913 
914  if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
915  for (j = ec->num_trans-1; i <= j; j--) {
916  rb_transcoding *tc = ec->elems[j].tc;
917  const rb_transcoder *tr2 = tc->transcoder;
918  if (!DECORATOR_P(tr2->src_encoding, tr2->dst_encoding)) {
919  ec->last_tc = tc;
920  break;
921  }
922  }
923 
924  return 0;
925 }
926 
927 static rb_econv_t *
928 rb_econv_open_by_transcoder_entries(int n, transcoder_entry_t **entries)
929 {
930  rb_econv_t *ec;
931  int i, ret;
932 
933  for (i = 0; i < n; i++) {
934  const rb_transcoder *tr;
935  tr = load_transcoder_entry(entries[i]);
936  if (!tr)
937  return NULL;
938  }
939 
940  ec = rb_econv_alloc(n);
941 
942  for (i = 0; i < n; i++) {
943  const rb_transcoder *tr = load_transcoder_entry(entries[i]);
944  ret = rb_econv_add_transcoder_at(ec, tr, ec->num_trans);
945  if (ret == -1) {
946  rb_econv_close(ec);
947  return NULL;
948  }
949  }
950 
951  return ec;
952 }
953 
954 struct trans_open_t {
957 };
958 
959 static void
960 trans_open_i(const char *sname, const char *dname, int depth, void *arg)
961 {
962  struct trans_open_t *toarg = arg;
963 
964  if (!toarg->entries) {
965  toarg->entries = ALLOC_N(transcoder_entry_t *, depth+1+toarg->num_additional);
966  }
967  toarg->entries[depth] = get_transcoder_entry(sname, dname);
968 }
969 
970 static rb_econv_t *
971 rb_econv_open0(const char *sname, const char *dname, int ecflags)
972 {
973  transcoder_entry_t **entries = NULL;
974  int num_trans;
975  rb_econv_t *ec;
976 
977  int sidx, didx;
978 
979  if (*sname) {
980  sidx = rb_enc_find_index(sname);
981  if (0 <= sidx) {
982  rb_enc_from_index(sidx);
983  }
984  }
985 
986  if (*dname) {
987  didx = rb_enc_find_index(dname);
988  if (0 <= didx) {
989  rb_enc_from_index(didx);
990  }
991  }
992 
993  if (*sname == '\0' && *dname == '\0') {
994  num_trans = 0;
995  entries = NULL;
996  sname = dname = "";
997  }
998  else {
999  struct trans_open_t toarg;
1000  toarg.entries = NULL;
1001  toarg.num_additional = 0;
1002  num_trans = transcode_search_path(sname, dname, trans_open_i, (void *)&toarg);
1003  entries = toarg.entries;
1004  if (num_trans < 0) {
1005  xfree(entries);
1006  return NULL;
1007  }
1008  }
1009 
1010  ec = rb_econv_open_by_transcoder_entries(num_trans, entries);
1011  xfree(entries);
1012  if (!ec)
1013  return NULL;
1014 
1015  ec->flags = ecflags;
1016  ec->source_encoding_name = sname;
1017  ec->destination_encoding_name = dname;
1018 
1019  return ec;
1020 }
1021 
1022 #define MAX_ECFLAGS_DECORATORS 32
1023 
1024 static int
1025 decorator_names(int ecflags, const char **decorators_ret)
1026 {
1027  int num_decorators;
1028 
1029  switch (ecflags & ECONV_NEWLINE_DECORATOR_MASK) {
1033  case 0:
1034  break;
1035  default:
1036  return -1;
1037  }
1038 
1039  if ((ecflags & ECONV_XML_TEXT_DECORATOR) &&
1041  return -1;
1042 
1043  num_decorators = 0;
1044 
1045  if (ecflags & ECONV_XML_TEXT_DECORATOR)
1046  decorators_ret[num_decorators++] = "xml_text_escape";
1047  if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR)
1048  decorators_ret[num_decorators++] = "xml_attr_content_escape";
1049  if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR)
1050  decorators_ret[num_decorators++] = "xml_attr_quote";
1051 
1052  if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR)
1053  decorators_ret[num_decorators++] = "crlf_newline";
1054  if (ecflags & ECONV_CR_NEWLINE_DECORATOR)
1055  decorators_ret[num_decorators++] = "cr_newline";
1056  if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR)
1057  decorators_ret[num_decorators++] = "universal_newline";
1058 
1059  return num_decorators;
1060 }
1061 
1062 rb_econv_t *
1063 rb_econv_open(const char *sname, const char *dname, int ecflags)
1064 {
1065  rb_econv_t *ec;
1066  int num_decorators;
1067  const char *decorators[MAX_ECFLAGS_DECORATORS];
1068  int i;
1069 
1070  num_decorators = decorator_names(ecflags, decorators);
1071  if (num_decorators == -1)
1072  return NULL;
1073 
1074  ec = rb_econv_open0(sname, dname, ecflags & ECONV_ERROR_HANDLER_MASK);
1075  if (!ec)
1076  return NULL;
1077 
1078  for (i = 0; i < num_decorators; i++)
1079  if (rb_econv_decorate_at_last(ec, decorators[i]) == -1) {
1080  rb_econv_close(ec);
1081  return NULL;
1082  }
1083 
1084  ec->flags |= ecflags & ~ECONV_ERROR_HANDLER_MASK;
1085 
1086  return ec;
1087 }
1088 
1089 static int
1090 trans_sweep(rb_econv_t *ec,
1091  const unsigned char **input_ptr, const unsigned char *input_stop,
1092  unsigned char **output_ptr, unsigned char *output_stop,
1093  int flags,
1094  int start)
1095 {
1096  int try;
1097  int i, f;
1098 
1099  const unsigned char **ipp, *is, *iold;
1100  unsigned char **opp, *os, *oold;
1101  rb_econv_result_t res;
1102 
1103  try = 1;
1104  while (try) {
1105  try = 0;
1106  for (i = start; i < ec->num_trans; i++) {
1107  rb_econv_elem_t *te = &ec->elems[i];
1108 
1109  if (i == 0) {
1110  ipp = input_ptr;
1111  is = input_stop;
1112  }
1113  else {
1114  rb_econv_elem_t *prev_te = &ec->elems[i-1];
1115  ipp = (const unsigned char **)&prev_te->out_data_start;
1116  is = prev_te->out_data_end;
1117  }
1118 
1119  if (i == ec->num_trans-1) {
1120  opp = output_ptr;
1121  os = output_stop;
1122  }
1123  else {
1124  if (te->out_buf_start != te->out_data_start) {
1125  ssize_t len = te->out_data_end - te->out_data_start;
1126  ssize_t off = te->out_data_start - te->out_buf_start;
1127  MEMMOVE(te->out_buf_start, te->out_data_start, unsigned char, len);
1128  te->out_data_start = te->out_buf_start;
1129  te->out_data_end -= off;
1130  }
1131  opp = &te->out_data_end;
1132  os = te->out_buf_end;
1133  }
1134 
1135  f = flags;
1136  if (ec->num_finished != i)
1137  f |= ECONV_PARTIAL_INPUT;
1138  if (i == 0 && (flags & ECONV_AFTER_OUTPUT)) {
1139  start = 1;
1140  flags &= ~ECONV_AFTER_OUTPUT;
1141  }
1142  if (i != 0)
1143  f &= ~ECONV_AFTER_OUTPUT;
1144  iold = *ipp;
1145  oold = *opp;
1146  te->last_result = res = rb_transcoding_convert(te->tc, ipp, is, opp, os, f);
1147  if (iold != *ipp || oold != *opp)
1148  try = 1;
1149 
1150  switch (res) {
1154  case econv_after_output:
1155  return i;
1156 
1159  break;
1160 
1161  case econv_finished:
1162  ec->num_finished = i+1;
1163  break;
1164  }
1165  }
1166  }
1167  return -1;
1168 }
1169 
/*
 * Drive the converter chain of `ec` by calling trans_sweep() repeatedly and
 * then report the last_result of one chain element.
 *
 * input_ptr/input_stop   - source window; a NULL input_ptr is replaced by an
 *                          empty dummy window.
 * output_ptr/output_stop - destination window; a NULL output_ptr is replaced
 *                          by an empty dummy window.
 * flags                  - ECONV_* flags handed on to trans_sweep().
 * result_position_ptr    - when non-NULL, receives the index of the reported
 *                          element, or -1 when the final loop reports none.
 *
 * NOTE(review): this extract omits listing lines 1195, 1200-1202, 1209-1210,
 * 1220, 1243, 1249 and 1258, so several branches below are incomplete as
 * shown; confirm against the full file before relying on these notes.
 */
1170 static rb_econv_result_t
1171 rb_trans_conv(rb_econv_t *ec,
1172  const unsigned char **input_ptr, const unsigned char *input_stop,
1173  unsigned char **output_ptr, unsigned char *output_stop,
1174  int flags,
1175  int *result_position_ptr)
1176 {
1177  int i;
1178  int needreport_index;
1179  int sweep_start;
1180 
1181  unsigned char empty_buf;
1182  unsigned char *empty_ptr = &empty_buf;
1183 
/* Substitute a zero-length window so the code below never sees NULL. */
1184  if (!input_ptr) {
1185  input_ptr = (const unsigned char **)&empty_ptr;
1186  input_stop = empty_ptr;
1187  }
1188 
1189  if (!output_ptr) {
1190  output_ptr = &empty_ptr;
1191  output_stop = empty_ptr;
1192  }
1193 
/* NOTE(review): the statement guarded by this `if` (line 1195) is missing. */
1194  if (ec->elems[0].last_result == econv_after_output)
1196 
/*
 * Scan from the last element backwards. An element whose previous result was
 * after_output or finished becomes the report candidate, and sweeping
 * restarts at the element right after it.
 */
1197  needreport_index = -1;
1198  for (i = ec->num_trans-1; 0 <= i; i--) {
1199  switch (ec->elems[i].last_result) {
1203  case econv_after_output:
1204  case econv_finished:
1205  sweep_start = i+1;
1206  needreport_index = i;
1207  goto found_needreport;
1208 
1211  break;
1212 
1213  default:
1214  rb_bug("unexpected transcode last result");
1215  }
1216  }
1217 
1218  /* /^[sd]+$/ is confirmed. but actually /^s*d*$/. */
1219 
/*
 * NOTE(review): the first half of this condition (line 1220) is missing.
 * The visible body re-enters with no input, ECONV_AFTER_OUTPUT cleared and
 * ECONV_PARTIAL_INPUT set; an empty-source result from that call is reported
 * to the caller as after_output.
 */
1221  (flags & ECONV_AFTER_OUTPUT)) {
1222  rb_econv_result_t res;
1223 
1224  res = rb_trans_conv(ec, NULL, NULL, output_ptr, output_stop,
1225  (flags & ~ECONV_AFTER_OUTPUT)|ECONV_PARTIAL_INPUT,
1226  result_position_ptr);
1227 
1228  if (res == econv_source_buffer_empty)
1229  return econv_after_output;
1230  return res;
1231  }
1232 
1233  sweep_start = 0;
1234 
1235  found_needreport:
1236 
/* Sweep until trans_sweep() returns -1 or the index of the last element. */
1237  do {
1238  needreport_index = trans_sweep(ec, input_ptr, input_stop, output_ptr, output_stop, flags, sweep_start);
1239  sweep_start = needreport_index + 1;
1240  } while (needreport_index != -1 && needreport_index != ec->num_trans-1);
1241 
/*
 * Walk the elements from last to first and return a stored last_result
 * together with its index.
 * NOTE(review): lines 1243 and 1249 are missing, so the exact selection
 * condition and the body of the inner `if` cannot be read from this extract.
 */
1242  for (i = ec->num_trans-1; 0 <= i; i--) {
1244  rb_econv_result_t res = ec->elems[i].last_result;
1245  if (res == econv_invalid_byte_sequence ||
1246  res == econv_incomplete_input ||
1247  res == econv_undefined_conversion ||
1248  res == econv_after_output) {
1250  }
1251  if (result_position_ptr)
1252  *result_position_ptr = i;
1253  return res;
1254  }
1255  }
1256  if (result_position_ptr)
1257  *result_position_ptr = -1;
/* NOTE(review): the final return statement (line 1258) is missing here. */
1259 }
1260 
/*
 * Perform one conversion step on `ec` and record the outcome in
 * ec->last_error.
 *
 * With no transcoders (num_trans == 0) bytes are copied straight through:
 * first anything buffered in ec->in_data_*, then the caller's input.
 * Otherwise pending output of the last element is flushed, buffered input is
 * fed through rb_trans_conv(), and finally the caller's input is converted.
 *
 * Returns the rb_econv_result_t of the step.
 *
 * NOTE(review): this extract omits listing lines 1281, 1308, 1310, 1326 and
 * 1373-1375; the assignments to `res` on the "output too small" and
 * "input left / partial input" paths are among the missing lines.
 */
1261 static rb_econv_result_t
1262 rb_econv_convert0(rb_econv_t *ec,
1263  const unsigned char **input_ptr, const unsigned char *input_stop,
1264  unsigned char **output_ptr, unsigned char *output_stop,
1265  int flags)
1266 {
1267  rb_econv_result_t res;
1268  int result_position;
1269  int has_output = 0;
1270 
1271  memset(&ec->last_error, 0, sizeof(ec->last_error));
1272 
/* Pass-through mode: no transcoder in the chain. */
1273  if (ec->num_trans == 0) {
1274  size_t len;
/* Drain bytes previously buffered in ec->in_data_* before touching input. */
1275  if (ec->in_buf_start && ec->in_data_start != ec->in_data_end) {
1276  if (output_stop - *output_ptr < ec->in_data_end - ec->in_data_start) {
/* Output is too small: copy what fits and stop. */
1277  len = output_stop - *output_ptr;
1278  memcpy(*output_ptr, ec->in_data_start, len);
1279  *output_ptr = output_stop;
1280  ec->in_data_start += len;
1282  goto gotresult;
1283  }
1284  len = ec->in_data_end - ec->in_data_start;
1285  memcpy(*output_ptr, ec->in_data_start, len);
1286  *output_ptr += len;
1287  ec->in_data_start = ec->in_data_end = ec->in_buf_start;
1288  if (flags & ECONV_AFTER_OUTPUT) {
1289  res = econv_after_output;
1290  goto gotresult;
1291  }
1292  }
/* len = min(room left in output, bytes left in input). */
1293  if (output_stop - *output_ptr < input_stop - *input_ptr) {
1294  len = output_stop - *output_ptr;
1295  }
1296  else {
1297  len = input_stop - *input_ptr;
1298  }
/* With ECONV_AFTER_OUTPUT only a single byte is copied per call. */
1299  if (0 < len && (flags & ECONV_AFTER_OUTPUT)) {
1300  *(*output_ptr)++ = *(*input_ptr)++;
1301  res = econv_after_output;
1302  goto gotresult;
1303  }
1304  memcpy(*output_ptr, *input_ptr, len);
1305  *output_ptr += len;
1306  *input_ptr += len;
1307  if (*input_ptr != input_stop)
1309  else if (flags & ECONV_PARTIAL_INPUT)
1311  else
1312  res = econv_finished;
1313  goto gotresult;
1314  }
1315 
/* Flush output still held in the last element's buffer. */
1316  if (ec->elems[ec->num_trans-1].out_data_start) {
1317  unsigned char *data_start = ec->elems[ec->num_trans-1].out_data_start;
1318  unsigned char *data_end = ec->elems[ec->num_trans-1].out_data_end;
1319  if (data_start != data_end) {
1320  size_t len;
1321  if (output_stop - *output_ptr < data_end - data_start) {
1322  len = output_stop - *output_ptr;
1323  memcpy(*output_ptr, data_start, len);
1324  *output_ptr = output_stop;
1325  ec->elems[ec->num_trans-1].out_data_start += len;
1327  goto gotresult;
1328  }
1329  len = data_end - data_start;
1330  memcpy(*output_ptr, data_start, len);
1331  *output_ptr += len;
1332  ec->elems[ec->num_trans-1].out_data_start =
1333  ec->elems[ec->num_trans-1].out_data_end =
1334  ec->elems[ec->num_trans-1].out_buf_start;
1335  has_output = 1;
1336  }
1337  }
1338 
/*
 * Feed bytes buffered in ec->in_data_* through the chain first, as partial
 * input and without ECONV_AFTER_OUTPUT; anything but "source empty" ends
 * the step.
 */
1339  if (ec->in_buf_start &&
1340  ec->in_data_start != ec->in_data_end) {
1341  res = rb_trans_conv(ec, (const unsigned char **)&ec->in_data_start, ec->in_data_end, output_ptr, output_stop,
1342  (flags&~ECONV_AFTER_OUTPUT)|ECONV_PARTIAL_INPUT, &result_position);
1343  if (res != econv_source_buffer_empty)
1344  goto gotresult;
1345  }
1346 
/*
 * Output was already produced above and the caller asked for
 * ECONV_AFTER_OUTPUT: shrink the input window to zero length so no new
 * input is consumed, and report an empty source as after_output.
 */
1347  if (has_output &&
1348  (flags & ECONV_AFTER_OUTPUT) &&
1349  *input_ptr != input_stop) {
1350  input_stop = *input_ptr;
1351  res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1352  if (res == econv_source_buffer_empty)
1353  res = econv_after_output;
1354  }
1355  else if ((flags & ECONV_AFTER_OUTPUT) ||
1356  ec->num_trans == 1) {
1357  res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1358  }
1359  else {
/* Several transcoders: force ECONV_AFTER_OUTPUT and loop while it is returned. */
1360  flags |= ECONV_AFTER_OUTPUT;
1361  do {
1362  res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1363  } while (res == econv_after_output);
1364  }
1365 
1366  gotresult:
1367  ec->last_error.result = res;
/*
 * For error results, remember which transcoding failed and its byte counts.
 * NOTE(review): lines 1373-1375 (further last_error fields) are missing.
 */
1368  if (res == econv_invalid_byte_sequence ||
1369  res == econv_incomplete_input ||
1370  res == econv_undefined_conversion) {
1371  rb_transcoding *error_tc = ec->elems[result_position].tc;
1372  ec->last_error.error_tc = error_tc;
1376  ec->last_error.error_bytes_len = error_tc->recognized_len;
1377  ec->last_error.readagain_len = error_tc->readagain_len;
1378  }
1379 
1380  return res;
1381 }
1382 
1383 static int output_replacement_character(rb_econv_t *ec);
1384 
/*
 * Insert the bytes of the last conversion error into the output of `ec` as
 * hexadecimal character references ("&#x...;").
 *
 * The error bytes are obtained as UTF-32BE (used directly when the source
 * encoding already is UTF-32BE, otherwise via allocate_converted_string()),
 * and every 4-byte unit is formatted and inserted as US-ASCII text.
 *
 * Returns 0 on success, -1 on failure (conversion failed, length not a
 * multiple of 4, or rb_econv_insert_output() failed).
 *
 * NOTE(review): listing line 1402 (arguments of the
 * allocate_converted_string() call) is missing from this extract.
 */
1385 static int
1386 output_hex_charref(rb_econv_t *ec)
1387 {
1388  int ret;
1389  unsigned char utfbuf[1024];
1390  const unsigned char *utf;
1391  size_t utf_len;
1392  int utf_allocated = 0;
1393  char charef_buf[16];
1394  const unsigned char *p;
1395 
1396  if (encoding_equal(ec->last_error.source_encoding, "UTF-32BE")) {
1397  utf = ec->last_error.error_bytes_start;
1398  utf_len = ec->last_error.error_bytes_len;
1399  }
1400  else {
1401  utf = allocate_converted_string(ec->last_error.source_encoding, "UTF-32BE",
1403  utfbuf, sizeof(utfbuf),
1404  &utf_len);
1405  if (!utf)
1406  return -1;
/* Only a buffer that is neither utfbuf nor the error bytes must be freed. */
1407  if (utf != utfbuf && utf != ec->last_error.error_bytes_start)
1408  utf_allocated = 1;
1409  }
1410 
1411  if (utf_len % 4 != 0)
1412  goto fail;
1413 
1414  p = utf;
1415  while (4 <= utf_len) {
/* Assemble one big-endian 32-bit value from four bytes. */
1416  unsigned int u = 0;
1417  u += p[0] << 24;
1418  u += p[1] << 16;
1419  u += p[2] << 8;
1420  u += p[3];
1421  snprintf(charef_buf, sizeof(charef_buf), "&#x%X;", u);
1422 
1423  ret = rb_econv_insert_output(ec, (unsigned char *)charef_buf, strlen(charef_buf), "US-ASCII");
1424  if (ret == -1)
1425  goto fail;
1426 
1427  p += 4;
1428  utf_len -= 4;
1429  }
1430 
1431  if (utf_allocated)
1432  xfree((void *)utf);
1433  return 0;
1434 
1435  fail:
1436  if (utf_allocated)
1437  xfree((void *)utf);
1438  return -1;
1439 }
1440 
1443  const unsigned char **input_ptr, const unsigned char *input_stop,
1444  unsigned char **output_ptr, unsigned char *output_stop,
1445  int flags)
1446 {
/*
 * Convert through `ec` via rb_econv_convert0() and, depending on ec->flags,
 * recover from errors by inserting a replacement and resuming.
 *
 * NOTE(review): the head of this definition (listing lines 1441-1442, return
 * type and name) is missing from this extract; the parameter list matches
 * the rb_econv_convert() calls elsewhere in the file - confirm.
 * Line 1488 (a `case` label in the second switch) is missing as well.
 *
 * Marks the converter as started. NULL input_ptr / output_ptr are replaced
 * by an empty dummy window. Returns the last result from
 * rb_econv_convert0().
 */
1447  rb_econv_result_t ret;
1448 
1449  unsigned char empty_buf;
1450  unsigned char *empty_ptr = &empty_buf;
1451 
1452  ec->started = 1;
1453 
1454  if (!input_ptr) {
1455  input_ptr = (const unsigned char **)&empty_ptr;
1456  input_stop = empty_ptr;
1457  }
1458 
1459  if (!output_ptr) {
1460  output_ptr = &empty_ptr;
1461  output_stop = empty_ptr;
1462  }
1463 
1464  resume:
1465  ret = rb_econv_convert0(ec, input_ptr, input_stop, output_ptr, output_stop, flags);
1466 
1467  if (ret == econv_invalid_byte_sequence ||
1468  ret == econv_incomplete_input) {
1469  /* deal with invalid byte sequence */
1470  /* todo: add more alternative behaviors */
1471  switch (ec->flags & ECONV_INVALID_MASK) {
1472  case ECONV_INVALID_REPLACE:
/* If the replacement cannot be inserted, the error result is returned. */
1473  if (output_replacement_character(ec) == 0)
1474  goto resume;
1475  }
1476  }
1477 
1478  if (ret == econv_undefined_conversion) {
1479  /* valid character in source encoding
1480  * but no related character(s) in destination encoding */
1481  /* todo: add more alternative behaviors */
1482  switch (ec->flags & ECONV_UNDEF_MASK) {
1483  case ECONV_UNDEF_REPLACE:
1484  if (output_replacement_character(ec) == 0)
1485  goto resume;
1486  break;
1487 
1489  if (output_hex_charref(ec) == 0)
1490  goto resume;
1491  break;
1492  }
1493  }
1494 
1495  return ret;
1496 }
1497 
/*
 * Return an encoding name taken from the transcoder of ec->last_tc, or ""
 * when the converter has no last transcoding.
 *
 * NOTE(review): listing lines 1499 (function name and parameters) and 1509
 * (the condition choosing between src_encoding and dst_encoding) are missing
 * from this extract. Callers in this file use the result of
 * rb_econv_encoding_to_insert_output(ec) as the encoding for inserted
 * output, which presumably is this function - confirm.
 */
1498 const char *
1500 {
1501  rb_transcoding *tc = ec->last_tc;
1502  const rb_transcoder *tr;
1503 
1504  if (tc == NULL)
1505  return "";
1506 
1507  tr = tc->transcoder;
1508 
1510  return tr->src_encoding;
1511  return tr->dst_encoding;
1512 }
1513 
1514 static unsigned char *
1515 allocate_converted_string(const char *sname, const char *dname,
1516  const unsigned char *str, size_t len,
1517  unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
1518  size_t *dst_len_ptr)
1519 {
1520  unsigned char *dst_str;
1521  size_t dst_len;
1522  size_t dst_bufsize;
1523 
1524  rb_econv_t *ec;
1525  rb_econv_result_t res;
1526 
1527  const unsigned char *sp;
1528  unsigned char *dp;
1529 
1530  if (caller_dst_buf)
1531  dst_bufsize = caller_dst_bufsize;
1532  else if (len == 0)
1533  dst_bufsize = 1;
1534  else
1535  dst_bufsize = len;
1536 
1537  ec = rb_econv_open(sname, dname, 0);
1538  if (ec == NULL)
1539  return NULL;
1540  if (caller_dst_buf)
1541  dst_str = caller_dst_buf;
1542  else
1543  dst_str = xmalloc(dst_bufsize);
1544  dst_len = 0;
1545  sp = str;
1546  dp = dst_str+dst_len;
1547  res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
1548  dst_len = dp - dst_str;
1549  while (res == econv_destination_buffer_full) {
1550  if (SIZE_MAX/2 < dst_bufsize) {
1551  goto fail;
1552  }
1553  dst_bufsize *= 2;
1554  if (dst_str == caller_dst_buf) {
1555  unsigned char *tmp;
1556  tmp = xmalloc(dst_bufsize);
1557  memcpy(tmp, dst_str, dst_bufsize/2);
1558  dst_str = tmp;
1559  }
1560  else {
1561  dst_str = xrealloc(dst_str, dst_bufsize);
1562  }
1563  dp = dst_str+dst_len;
1564  res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
1565  dst_len = dp - dst_str;
1566  }
1567  if (res != econv_finished) {
1568  goto fail;
1569  }
1570  rb_econv_close(ec);
1571  *dst_len_ptr = dst_len;
1572  return dst_str;
1573 
1574  fail:
1575  if (dst_str != caller_dst_buf)
1576  xfree(dst_str);
1577  rb_econv_close(ec);
1578  return NULL;
1579 }
1580 
1581 /* result: 0:success -1:failure */
/*
 * Append `len` bytes of `str` (in encoding `str_encoding`) to a buffer of
 * the converter so that they show up in its output.
 *
 * The string is first converted to the encoding returned by
 * rb_econv_encoding_to_insert_output() unless it already is in that
 * encoding. The target buffer is:
 *   - ec->in_buf_* when there are no transcoders;
 *   - the input side of the last transcoder when that transcoder is an
 *     asciicompat_encoder (its read-again bytes are re-queued after the
 *     inserted text);
 *   - the output buffer of the last transcoder otherwise.
 * The buffer is created or enlarged as needed.
 *
 * NOTE(review): listing line 1583 (function name and first parameter) is
 * missing from this extract; the signature matches the
 * rb_econv_insert_output() calls elsewhere in the file - confirm.
 */
1582 int
1584  const unsigned char *str, size_t len, const char *str_encoding)
1585 {
1586  const char *insert_encoding = rb_econv_encoding_to_insert_output(ec);
1587  unsigned char insert_buf[4096];
1588  const unsigned char *insert_str = NULL;
1589  size_t insert_len;
1590 
1591  int last_trans_index;
1592  rb_transcoding *tc;
1593 
1594  unsigned char **buf_start_p;
1595  unsigned char **data_start_p;
1596  unsigned char **data_end_p;
1597  unsigned char **buf_end_p;
1598 
1599  size_t need;
1600 
1601  ec->started = 1;
1602 
1603  if (len == 0)
1604  return 0;
1605 
1606  if (encoding_equal(insert_encoding, str_encoding)) {
1607  insert_str = str;
1608  insert_len = len;
1609  }
1610  else {
/* May return insert_buf itself or a heap block; see the xfree() checks below. */
1611  insert_str = allocate_converted_string(str_encoding, insert_encoding,
1612  str, len, insert_buf, sizeof(insert_buf), &insert_len);
1613  if (insert_str == NULL)
1614  return -1;
1615  }
1616 
1617  need = insert_len;
1618 
/* Select which buffer (described by four pointer slots) receives the text. */
1619  last_trans_index = ec->num_trans-1;
1620  if (ec->num_trans == 0) {
1621  tc = NULL;
1622  buf_start_p = &ec->in_buf_start;
1623  data_start_p = &ec->in_data_start;
1624  data_end_p = &ec->in_data_end;
1625  buf_end_p = &ec->in_buf_end;
1626  }
1627  else if (ec->elems[last_trans_index].tc->transcoder->asciicompat_type == asciicompat_encoder) {
1628  tc = ec->elems[last_trans_index].tc;
/* Room is also needed for the read-again bytes; guard against wrap-around. */
1629  need += tc->readagain_len;
1630  if (need < insert_len)
1631  goto fail;
1632  if (last_trans_index == 0) {
1633  buf_start_p = &ec->in_buf_start;
1634  data_start_p = &ec->in_data_start;
1635  data_end_p = &ec->in_data_end;
1636  buf_end_p = &ec->in_buf_end;
1637  }
1638  else {
1639  rb_econv_elem_t *ee = &ec->elems[last_trans_index-1];
1640  buf_start_p = &ee->out_buf_start;
1641  data_start_p = &ee->out_data_start;
1642  data_end_p = &ee->out_data_end;
1643  buf_end_p = &ee->out_buf_end;
1644  }
1645  }
1646  else {
1647  rb_econv_elem_t *ee = &ec->elems[last_trans_index];
1648  buf_start_p = &ee->out_buf_start;
1649  data_start_p = &ee->out_data_start;
1650  data_end_p = &ee->out_data_end;
1651  buf_end_p = &ee->out_buf_end;
1652  tc = ec->elems[last_trans_index].tc;
1653  }
1654 
/* Make sure at least `need` bytes are free after *data_end_p. */
1655  if (*buf_start_p == NULL) {
1656  unsigned char *buf = xmalloc(need);
1657  *buf_start_p = buf;
1658  *data_start_p = buf;
1659  *data_end_p = buf;
1660  *buf_end_p = buf+need;
1661  }
1662  else if ((size_t)(*buf_end_p - *data_end_p) < need) {
/* First slide the pending data to the front of the buffer ... */
1663  MEMMOVE(*buf_start_p, *data_start_p, unsigned char, *data_end_p - *data_start_p);
1664  *data_end_p = *buf_start_p + (*data_end_p - *data_start_p);
1665  *data_start_p = *buf_start_p;
/* ... and reallocate only if that still is not enough. */
1666  if ((size_t)(*buf_end_p - *data_end_p) < need) {
1667  unsigned char *buf;
1668  size_t s = (*data_end_p - *buf_start_p) + need;
1669  if (s < need)
1670  goto fail;
1671  buf = xrealloc(*buf_start_p, s);
1672  *data_start_p = buf;
1673  *data_end_p = buf + (*data_end_p - *buf_start_p);
1674  *buf_start_p = buf;
1675  *buf_end_p = buf + s;
1676  }
1677  }
1678 
1679  memcpy(*data_end_p, insert_str, insert_len);
1680  *data_end_p += insert_len;
/* Re-queue the transcoder's read-again bytes behind the inserted text. */
1681  if (tc && tc->transcoder->asciicompat_type == asciicompat_encoder) {
1682  memcpy(*data_end_p, TRANSCODING_READBUF(tc)+tc->recognized_len, tc->readagain_len);
1683  *data_end_p += tc->readagain_len;
1684  tc->readagain_len = 0;
1685  }
1686 
1687  if (insert_str != str && insert_str != insert_buf)
1688  xfree((void*)insert_str);
1689  return 0;
1690 
1691  fail:
1692  if (insert_str != str && insert_str != insert_buf)
1693  xfree((void*)insert_str);
1694  return -1;
1695 }
1696 
/*
 * Release a converter: the replacement string if the converter owns it,
 * every transcoding and its output buffer, the input buffer, the element
 * array and the converter itself.
 *
 * NOTE(review): listing line 1698 (function name and parameter) is missing
 * from this extract; presumably this is rb_econv_close(rb_econv_t *ec), which
 * other functions in the file call - confirm.
 */
1697 void
1699 {
1700  int i;
1701 
1702  if (ec->replacement_allocated) {
1703  xfree((void *)ec->replacement_str);
1704  }
1705  for (i = 0; i < ec->num_trans; i++) {
1706  rb_transcoding_close(ec->elems[i].tc);
1707  if (ec->elems[i].out_buf_start)
1708  xfree(ec->elems[i].out_buf_start);
1709  }
1710  xfree(ec->in_buf_start);
1711  xfree(ec->elems);
1712  xfree(ec);
1713 }
1714 
/*
 * Compute the number of bytes attributed to a converter: the rb_econv_t
 * itself, an owned replacement string, each transcoding (via
 * rb_transcoding_memsize()) and its output buffer, the input buffer, and
 * the element array (num_allocated entries).
 *
 * NOTE(review): listing line 1716 (function name and parameter) is missing
 * from this extract, so the name cannot be confirmed from here.
 */
1715 size_t
1717 {
1718  size_t size = sizeof(rb_econv_t);
1719  int i;
1720 
1721  if (ec->replacement_allocated) {
1722  size += ec->replacement_len;
1723  }
1724  for (i = 0; i < ec->num_trans; i++) {
1725  size += rb_transcoding_memsize(ec->elems[i].tc);
1726 
1727  if (ec->elems[i].out_buf_start) {
1728  size += ec->elems[i].out_buf_end - ec->elems[i].out_buf_start;
1729  }
1730  }
1731  size += ec->in_buf_end - ec->in_buf_start;
1732  size += sizeof(rb_econv_elem_t) * ec->num_allocated;
1733 
1734  return size;
1735 }
1736 
/*
 * Return the read-again length of the first transcoding as an int,
 * or 0 when the converter has no transcoders. Where size_t is wider than
 * int the value is clamped to INT_MAX.
 *
 * NOTE(review): listing line 1738 (function name and parameter) is missing
 * from this extract, so the name cannot be confirmed from here.
 */
1737 int
1739 {
1740  if (ec->num_trans == 0)
1741  return 0;
1742 #if SIZEOF_SIZE_T > SIZEOF_INT
1743  if (ec->elems[0].tc->readagain_len > INT_MAX) return INT_MAX;
1744 #endif
1745  return (int)ec->elems[0].tc->readagain_len;
1746 }
1747 
1748 void
1749 rb_econv_putback(rb_econv_t *ec, unsigned char *p, int n)
1750 {
1751  rb_transcoding *tc;
1752  if (ec->num_trans == 0 || n == 0)
1753  return;
1754  tc = ec->elems[0].tc;
1755  memcpy(p, TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len - n, n);
1756  tc->readagain_len -= n;
1757 }
1758 
/* NOTE(review): the opening line of this struct (listing line 1759) is missing from this extract. */
1760  const char *ascii_compat_name; /* result slot: filled by asciicompat_encoding_i() with a transcoder's dst_encoding */
1761  const char *ascii_incompat_name; /* presumably the encoding being looked up - confirm against the full file */
1762 };
1763 
1764 static int
1765 asciicompat_encoding_i(st_data_t key, st_data_t val, st_data_t arg)
1766 {
1767  struct asciicompat_encoding_t *data = (struct asciicompat_encoding_t *)arg;
1768  transcoder_entry_t *entry = (transcoder_entry_t *)val;
1769  const rb_transcoder *tr;
1770 
1771  if (DECORATOR_P(entry->sname, entry->dname))
1772  return ST_CONTINUE;
1773  tr = load_transcoder_entry(entry);
1774  if (tr && tr->asciicompat_type == asciicompat_decoder) {
1775  data->ascii_compat_name = tr->dst_encoding;
1776  return ST_STOP;
1777  }
1778  return ST_CONTINUE;
1779 }
1780 
/*
 * Look up `ascii_incompat_name` in transcoder_table and, if exactly one
 * transcoder is registered from it, return the destination encoding found
 * by asciicompat_encoding_i(); otherwise return NULL.
 *
 * NOTE(review): listing lines 1782 (function name and parameter) and 1802
 * (a statement before the st_foreach() call) are missing from this extract.
 */
1781 const char *
1783 {
1784  st_data_t v;
1785  st_table *table2;
1786  struct asciicompat_encoding_t data;
1787 
1788  if (!st_lookup(transcoder_table, (st_data_t)ascii_incompat_name, &v))
1789  return NULL;
1790  table2 = (st_table *)v;
1791 
1792  /*
1793  * Assumption:
1794  * There is at most one transcoder for
1795  * converting from ASCII incompatible encoding.
1796  *
1797  * For ISO-2022-JP, there is ISO-2022-JP -> stateless-ISO-2022-JP and no others.
1798  */
1799  if (table2->num_entries != 1)
1800  return NULL;
1801 
1803  data.ascii_compat_name = NULL;
1804  st_foreach(table2, asciicompat_encoding_i, (st_data_t)&data);
1805  return data.ascii_compat_name;
1806 }
1807 
/*
 * Convert the `len` bytes at `ss` with `ec` and append the result to the
 * Ruby string `dst`, which is created when nil is passed.
 *
 * Before each conversion attempt `dst` is grown so that at least
 * len + max_output bytes are free; the attempt is repeated for the
 * unconsumed input while the converter reports a full destination buffer.
 * Raises ArgumentError ("too long string") when the required capacity
 * exceeds LONG_MAX.
 *
 * Returns `dst`.
 *
 * NOTE(review): listing lines 1819 (statement under the
 * destination_encoding check) and 1845 (a statement before the loop
 * condition) are missing from this extract.
 */
1808 VALUE
1809 rb_econv_append(rb_econv_t *ec, const char *ss, long len, VALUE dst, int flags)
1810 {
1811  unsigned const char *sp, *se;
1812  unsigned char *ds, *dp, *de;
1813  rb_econv_result_t res;
1814  int max_output;
1815 
1816  if (NIL_P(dst)) {
1817  dst = rb_str_buf_new(len);
1818  if (ec->destination_encoding)
1820  }
1821 
1822  if (ec->last_tc)
1823  max_output = ec->last_tc->transcoder->max_output;
1824  else
1825  max_output = 1;
1826 
1827  do {
1828  long dlen = RSTRING_LEN(dst);
1829  if (rb_str_capacity(dst) - dlen < (size_t)len + max_output) {
1830  unsigned long new_capa = (unsigned long)dlen + len + max_output;
1831  if (LONG_MAX < new_capa)
1832  rb_raise(rb_eArgError, "too long string");
/* Resize to gain capacity, then restore the logical length. */
1833  rb_str_resize(dst, new_capa);
1834  rb_str_set_len(dst, dlen);
1835  }
1836  sp = (const unsigned char *)ss;
1837  se = sp + len;
1838  ds = (unsigned char *)RSTRING_PTR(dst);
1839  de = ds + rb_str_capacity(dst);
/* ds and dp both start at the end of the existing content. */
1840  dp = ds += dlen;
1841  res = rb_econv_convert(ec, &sp, se, &dp, de, flags);
/* Advance past the consumed input so a retry continues where it stopped. */
1842  len -= (const char *)sp - ss;
1843  ss = (const char *)sp;
1844  rb_str_set_len(dst, dlen + (dp - ds));
1846  } while (res == econv_destination_buffer_full);
1847 
1848  return dst;
1849 }
1850 
1851 VALUE
1852 rb_econv_substr_append(rb_econv_t *ec, VALUE src, long off, long len, VALUE dst, int flags)
1853 {
1854  src = rb_str_new_frozen(src);
1855  dst = rb_econv_append(ec, RSTRING_PTR(src) + off, len, dst, flags);
1856  RB_GC_GUARD(src);
1857  OBJ_INFECT_RAW(dst, src);
1858  return dst;
1859 }
1860 
1861 VALUE
1862 rb_econv_str_append(rb_econv_t *ec, VALUE src, VALUE dst, int flags)
1863 {
1864  return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), dst, flags);
1865 }
1866 
1867 VALUE
1868 rb_econv_substr_convert(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, int flags)
1869 {
1870  return rb_econv_substr_append(ec, src, byteoff, bytesize, Qnil, flags);
1871 }
1872 
/*
 * Convert the whole of `src` via rb_econv_substr_append() with a nil
 * destination and return the result.
 *
 * NOTE(review): listing line 1874 (function name and parameters) is missing
 * from this extract, so the name cannot be confirmed from here.
 */
1873 VALUE
1875 {
1876  return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), Qnil, flags);
1877 }
1878 
1879 static int
1880 rb_econv_add_converter(rb_econv_t *ec, const char *sname, const char *dname, int n)
1881 {
1882  transcoder_entry_t *entry;
1883  const rb_transcoder *tr;
1884 
1885  if (ec->started != 0)
1886  return -1;
1887 
1888  entry = get_transcoder_entry(sname, dname);
1889  if (!entry)
1890  return -1;
1891 
1892  tr = load_transcoder_entry(entry);
1893  if (!tr) return -1;
1894 
1895  return rb_econv_add_transcoder_at(ec, tr, n);
1896 }
1897 
1898 static int
1899 rb_econv_decorate_at(rb_econv_t *ec, const char *decorator_name, int n)
1900 {
1901  return rb_econv_add_converter(ec, "", decorator_name, n);
1902 }
1903 
/*
 * Add the decorator `decorator_name` near the front of the chain: at
 * index 0, or at index 1 when the first transcoder is not a decorator and
 * satisfies a further condition.
 *
 * NOTE(review): listing line 1915 (the second half of that condition) is
 * missing from this extract.
 *
 * Returns the result of rb_econv_decorate_at().
 */
1904 int
1905 rb_econv_decorate_at_first(rb_econv_t *ec, const char *decorator_name)
1906 {
1907  const rb_transcoder *tr;
1908 
1909  if (ec->num_trans == 0)
1910  return rb_econv_decorate_at(ec, decorator_name, 0);
1911 
1912  tr = ec->elems[0].tc->transcoder;
1913 
1914  if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
1916  return rb_econv_decorate_at(ec, decorator_name, 1);
1917 
1918  return rb_econv_decorate_at(ec, decorator_name, 0);
1919 }
1920 
/*
 * Add the decorator `decorator_name` near the end of the chain: after the
 * last transcoder (index num_trans), or just before it (index num_trans-1)
 * when the last transcoder is not a decorator and satisfies a further
 * condition.
 *
 * NOTE(review): listing line 1932 (the second half of that condition) is
 * missing from this extract.
 *
 * Returns the result of rb_econv_decorate_at().
 */
1921 int
1922 rb_econv_decorate_at_last(rb_econv_t *ec, const char *decorator_name)
1923 {
1924  const rb_transcoder *tr;
1925 
1926  if (ec->num_trans == 0)
1927  return rb_econv_decorate_at(ec, decorator_name, 0);
1928 
1929  tr = ec->elems[ec->num_trans-1].tc->transcoder;
1930 
1931  if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
1933  return rb_econv_decorate_at(ec, decorator_name, ec->num_trans-1);
1934 
1935  return rb_econv_decorate_at(ec, decorator_name, ec->num_trans);
1936 }
1937 
/*
 * Remove the newline decorator selected by ec->flags from the chain:
 * every element using that decorator's transcoder is closed and its output
 * buffer freed, and the remaining elements are compacted to the front.
 *
 * NOTE(review): listing lines 1939 (function name and parameter), 1944,
 * 1947, 1950 (the `case` labels of the switch) and 1971 (a statement before
 * the closing brace) are missing from this extract.
 */
1938 void
1940 {
1941  const char *dname = 0;
1942 
1943  switch (ec->flags & ECONV_NEWLINE_DECORATOR_MASK) {
1945  dname = "universal_newline";
1946  break;
1948  dname = "crlf_newline";
1949  break;
1951  dname = "cr_newline";
1952  break;
1953  }
1954 
1955  if (dname) {
1956  const rb_transcoder *transcoder = get_transcoder_entry("", dname)->transcoder;
/* num_trans is a snapshot: ec->num_trans shrinks while the loop runs. */
1957  int num_trans = ec->num_trans;
1958  int i, j = 0;
1959 
1960  for (i=0; i < num_trans; i++) {
1961  if (transcoder == ec->elems[i].tc->transcoder) {
1962  rb_transcoding_close(ec->elems[i].tc);
1963  xfree(ec->elems[i].out_buf_start);
1964  ec->num_trans--;
1965  }
1966  else
/* Keep this element, sliding it down over removed ones. */
1967  ec->elems[j++] = ec->elems[i];
1968  }
1969  }
1970 
1972 }
1973 
/*
 * Append a human-readable description of a conversion to `mesg` (a new
 * empty string is created when `mesg` is nil) and return it.
 *
 * The text is "<sname> to <dname>" (or just the non-empty name), followed,
 * if decorator flags are set, by " with " and a comma-separated list of
 * decorator names. When neither part applies, "no-conversion" is appended.
 *
 * NOTE(review): listing lines 1993-1995 (the rest of the decorator flag
 * mask) are missing from this extract.
 */
1974 static VALUE
1975 econv_description(const char *sname, const char *dname, int ecflags, VALUE mesg)
1976 {
1977  int has_description = 0;
1978 
1979  if (NIL_P(mesg))
1980  mesg = rb_str_new(NULL, 0);
1981 
1982  if (*sname != '\0' || *dname != '\0') {
1983  if (*sname == '\0')
1984  rb_str_cat2(mesg, dname);
1985  else if (*dname == '\0')
1986  rb_str_cat2(mesg, sname);
1987  else
1988  rb_str_catf(mesg, "%s to %s", sname, dname);
1989  has_description = 1;
1990  }
1991 
1992  if (ecflags & (ECONV_NEWLINE_DECORATOR_MASK|
/* `pre` is the separator: empty before the first name, "," afterwards. */
1996  const char *pre = "";
1997  if (has_description)
1998  rb_str_cat2(mesg, " with ");
1999  if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR) {
2000  rb_str_cat2(mesg, pre); pre = ",";
2001  rb_str_cat2(mesg, "universal_newline");
2002  }
2003  if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR) {
2004  rb_str_cat2(mesg, pre); pre = ",";
2005  rb_str_cat2(mesg, "crlf_newline");
2006  }
2007  if (ecflags & ECONV_CR_NEWLINE_DECORATOR) {
2008  rb_str_cat2(mesg, pre); pre = ",";
2009  rb_str_cat2(mesg, "cr_newline");
2010  }
2011  if (ecflags & ECONV_XML_TEXT_DECORATOR) {
2012  rb_str_cat2(mesg, pre); pre = ",";
2013  rb_str_cat2(mesg, "xml_text");
2014  }
2015  if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR) {
2016  rb_str_cat2(mesg, pre); pre = ",";
2017  rb_str_cat2(mesg, "xml_attr_content");
2018  }
2019  if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR) {
2020  rb_str_cat2(mesg, pre); pre = ",";
2021  rb_str_cat2(mesg, "xml_attr_quote");
2022  }
2023  has_description = 1;
2024  }
2025  if (!has_description) {
2026  rb_str_cat2(mesg, "no-conversion");
2027  }
2028 
2029  return mesg;
2030 }
2031 
/*
 * Build the exception object reported when no converter can be opened for
 * the given source/destination names and flags. The message is
 * "code converter not found (<description>)" with the description produced
 * by econv_description().
 *
 * NOTE(review): listing line 2039 (the statement that creates `exc` from
 * `mesg`) is missing from this extract.
 */
2032 VALUE
2033 rb_econv_open_exc(const char *sname, const char *dname, int ecflags)
2034 {
2035  VALUE mesg, exc;
2036  mesg = rb_str_new_cstr("code converter not found (");
2037  econv_description(sname, dname, ecflags, mesg);
2038  rb_str_cat2(mesg, ")");
2040  return exc;
2041 }
2042 
/*
 * Build an exception object describing ec->last_error, or return Qnil when
 * the last result is not one of the handled error kinds.
 *
 * The first visible branch formats the offending bytes (and any read-again
 * bytes) and stores them in the "error_bytes", "readagain_bytes" and
 * "incomplete_input" instance variables. The second formats the
 * unconvertible character ("U+XXXX" for a complete UTF-8 character,
 * otherwise a dump of the bytes) and stores it in "error_char". Both
 * branches record encoding names and encoding objects at `set_encs`.
 *
 * NOTE(review): this extract omits listing lines 2047-2048, 2057, 2060,
 * 2068, 2073, 2076, 2084, 2087, 2092, 2094, 2118-2119, 2125 and 2134-2135.
 * These include the conditions opening both branches, the creation of `exc`,
 * the encoding-index lookups and several rb_sprintf() arguments, so the
 * summary above is only as complete as the visible code.
 */
2043 static VALUE
2044 make_econv_exception(rb_econv_t *ec)
2045 {
2046  VALUE mesg, exc;
2049  const char *err = (const char *)ec->last_error.error_bytes_start;
2050  size_t error_len = ec->last_error.error_bytes_len;
2051  VALUE bytes = rb_str_new(err, error_len);
2052  VALUE dumped = rb_str_dump(bytes);
2053  size_t readagain_len = ec->last_error.readagain_len;
2054  VALUE bytes2 = Qnil;
2055  VALUE dumped2;
2056  int idx;
2058  mesg = rb_sprintf("incomplete %s on %s",
2059  StringValueCStr(dumped),
2061  }
2062  else if (readagain_len) {
/* The read-again bytes directly follow the error bytes in memory. */
2063  bytes2 = rb_str_new(err+error_len, readagain_len);
2064  dumped2 = rb_str_dump(bytes2);
2065  mesg = rb_sprintf("%s followed by %s on %s",
2066  StringValueCStr(dumped),
2067  StringValueCStr(dumped2),
2069  }
2070  else {
2071  mesg = rb_sprintf("%s on %s",
2072  StringValueCStr(dumped),
2074  }
2075 
2077  rb_ivar_set(exc, rb_intern("error_bytes"), bytes);
2078  rb_ivar_set(exc, rb_intern("readagain_bytes"), bytes2);
2079  rb_ivar_set(exc, rb_intern("incomplete_input"), ec->last_error.result == econv_incomplete_input ? Qtrue : Qfalse);
2080 
/* Shared tail: the second branch jumps here via `goto set_encs`. */
2081  set_encs:
2082  rb_ivar_set(exc, rb_intern("source_encoding_name"), rb_str_new2(ec->last_error.source_encoding));
2083  rb_ivar_set(exc, rb_intern("destination_encoding_name"), rb_str_new2(ec->last_error.destination_encoding));
2085  if (0 <= idx)
2086  rb_ivar_set(exc, rb_intern("source_encoding"), rb_enc_from_encoding(rb_enc_from_index(idx)));
2088  if (0 <= idx)
2089  rb_ivar_set(exc, rb_intern("destination_encoding"), rb_enc_from_encoding(rb_enc_from_index(idx)));
2090  return exc;
2091  }
2093  VALUE bytes = rb_str_new((const char *)ec->last_error.error_bytes_start,
2095  VALUE dumped = Qnil;
2096  int idx;
/* For UTF-8 sources, show a single complete character as "U+XXXX". */
2097  if (strcmp(ec->last_error.source_encoding, "UTF-8") == 0) {
2098  rb_encoding *utf8 = rb_utf8_encoding();
2099  const char *start, *end;
2100  int n;
2101  start = (const char *)ec->last_error.error_bytes_start;
2102  end = start + ec->last_error.error_bytes_len;
2103  n = rb_enc_precise_mbclen(start, end, utf8);
2104  if (MBCLEN_CHARFOUND_P(n) &&
2105  (size_t)MBCLEN_CHARFOUND_LEN(n) == ec->last_error.error_bytes_len) {
2106  unsigned int cc = rb_enc_mbc_to_codepoint(start, end, utf8);
2107  dumped = rb_sprintf("U+%04X", cc);
2108  }
2109  }
2110  if (dumped == Qnil)
2111  dumped = rb_str_dump(bytes);
/*
 * Short message when the failing step spans the converter's whole
 * source/destination pair; otherwise spell out the conversion path.
 */
2112  if (strcmp(ec->last_error.source_encoding,
2113  ec->source_encoding_name) == 0 &&
2114  strcmp(ec->last_error.destination_encoding,
2115  ec->destination_encoding_name) == 0) {
2116  mesg = rb_sprintf("%s from %s to %s",
2117  StringValueCStr(dumped),
2120  }
2121  else {
2122  int i;
2123  mesg = rb_sprintf("%s to %s in conversion from %s",
2124  StringValueCStr(dumped),
2126  ec->source_encoding_name);
2127  for (i = 0; i < ec->num_trans; i++) {
2128  const rb_transcoder *tr = ec->elems[i].tc->transcoder;
2129  if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
2130  rb_str_catf(mesg, " to %s",
2131  ec->elems[i].tc->transcoder->dst_encoding);
2132  }
2133  }
2136  if (0 <= idx)
2137  rb_enc_associate_index(bytes, idx);
2138  rb_ivar_set(exc, rb_intern("error_char"), bytes);
2139  goto set_encs;
2140  }
2141  return Qnil;
2142 }
2143 
2144 static void
2145 more_output_buffer(
2146  VALUE destination,
2147  unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2148  int max_output,
2149  unsigned char **out_start_ptr,
2150  unsigned char **out_pos,
2151  unsigned char **out_stop_ptr)
2152 {
2153  size_t len = (*out_pos - *out_start_ptr);
2154  size_t new_len = (len + max_output) * 2;
2155  *out_start_ptr = resize_destination(destination, len, new_len);
2156  *out_pos = *out_start_ptr + len;
2157  *out_stop_ptr = *out_start_ptr + new_len;
2158 }
2159 
/*
 * Ensure ec->replacement_str is set, choosing a default when none exists.
 *
 * If the converter has an insert-output encoding (non-empty name from
 * rb_econv_encoding_to_insert_output()), the default comes from
 * get_replacement_character(); otherwise it is "?" with an empty encoding
 * name. The chosen string is recorded as not owned by the converter
 * (replacement_allocated = 0).
 *
 * Always returns 0 in the visible code.
 *
 * NOTE(review): listing line 2178 (a statement using `tr`) is missing from
 * this extract.
 */
2160 static int
2161 make_replacement(rb_econv_t *ec)
2162 {
2163  rb_transcoding *tc;
2164  const rb_transcoder *tr;
2165  const unsigned char *replacement;
2166  const char *repl_enc;
2167  const char *ins_enc;
2168  size_t len;
2169 
/* A replacement is already configured: nothing to do. */
2170  if (ec->replacement_str)
2171  return 0;
2172 
2173  ins_enc = rb_econv_encoding_to_insert_output(ec);
2174 
2175  tc = ec->last_tc;
2176  if (*ins_enc) {
2177  tr = tc->transcoder;
2179  replacement = (const unsigned char *)get_replacement_character(ins_enc, &len, &repl_enc);
2180  }
2181  else {
2182  replacement = (unsigned char *)"?";
2183  len = 1;
2184  repl_enc = "";
2185  }
2186 
2187  ec->replacement_str = replacement;
2188  ec->replacement_len = len;
2189  ec->replacement_enc = repl_enc;
2190  ec->replacement_allocated = 0;
2191  return 0;
2192 }
2193 
/*
 * Install `str` (`len` bytes, encoding `encname`) as the replacement string
 * of the converter.
 *
 * The converter stores its own heap copy: a plain copy when there is no
 * insert-output encoding or it equals `encname`, otherwise the string
 * converted to the insert-output encoding. A previously owned replacement
 * is freed.
 *
 * Returns 0 on success, -1 when the conversion of `str` fails.
 *
 * NOTE(review): listing line 2195 (function name and first parameter) is
 * missing from this extract, so the name cannot be confirmed from here.
 */
2194 int
2196  const unsigned char *str, size_t len, const char *encname)
2197 {
2198  unsigned char *str2;
2199  size_t len2;
2200  const char *encname2;
2201 
2202  encname2 = rb_econv_encoding_to_insert_output(ec);
2203 
2204  if (!*encname2 || encoding_equal(encname, encname2)) {
2205  str2 = xmalloc(len);
2206  MEMCPY(str2, str, unsigned char, len); /* xxx: str may be invalid */
2207  len2 = len;
2208  encname2 = encname;
2209  }
2210  else {
/* NULL buffer: allocate_converted_string() always returns a heap block here. */
2211  str2 = allocate_converted_string(encname, encname2, str, len, NULL, 0, &len2);
2212  if (!str2)
2213  return -1;
2214  }
2215 
2216  if (ec->replacement_allocated) {
2217  xfree((void *)ec->replacement_str);
2218  }
2219  ec->replacement_allocated = 1;
2220  ec->replacement_str = str2;
2221  ec->replacement_len = len2;
2222  ec->replacement_enc = encname2;
2223  return 0;
2224 }
2225 
/*
 * Make sure a replacement string exists (make_replacement()) and emit it.
 * Returns 0 on success, -1 on failure.
 *
 * NOTE(review): listing line 2234 (the statement assigning `ret`) is missing
 * from this extract; presumably it inserts the replacement string into the
 * converter's output - confirm against the full file.
 */
2226 static int
2227 output_replacement_character(rb_econv_t *ec)
2228 {
2229  int ret;
2230 
2231  if (make_replacement(ec) == -1)
2232  return -1;
2233 
2235  if (ret == -1)
2236  return -1;
2237 
2238  return 0;
2239 }
2240 
2241 #if 1
2242 #define hash_fallback rb_hash_aref
2243 
2244 static VALUE
2245 proc_fallback(VALUE fallback, VALUE c)
2246 {
2247  return rb_proc_call(fallback, rb_ary_new4(1, &c));
2248 }
2249 
2250 static VALUE
2251 method_fallback(VALUE fallback, VALUE c)
2252 {
2253  return rb_method_call(1, &c, fallback);
2254 }
2255 
2256 static VALUE
2257 aref_fallback(VALUE fallback, VALUE c)
2258 {
2259  return rb_funcall3(fallback, sym_aref, 1, &c);
2260 }
2261 
/*
 * Convert the bytes in [*in_pos, in_stop) from `src_encoding` to
 * `dst_encoding`, writing at *out_pos and growing the destination through
 * `resize_destination` whenever it fills up.
 *
 * ecflags / ecopts select converter behaviour; when ecopts is a Hash its
 * sym_fallback entry supplies a fallback for undefined conversions: a Hash
 * (looked up with rb_hash_aref), a Proc, a Method, or any other object
 * (called through aref_fallback).
 *
 * Raises the exception from rb_econv_open_exc() when no converter can be
 * opened, the exception from make_econv_exception() on a conversion error
 * that is not recovered, and ArgumentError ("too big fallback string") when
 * a fallback result cannot be inserted.
 *
 * NOTE(review): listing lines 2309-2310 (remaining arguments of the
 * rb_enc_str_new() call) are missing from this extract.
 */
2262 static void
2263 transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
2264  const unsigned char *in_stop, unsigned char *out_stop,
2265  VALUE destination,
2266  unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2267  const char *src_encoding,
2268  const char *dst_encoding,
2269  int ecflags,
2270  VALUE ecopts)
2271 {
2272  rb_econv_t *ec;
2273  rb_transcoding *last_tc;
2274  rb_econv_result_t ret;
2275  unsigned char *out_start = *out_pos;
2276  int max_output;
2277  VALUE exc;
2278  VALUE fallback = Qnil;
2279  VALUE (*fallback_func)(VALUE, VALUE) = 0;
2280 
2281  ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
2282  if (!ec)
2283  rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
2284 
/*
 * Pick the calling convention for the fallback object. A nil fallback also
 * lands in the last branch, but is never called because of the NIL_P check
 * after the conversion below.
 */
2285  if (!NIL_P(ecopts) && RB_TYPE_P(ecopts, T_HASH)) {
2286  fallback = rb_hash_aref(ecopts, sym_fallback);
2287  if (RB_TYPE_P(fallback, T_HASH)) {
2288  fallback_func = hash_fallback;
2289  }
2290  else if (rb_obj_is_proc(fallback)) {
2291  fallback_func = proc_fallback;
2292  }
2293  else if (rb_obj_is_method(fallback)) {
2294  fallback_func = method_fallback;
2295  }
2296  else {
2297  fallback_func = aref_fallback;
2298  }
2299  }
2300  last_tc = ec->last_tc;
2301  max_output = last_tc ? last_tc->transcoder->max_output : 1;
2302 
2303  resume:
2304  ret = rb_econv_convert(ec, in_pos, in_stop, out_pos, out_stop, 0);
2305 
/* Undefined conversion: ask the fallback for a substitute string. */
2306  if (!NIL_P(fallback) && ret == econv_undefined_conversion) {
2307  VALUE rep = rb_enc_str_new(
2308  (const char *)ec->last_error.error_bytes_start,
2311  rep = (*fallback_func)(fallback, rep);
2312  if (rep != Qundef && !NIL_P(rep)) {
2313  StringValue(rep);
/* `ret` is reused here for the int status of rb_econv_insert_output(). */
2314  ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(rep),
2315  RSTRING_LEN(rep), rb_enc_name(rb_enc_get(rep)));
2316  if ((int)ret == -1) {
2317  rb_raise(rb_eArgError, "too big fallback string");
2318  }
2319  goto resume;
2320  }
2321  }
2322 
/* Unrecovered conversion error: close the converter, then raise. */
2323  if (ret == econv_invalid_byte_sequence ||
2324  ret == econv_incomplete_input ||
2325  ret == econv_undefined_conversion) {
2326  exc = make_econv_exception(ec);
2327  rb_econv_close(ec);
2328  rb_exc_raise(exc);
2329  }
2330 
2331  if (ret == econv_destination_buffer_full) {
2332  more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
2333  goto resume;
2334  }
2335 
2336  rb_econv_close(ec);
2337  return;
2338 }
2339 #else
2340 /* sample transcode_loop implementation in byte-by-byte stream style */
/*
 * Alternative transcode_loop, compiled only when the `#if 1` above is
 * changed: it feeds the converter one input byte at a time with
 * ECONV_PARTIAL_INPUT and signals end of input with a final call without
 * that flag. Same parameters as the primary implementation; no fallback
 * handling is present in the visible code.
 *
 * NOTE(review): listing lines 2366 (initialisation of `ret` before the
 * loop), 2387-2389, 2395 and 2399 (`case` labels of the switch) are missing
 * from this extract.
 */
2341 static void
2342 transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
2343  const unsigned char *in_stop, unsigned char *out_stop,
2344  VALUE destination,
2345  unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2346  const char *src_encoding,
2347  const char *dst_encoding,
2348  int ecflags,
2349  VALUE ecopts)
2350 {
2351  rb_econv_t *ec;
2352  rb_transcoding *last_tc;
2353  rb_econv_result_t ret;
2354  unsigned char *out_start = *out_pos;
2355  const unsigned char *ptr;
2356  int max_output;
2357  VALUE exc;
2358 
2359  ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
2360  if (!ec)
2361  rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
2362 
2363  last_tc = ec->last_tc;
2364  max_output = last_tc ? last_tc->transcoder->max_output : 1;
2365 
2367  ptr = *in_pos;
2368  while (ret != econv_finished) {
2369  unsigned char input_byte;
2370  const unsigned char *p = &input_byte;
2371 
2372  if (ret == econv_source_buffer_empty) {
2373  if (ptr < in_stop) {
/* Offer exactly one byte as partial input. */
2374  input_byte = *ptr;
2375  ret = rb_econv_convert(ec, &p, p+1, out_pos, out_stop, ECONV_PARTIAL_INPUT);
2376  }
2377  else {
/* Input exhausted: call without ECONV_PARTIAL_INPUT. */
2378  ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, 0);
2379  }
2380  }
2381  else {
2382  ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, ECONV_PARTIAL_INPUT);
2383  }
/* Advance the real input position by however far `p` moved. */
2384  if (&input_byte != p)
2385  ptr += p - &input_byte;
2386  switch (ret) {
2390  exc = make_econv_exception(ec);
2391  rb_econv_close(ec);
2392  rb_exc_raise(exc);
2393  break;
2394 
2396  more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
2397  break;
2398 
2400  break;
2401 
2402  case econv_finished:
2403  break;
2404  }
2405  }
2406  rb_econv_close(ec);
2407  *in_pos = in_stop;
2408  return;
2409 }
2410 #endif
2411 
2412 
2413 /*
2414  * String-specific code
2415  */
2416 
2417 static unsigned char *
2418 str_transcoding_resize(VALUE destination, size_t len, size_t new_len)
2419 {
2420  rb_str_resize(destination, new_len);
2421  return (unsigned char *)RSTRING_PTR(destination);
2422 }
2423 
2424 static int
2425 econv_opts(VALUE opt, int ecflags)
2426 {
2427  VALUE v;
2428 
2429  v = rb_hash_aref(opt, sym_invalid);
2430  if (NIL_P(v)) {
2431  }
2432  else if (v==sym_replace) {
2433  ecflags |= ECONV_INVALID_REPLACE;
2434  }
2435  else {
2436  rb_raise(rb_eArgError, "unknown value for invalid character option");
2437  }
2438 
2439  v = rb_hash_aref(opt, sym_undef);
2440  if (NIL_P(v)) {
2441  }
2442  else if (v==sym_replace) {
2443  ecflags |= ECONV_UNDEF_REPLACE;
2444  }
2445  else {
2446  rb_raise(rb_eArgError, "unknown value for undefined character option");
2447  }
2448 
2449  v = rb_hash_aref(opt, sym_replace);
2450  if (!NIL_P(v) && !(ecflags & ECONV_INVALID_REPLACE)) {
2451  ecflags |= ECONV_UNDEF_REPLACE;
2452  }
2453 
2454  v = rb_hash_aref(opt, sym_xml);
2455  if (!NIL_P(v)) {
2456  if (v==sym_text) {
2458  }
2459  else if (v==sym_attr) {
2461  }
2462  else if (RB_TYPE_P(v, T_SYMBOL)) {
2463  rb_raise(rb_eArgError, "unexpected value for xml option: %"PRIsVALUE, rb_sym2str(v));
2464  }
2465  else {
2466  rb_raise(rb_eArgError, "unexpected value for xml option");
2467  }
2468  }
2469 
2470 #ifdef ENABLE_ECONV_NEWLINE_OPTION
2471  v = rb_hash_aref(opt, sym_newline);
2472  if (!NIL_P(v)) {
2473  ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK;
2474  if (v == sym_universal) {
2476  }
2477  else if (v == sym_crlf) {
2478  ecflags |= ECONV_CRLF_NEWLINE_DECORATOR;
2479  }
2480  else if (v == sym_cr) {
2481  ecflags |= ECONV_CR_NEWLINE_DECORATOR;
2482  }
2483  else if (v == sym_lf) {
2484  /* ecflags |= ECONV_LF_NEWLINE_DECORATOR; */
2485  }
2486  else if (SYMBOL_P(v)) {
2487  rb_raise(rb_eArgError, "unexpected value for newline option: %"PRIsVALUE,
2488  rb_sym2str(v));
2489  }
2490  else {
2491  rb_raise(rb_eArgError, "unexpected value for newline option");
2492  }
2493  }
2494  else
2495 #endif
2496  {
2497  int setflags = 0, newlineflag = 0;
2498 
2499  v = rb_hash_aref(opt, sym_universal_newline);
2500  if (RTEST(v))
2502  newlineflag |= !NIL_P(v);
2503 
2504  v = rb_hash_aref(opt, sym_crlf_newline);
2505  if (RTEST(v))
2506  setflags |= ECONV_CRLF_NEWLINE_DECORATOR;
2507  newlineflag |= !NIL_P(v);
2508 
2509  v = rb_hash_aref(opt, sym_cr_newline);
2510  if (RTEST(v))
2511  setflags |= ECONV_CR_NEWLINE_DECORATOR;
2512  newlineflag |= !NIL_P(v);
2513 
2514  if (newlineflag) {
2515  ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK;
2516  ecflags |= setflags;
2517  }
2518  }
2519 
2520  return ecflags;
2521 }
2522 
2523 int
2524 rb_econv_prepare_options(VALUE opthash, VALUE *opts, int ecflags)
2525 {
2526  VALUE newhash = Qnil;
2527  VALUE v;
2528 
2529  if (NIL_P(opthash)) {
2530  *opts = Qnil;
2531  return ecflags;
2532  }
2533  ecflags = econv_opts(opthash, ecflags);
2534 
2535  v = rb_hash_aref(opthash, sym_replace);
2536  if (!NIL_P(v)) {
2537  StringValue(v);
2539  VALUE dumped = rb_str_dump(v);
2540  rb_raise(rb_eArgError, "replacement string is broken: %s as %s",
2541  StringValueCStr(dumped),
2542  rb_enc_name(rb_enc_get(v)));
2543  }
2544  v = rb_str_new_frozen(v);
2545  newhash = rb_hash_new();
2546  rb_hash_aset(newhash, sym_replace, v);
2547  }
2548 
2549  v = rb_hash_aref(opthash, sym_fallback);
2550  if (!NIL_P(v)) {
2551  VALUE h = rb_check_hash_type(v);
2552  if (NIL_P(h)
2553  ? (rb_obj_is_proc(v) || rb_obj_is_method(v) || rb_respond_to(v, sym_aref))
2554  : (v = h, 1)) {
2555  if (NIL_P(newhash))
2556  newhash = rb_hash_new();
2557  rb_hash_aset(newhash, sym_fallback, v);
2558  }
2559  }
2560 
2561  if (!NIL_P(newhash))
2562  rb_hash_freeze(newhash);
2563  *opts = newhash;
2564 
2565  return ecflags;
2566 }
2567 
2568 int
2570 {
2571  return rb_econv_prepare_options(opthash, opts, 0);
2572 }
2573 
2574 rb_econv_t *
2575 rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE opthash)
2576 {
2577  rb_econv_t *ec;
2578  VALUE replacement;
2579 
2580  if (NIL_P(opthash)) {
2581  replacement = Qnil;
2582  }
2583  else {
2584  if (!RB_TYPE_P(opthash, T_HASH) || !OBJ_FROZEN(opthash))
2585  rb_bug("rb_econv_open_opts called with invalid opthash");
2586  replacement = rb_hash_aref(opthash, sym_replace);
2587  }
2588 
2589  ec = rb_econv_open(source_encoding, destination_encoding, ecflags);
2590  if (!ec)
2591  return ec;
2592 
2593  if (!NIL_P(replacement)) {
2594  int ret;
2595  rb_encoding *enc = rb_enc_get(replacement);
2596 
2597  ret = rb_econv_set_replacement(ec,
2598  (const unsigned char *)RSTRING_PTR(replacement),
2599  RSTRING_LEN(replacement),
2600  rb_enc_name(enc));
2601  if (ret == -1) {
2602  rb_econv_close(ec);
2603  return NULL;
2604  }
2605  }
2606  return ec;
2607 }
2608 
2609 static int
2610 enc_arg(VALUE *arg, const char **name_p, rb_encoding **enc_p)
2611 {
2612  rb_encoding *enc;
2613  const char *n;
2614  int encidx;
2615  VALUE encval;
2616 
2617  if (((encidx = rb_to_encoding_index(encval = *arg)) < 0) ||
2618  !(enc = rb_enc_from_index(encidx))) {
2619  enc = NULL;
2620  encidx = 0;
2621  n = StringValueCStr(*arg);
2622  }
2623  else {
2624  n = rb_enc_name(enc);
2625  }
2626 
2627  *name_p = n;
2628  *enc_p = enc;
2629 
2630  return encidx;
2631 }
2632 
2633 static int
2634 str_transcode_enc_args(VALUE str, VALUE *arg1, VALUE *arg2,
2635  const char **sname_p, rb_encoding **senc_p,
2636  const char **dname_p, rb_encoding **denc_p)
2637 {
2638  rb_encoding *senc, *denc;
2639  const char *sname, *dname;
2640  int sencidx, dencidx;
2641 
2642  dencidx = enc_arg(arg1, &dname, &denc);
2643 
2644  if (NIL_P(*arg2)) {
2645  sencidx = rb_enc_get_index(str);
2646  senc = rb_enc_from_index(sencidx);
2647  sname = rb_enc_name(senc);
2648  }
2649  else {
2650  sencidx = enc_arg(arg2, &sname, &senc);
2651  }
2652 
2653  *sname_p = sname;
2654  *senc_p = senc;
2655  *dname_p = dname;
2656  *denc_p = denc;
2657  return dencidx;
2658 }
2659 
2660 static int
2661 str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags, VALUE ecopts)
2662 {
2663  VALUE dest;
2664  VALUE str = *self;
2665  VALUE arg1, arg2;
2666  long blen, slen;
2667  unsigned char *buf, *bp, *sp;
2668  const unsigned char *fromp;
2669  rb_encoding *senc, *denc;
2670  const char *sname, *dname;
2671  int dencidx;
2672  int explicitly_invalid_replace = TRUE;
2673 
2674  rb_check_arity(argc, 0, 2);
2675 
2676  if (argc == 0) {
2677  arg1 = rb_enc_default_internal();
2678  if (NIL_P(arg1)) {
2679  if (!ecflags) return -1;
2680  arg1 = rb_obj_encoding(str);
2681  }
2682  if (!(ecflags & ECONV_INVALID_MASK)) {
2683  explicitly_invalid_replace = FALSE;
2684  }
2686  }
2687  else {
2688  arg1 = argv[0];
2689  }
2690  arg2 = argc<=1 ? Qnil : argv[1];
2691  dencidx = str_transcode_enc_args(str, &arg1, &arg2, &sname, &senc, &dname, &denc);
2692 
2693  if ((ecflags & (ECONV_NEWLINE_DECORATOR_MASK|
2697  if (senc && senc == denc) {
2698  if ((ecflags & ECONV_INVALID_MASK) && explicitly_invalid_replace) {
2699  VALUE rep = Qnil;
2700  if (!NIL_P(ecopts)) {
2701  rep = rb_hash_aref(ecopts, sym_replace);
2702  }
2703  dest = rb_enc_str_scrub(senc, str, rep);
2704  if (NIL_P(dest)) dest = str;
2705  *self = dest;
2706  return dencidx;
2707  }
2708  return NIL_P(arg2) ? -1 : dencidx;
2709  }
2710  if (senc && denc && rb_enc_asciicompat(senc) && rb_enc_asciicompat(denc)) {
2712  return dencidx;
2713  }
2714  }
2715  if (encoding_equal(sname, dname)) {
2716  return NIL_P(arg2) ? -1 : dencidx;
2717  }
2718  }
2719  else {
2720  if (encoding_equal(sname, dname)) {
2721  sname = "";
2722  dname = "";
2723  }
2724  }
2725 
2726  fromp = sp = (unsigned char *)RSTRING_PTR(str);
2727  slen = RSTRING_LEN(str);
2728  blen = slen + 30; /* len + margin */
2729  dest = rb_str_tmp_new(blen);
2730  bp = (unsigned char *)RSTRING_PTR(dest);
2731 
2732  transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), dest, str_transcoding_resize, sname, dname, ecflags, ecopts);
2733  if (fromp != sp+slen) {
2734  rb_raise(rb_eArgError, "not fully converted, %"PRIdPTRDIFF" bytes left", sp+slen-fromp);
2735  }
2736  buf = (unsigned char *)RSTRING_PTR(dest);
2737  *bp = '\0';
2738  rb_str_set_len(dest, bp - buf);
2739 
2740  /* set encoding */
2741  if (!denc) {
2742  dencidx = rb_define_dummy_encoding(dname);
2743  RB_GC_GUARD(arg1);
2744  RB_GC_GUARD(arg2);
2745  }
2746  *self = dest;
2747 
2748  return dencidx;
2749 }
2750 
2751 static int
2752 str_transcode(int argc, VALUE *argv, VALUE *self)
2753 {
2754  VALUE opt;
2755  int ecflags = 0;
2756  VALUE ecopts = Qnil;
2757 
2758  argc = rb_scan_args(argc, argv, "02:", NULL, NULL, &opt);
2759  if (!NIL_P(opt)) {
2760  ecflags = rb_econv_prepare_opts(opt, &ecopts);
2761  }
2762  return str_transcode0(argc, argv, self, ecflags, ecopts);
2763 }
2764 
2765 static inline VALUE
2766 str_encode_associate(VALUE str, int encidx)
2767 {
2768  int cr = 0;
2769 
2770  rb_enc_associate_index(str, encidx);
2771 
2772  /* transcoded string never be broken. */
2773  if (rb_enc_asciicompat(rb_enc_from_index(encidx))) {
2775  }
2776  else {
2777  cr = ENC_CODERANGE_VALID;
2778  }
2779  ENC_CODERANGE_SET(str, cr);
2780  return str;
2781 }
2782 
2783 /*
2784  * call-seq:
2785  * str.encode!(encoding [, options] ) -> str
2786  * str.encode!(dst_encoding, src_encoding [, options] ) -> str
2787  *
2788  * The first form transcodes the contents of <i>str</i> from
2789  * str.encoding to +encoding+.
2790  * The second form transcodes the contents of <i>str</i> from
2791  * src_encoding to dst_encoding.
2792  * The options Hash gives details for conversion. See String#encode
2793  * for details.
2794  * Returns the string even if no changes were made.
2795  */
2796 
2797 static VALUE
2798 str_encode_bang(int argc, VALUE *argv, VALUE str)
2799 {
2800  VALUE newstr;
2801  int encidx;
2802 
2803  rb_check_frozen(str);
2804 
2805  newstr = str;
2806  encidx = str_transcode(argc, argv, &newstr);
2807 
2808  if (encidx < 0) return str;
2809  if (newstr == str) {
2810  rb_enc_associate_index(str, encidx);
2811  return str;
2812  }
2813  rb_str_shared_replace(str, newstr);
2814  return str_encode_associate(str, encidx);
2815 }
2816 
2817 static VALUE encoded_dup(VALUE newstr, VALUE str, int encidx);
2818 
2819 /*
2820  * call-seq:
2821  * str.encode(encoding [, options] ) -> str
2822  * str.encode(dst_encoding, src_encoding [, options] ) -> str
2823  * str.encode([options]) -> str
2824  *
2825  * The first form returns a copy of +str+ transcoded
2826  * to encoding +encoding+.
2827  * The second form returns a copy of +str+ transcoded
2828  * from src_encoding to dst_encoding.
2829  * The last form returns a copy of +str+ transcoded to
2830  * <tt>Encoding.default_internal</tt>.
2831  *
2832  * By default, the first and second form raise
2833  * Encoding::UndefinedConversionError for characters that are
2834  * undefined in the destination encoding, and
2835  * Encoding::InvalidByteSequenceError for invalid byte sequences
2836  * in the source encoding. The last form by default does not raise
2837  * exceptions but uses replacement strings.
2838  *
2839  * The +options+ Hash gives details for conversion and can have the following
2840  * keys:
2841  *
2842  * :invalid ::
2843  * If the value is +:replace+, #encode replaces invalid byte sequences in
2844  * +str+ with the replacement character. The default is to raise the
2845  * Encoding::InvalidByteSequenceError exception
2846  * :undef ::
2847  * If the value is +:replace+, #encode replaces characters which are
2848  * undefined in the destination encoding with the replacement character.
2849  * The default is to raise the Encoding::UndefinedConversionError.
2850  * :replace ::
2851  * Sets the replacement string to the given value. The default replacement
2852  * string is "\uFFFD" for Unicode encoding forms, and "?" otherwise.
2853  * :fallback ::
2854  * Sets the replacement string by the given object for undefined
2855  * character. The object should be a Hash, a Proc, a Method, or an
2856  * object which has [] method.
2857  * Its key is an undefined character encoded in the source encoding
2858  * of current transcoder. Its value can be in any encoding as long as it
2859  * can be converted into the destination encoding of the transcoder.
2860  * :xml ::
2861  * The value must be +:text+ or +:attr+.
2862  * If the value is +:text+ #encode replaces undefined characters with their
2863  * (upper-case hexadecimal) numeric character references. '&', '<', and '>'
2864  * are converted to "&amp;", "&lt;", and "&gt;", respectively.
2865  * If the value is +:attr+, #encode also quotes the replacement result
2866  * (using '"'), and replaces '"' with "&quot;".
2867  * :cr_newline ::
2868  * Replaces LF ("\n") with CR ("\r") if value is true.
2869  * :crlf_newline ::
2870  * Replaces LF ("\n") with CRLF ("\r\n") if value is true.
2871  * :universal_newline ::
2872  * Replaces CRLF ("\r\n") and CR ("\r") with LF ("\n") if value is true.
2873  */
2874 
2875 static VALUE
2876 str_encode(int argc, VALUE *argv, VALUE str)
2877 {
2878  VALUE newstr = str;
2879  int encidx = str_transcode(argc, argv, &newstr);
2880  return encoded_dup(newstr, str, encidx);
2881 }
2882 
2883 VALUE
2884 rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
2885 {
2886  int argc = 1;
2887  VALUE *argv = &to;
2888  VALUE newstr = str;
2889  int encidx = str_transcode0(argc, argv, &newstr, ecflags, ecopts);
2890  return encoded_dup(newstr, str, encidx);
2891 }
2892 
2893 static VALUE
2894 encoded_dup(VALUE newstr, VALUE str, int encidx)
2895 {
2896  if (encidx < 0) return rb_str_dup(str);
2897  if (newstr == str) {
2898  newstr = rb_str_dup(str);
2899  rb_enc_associate_index(newstr, encidx);
2900  return newstr;
2901  }
2902  else {
2903  RBASIC_SET_CLASS(newstr, rb_obj_class(str));
2904  }
2905  return str_encode_associate(newstr, encidx);
2906 }
2907 
2908 static void
2909 econv_free(void *ptr)
2910 {
2911  rb_econv_t *ec = ptr;
2912  rb_econv_close(ec);
2913 }
2914 
2915 static size_t
2916 econv_memsize(const void *ptr)
2917 {
2918  return sizeof(rb_econv_t);
2919 }
2920 
2921 static const rb_data_type_t econv_data_type = {
2922  "econv",
2923  {NULL, econv_free, econv_memsize,},
2925 };
2926 
2927 static VALUE
2928 econv_s_allocate(VALUE klass)
2929 {
2930  return TypedData_Wrap_Struct(klass, &econv_data_type, NULL);
2931 }
2932 
2933 static rb_encoding *
2934 make_dummy_encoding(const char *name)
2935 {
2936  rb_encoding *enc;
2937  int idx;
2938  idx = rb_define_dummy_encoding(name);
2939  enc = rb_enc_from_index(idx);
2940  return enc;
2941 }
2942 
2943 static rb_encoding *
2944 make_encoding(const char *name)
2945 {
2946  rb_encoding *enc;
2947  enc = rb_enc_find(name);
2948  if (!enc)
2949  enc = make_dummy_encoding(name);
2950  return enc;
2951 }
2952 
2953 static VALUE
2954 make_encobj(const char *name)
2955 {
2956  return rb_enc_from_encoding(make_encoding(name));
2957 }
2958 
2959 /*
2960  * call-seq:
2961  * Encoding::Converter.asciicompat_encoding(string) -> encoding or nil
2962  * Encoding::Converter.asciicompat_encoding(encoding) -> encoding or nil
2963  *
2964  * Returns the corresponding ASCII compatible encoding.
2965  *
2966  * Returns nil if the argument is an ASCII compatible encoding.
2967  *
2968  * "corresponding ASCII compatible encoding" is an ASCII compatible encoding which
2969  * can represent exactly the same characters as the given ASCII incompatible encoding.
2970  * So, no conversion undefined error occurs when converting between the two encodings.
2971  *
2972  * Encoding::Converter.asciicompat_encoding("ISO-2022-JP") #=> #<Encoding:stateless-ISO-2022-JP>
2973  * Encoding::Converter.asciicompat_encoding("UTF-16BE") #=> #<Encoding:UTF-8>
2974  * Encoding::Converter.asciicompat_encoding("UTF-8") #=> nil
2975  *
2976  */
2977 static VALUE
2978 econv_s_asciicompat_encoding(VALUE klass, VALUE arg)
2979 {
2980  const char *arg_name, *result_name;
2981  rb_encoding *arg_enc, *result_enc;
2982 
2983  enc_arg(&arg, &arg_name, &arg_enc);
2984 
2985  result_name = rb_econv_asciicompat_encoding(arg_name);
2986 
2987  if (result_name == NULL)
2988  return Qnil;
2989 
2990  result_enc = make_encoding(result_name);
2991 
2992  return rb_enc_from_encoding(result_enc);
2993 }
2994 
2995 static void
2996 econv_args(int argc, VALUE *argv,
2997  VALUE *snamev_p, VALUE *dnamev_p,
2998  const char **sname_p, const char **dname_p,
2999  rb_encoding **senc_p, rb_encoding **denc_p,
3000  int *ecflags_p,
3001  VALUE *ecopts_p)
3002 {
3003  VALUE opt, flags_v, ecopts;
3004  int sidx, didx;
3005  const char *sname, *dname;
3006  rb_encoding *senc, *denc;
3007  int ecflags;
3008 
3009  argc = rb_scan_args(argc, argv, "21:", snamev_p, dnamev_p, &flags_v, &opt);
3010 
3011  if (!NIL_P(flags_v)) {
3012  if (!NIL_P(opt)) {
3013  rb_error_arity(argc + 1, 2, 3);
3014  }
3015  ecflags = NUM2INT(rb_to_int(flags_v));
3016  ecopts = Qnil;
3017  }
3018  else if (!NIL_P(opt)) {
3019  ecflags = rb_econv_prepare_opts(opt, &ecopts);
3020  }
3021  else {
3022  ecflags = 0;
3023  ecopts = Qnil;
3024  }
3025 
3026  senc = NULL;
3027  sidx = rb_to_encoding_index(*snamev_p);
3028  if (0 <= sidx) {
3029  senc = rb_enc_from_index(sidx);
3030  }
3031  else {
3032  StringValue(*snamev_p);
3033  }
3034 
3035  denc = NULL;
3036  didx = rb_to_encoding_index(*dnamev_p);
3037  if (0 <= didx) {
3038  denc = rb_enc_from_index(didx);
3039  }
3040  else {
3041  StringValue(*dnamev_p);
3042  }
3043 
3044  sname = senc ? rb_enc_name(senc) : StringValueCStr(*snamev_p);
3045  dname = denc ? rb_enc_name(denc) : StringValueCStr(*dnamev_p);
3046 
3047  *sname_p = sname;
3048  *dname_p = dname;
3049  *senc_p = senc;
3050  *denc_p = denc;
3051  *ecflags_p = ecflags;
3052  *ecopts_p = ecopts;
3053 }
3054 
3055 static int
3056 decorate_convpath(VALUE convpath, int ecflags)
3057 {
3058  int num_decorators;
3059  const char *decorators[MAX_ECFLAGS_DECORATORS];
3060  int i;
3061  int n, len;
3062 
3063  num_decorators = decorator_names(ecflags, decorators);
3064  if (num_decorators == -1)
3065  return -1;
3066 
3067  len = n = RARRAY_LENINT(convpath);
3068  if (n != 0) {
3069  VALUE pair = RARRAY_AREF(convpath, n-1);
3070  if (RB_TYPE_P(pair, T_ARRAY)) {
3071  const char *sname = rb_enc_name(rb_to_encoding(RARRAY_AREF(pair, 0)));
3072  const char *dname = rb_enc_name(rb_to_encoding(RARRAY_AREF(pair, 1)));
3073  transcoder_entry_t *entry = get_transcoder_entry(sname, dname);
3074  const rb_transcoder *tr = load_transcoder_entry(entry);
3075  if (!tr)
3076  return -1;
3077  if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
3079  n--;
3080  rb_ary_store(convpath, len + num_decorators - 1, pair);
3081  }
3082  }
3083  else {
3084  rb_ary_store(convpath, len + num_decorators - 1, pair);
3085  }
3086  }
3087 
3088  for (i = 0; i < num_decorators; i++)
3089  rb_ary_store(convpath, n + i, rb_str_new_cstr(decorators[i]));
3090 
3091  return 0;
3092 }
3093 
3094 static void
3095 search_convpath_i(const char *sname, const char *dname, int depth, void *arg)
3096 {
3097  VALUE *ary_p = arg;
3098  VALUE v;
3099 
3100  if (*ary_p == Qnil) {
3101  *ary_p = rb_ary_new();
3102  }
3103 
3104  if (DECORATOR_P(sname, dname)) {
3105  v = rb_str_new_cstr(dname);
3106  }
3107  else {
3108  v = rb_assoc_new(make_encobj(sname), make_encobj(dname));
3109  }
3110  rb_ary_store(*ary_p, depth, v);
3111 }
3112 
3113 /*
3114  * call-seq:
3115  * Encoding::Converter.search_convpath(source_encoding, destination_encoding) -> ary
3116  * Encoding::Converter.search_convpath(source_encoding, destination_encoding, opt) -> ary
3117  *
3118  * Returns a conversion path.
3119  *
3120  * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP")
3121  * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3122  * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>]]
3123  *
3124  * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", universal_newline: true)
3125  * or
3126  * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", newline: :universal)
3127  * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3128  * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
3129  * # "universal_newline"]
3130  *
3131  * p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", universal_newline: true)
3132  * or
3133  * p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", newline: :universal)
3134  * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3135  * # "universal_newline",
3136  * # [#<Encoding:UTF-8>, #<Encoding:UTF-32BE>]]
3137  */
3138 static VALUE
3139 econv_s_search_convpath(int argc, VALUE *argv, VALUE klass)
3140 {
3141  VALUE snamev, dnamev;
3142  const char *sname, *dname;
3143  rb_encoding *senc, *denc;
3144  int ecflags;
3145  VALUE ecopts;
3146  VALUE convpath;
3147 
3148  econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
3149 
3150  convpath = Qnil;
3151  transcode_search_path(sname, dname, search_convpath_i, &convpath);
3152 
3153  if (NIL_P(convpath))
3154  rb_exc_raise(rb_econv_open_exc(sname, dname, ecflags));
3155 
3156  if (decorate_convpath(convpath, ecflags) == -1) {
3157  VALUE exc = rb_econv_open_exc(sname, dname, ecflags);
3158  RB_GC_GUARD(snamev);
3159  RB_GC_GUARD(dnamev);
3160  rb_exc_raise(exc);
3161  }
3162 
3163  return convpath;
3164 }
3165 
3166 /*
3167  * Check the existence of a conversion path.
3168  * Returns the number of converters in the conversion path.
3169  * result: >=0:success -1:failure
3170  */
3171 int
3172 rb_econv_has_convpath_p(const char* from_encoding, const char* to_encoding)
3173 {
3174  VALUE convpath = Qnil;
3175  transcode_search_path(from_encoding, to_encoding, search_convpath_i,
3176  &convpath);
3177  return RTEST(convpath);
3178 }
3179 
3182  int index;
3183  int ret;
3184 };
3185 
3186 static void
3187 rb_econv_init_by_convpath_i(const char *sname, const char *dname, int depth, void *arg)
3188 {
3189  struct rb_econv_init_by_convpath_t *a = (struct rb_econv_init_by_convpath_t *)arg;
3190  int ret;
3191 
3192  if (a->ret == -1)
3193  return;
3194 
3195  ret = rb_econv_add_converter(a->ec, sname, dname, a->index);
3196 
3197  a->ret = ret;
3198  return;
3199 }
3200 
3201 static rb_econv_t *
3202 rb_econv_init_by_convpath(VALUE self, VALUE convpath,
3203  const char **sname_p, const char **dname_p,
3204  rb_encoding **senc_p, rb_encoding**denc_p)
3205 {
3206  rb_econv_t *ec;
3207  long i;
3208  int ret, first=1;
3209  VALUE elt;
3210  rb_encoding *senc = 0, *denc = 0;
3211  const char *sname, *dname;
3212 
3213  ec = rb_econv_alloc(RARRAY_LENINT(convpath));
3214  DATA_PTR(self) = ec;
3215 
3216  for (i = 0; i < RARRAY_LEN(convpath); i++) {
3217  VALUE snamev, dnamev;
3218  VALUE pair;
3219  elt = rb_ary_entry(convpath, i);
3220  if (!NIL_P(pair = rb_check_array_type(elt))) {
3221  if (RARRAY_LEN(pair) != 2)
3222  rb_raise(rb_eArgError, "not a 2-element array in convpath");
3223  snamev = rb_ary_entry(pair, 0);
3224  enc_arg(&snamev, &sname, &senc);
3225  dnamev = rb_ary_entry(pair, 1);
3226  enc_arg(&dnamev, &dname, &denc);
3227  }
3228  else {
3229  sname = "";
3230  dname = StringValueCStr(elt);
3231  }
3232  if (DECORATOR_P(sname, dname)) {
3233  ret = rb_econv_add_converter(ec, sname, dname, ec->num_trans);
3234  if (ret == -1) {
3235  VALUE msg = rb_sprintf("decoration failed: %s", dname);
3236  RB_GC_GUARD(snamev);
3237  RB_GC_GUARD(dnamev);
3239  }
3240  }
3241  else {
3242  int j = ec->num_trans;
3243  struct rb_econv_init_by_convpath_t arg;
3244  arg.ec = ec;
3245  arg.index = ec->num_trans;
3246  arg.ret = 0;
3247  ret = transcode_search_path(sname, dname, rb_econv_init_by_convpath_i, &arg);
3248  if (ret == -1 || arg.ret == -1) {
3249  VALUE msg = rb_sprintf("adding conversion failed: %s to %s", sname, dname);
3250  RB_GC_GUARD(snamev);
3251  RB_GC_GUARD(dnamev);
3253  }
3254  if (first) {
3255  first = 0;
3256  *senc_p = senc;
3257  *sname_p = ec->elems[j].tc->transcoder->src_encoding;
3258  }
3259  *denc_p = denc;
3260  *dname_p = ec->elems[ec->num_trans-1].tc->transcoder->dst_encoding;
3261  }
3262  }
3263 
3264  if (first) {
3265  *senc_p = NULL;
3266  *denc_p = NULL;
3267  *sname_p = "";
3268  *dname_p = "";
3269  }
3270 
3271  ec->source_encoding_name = *sname_p;
3272  ec->destination_encoding_name = *dname_p;
3273 
3274  return ec;
3275 }
3276 
3277 /*
3278  * call-seq:
3279  * Encoding::Converter.new(source_encoding, destination_encoding)
3280  * Encoding::Converter.new(source_encoding, destination_encoding, opt)
3281  * Encoding::Converter.new(convpath)
3282  *
3283  * possible options elements:
3284  * hash form:
3285  * :invalid => nil # raise error on invalid byte sequence (default)
3286  * :invalid => :replace # replace invalid byte sequence
3287  * :undef => nil # raise error on undefined conversion (default)
3288  * :undef => :replace # replace undefined conversion
3289  * :replace => string # replacement string ("?" or "\uFFFD" if not specified)
3290  * :newline => :universal # decorator for converting CRLF and CR to LF
3291  * :newline => :crlf # decorator for converting LF to CRLF
3292  * :newline => :cr # decorator for converting LF to CR
3293  * :universal_newline => true # decorator for converting CRLF and CR to LF
3294  * :crlf_newline => true # decorator for converting LF to CRLF
3295  * :cr_newline => true # decorator for converting LF to CR
3296  * :xml => :text # escape as XML CharData.
3297  * :xml => :attr # escape as XML AttValue
3298  * integer form:
3299  * Encoding::Converter::INVALID_REPLACE
3300  * Encoding::Converter::UNDEF_REPLACE
3301  * Encoding::Converter::UNDEF_HEX_CHARREF
3302  * Encoding::Converter::UNIVERSAL_NEWLINE_DECORATOR
3303  * Encoding::Converter::CRLF_NEWLINE_DECORATOR
3304  * Encoding::Converter::CR_NEWLINE_DECORATOR
3305  * Encoding::Converter::XML_TEXT_DECORATOR
3306  * Encoding::Converter::XML_ATTR_CONTENT_DECORATOR
3307  * Encoding::Converter::XML_ATTR_QUOTE_DECORATOR
3308  *
3309  * Encoding::Converter.new creates an instance of Encoding::Converter.
3310  *
3311  * Source_encoding and destination_encoding should be a string or
3312  * Encoding object.
3313  *
3314  * opt should be nil, a hash or an integer.
3315  *
3316  * convpath should be an array.
3317  * convpath may contain
3318  * - two-element arrays which contain encodings or encoding names, or
3319  * - strings representing decorator names.
3320  *
3321  * Encoding::Converter.new optionally takes an option.
3322  * The option should be a hash or an integer.
3323  * The option hash can contain :invalid => nil, etc.
3324  * The option integer should be logical-or of constants such as
3325  * Encoding::Converter::INVALID_REPLACE, etc.
3326  *
3327  * [:invalid => nil]
3328  * Raise error on invalid byte sequence. This is a default behavior.
3329  * [:invalid => :replace]
3330  * Replace invalid byte sequence by replacement string.
3331  * [:undef => nil]
3332  * Raise an error if a character in source_encoding is not defined in destination_encoding.
3333  * This is a default behavior.
3334  * [:undef => :replace]
3335  * Replace undefined character in destination_encoding with replacement string.
3336  * [:replace => string]
3337  * Specify the replacement string.
3338  * If not specified, "\uFFFD" is used for Unicode encodings and "?" for others.
3339  * [:universal_newline => true]
3340  * Convert CRLF and CR to LF.
3341  * [:crlf_newline => true]
3342  * Convert LF to CRLF.
3343  * [:cr_newline => true]
3344  * Convert LF to CR.
3345  * [:xml => :text]
3346  * Escape as XML CharData.
3347  * This form can be used as an HTML 4.0 #PCDATA.
3348  * - '&' -> '&amp;'
3349  * - '<' -> '&lt;'
3350  * - '>' -> '&gt;'
3351  * - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH;
3352  * [:xml => :attr]
3353  * Escape as XML AttValue.
3354  * The converted result is quoted as "...".
3355  * This form can be used as an HTML 4.0 attribute value.
3356  * - '&' -> '&amp;'
3357  * - '<' -> '&lt;'
3358  * - '>' -> '&gt;'
3359  * - '"' -> '&quot;'
3360  * - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH;
3361  *
3362  * Examples:
3363  * # UTF-16BE to UTF-8
3364  * ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
3365  *
3366  * # Usually, decorators such as newline conversion are inserted last.
3367  * ec = Encoding::Converter.new("UTF-16BE", "UTF-8", :universal_newline => true)
3368  * p ec.convpath #=> [[#<Encoding:UTF-16BE>, #<Encoding:UTF-8>],
3369  * # "universal_newline"]
3370  *
3371  * # But, if the last encoding is ASCII incompatible,
3372  * # decorators are inserted before the last conversion.
3373  * ec = Encoding::Converter.new("UTF-8", "UTF-16BE", :crlf_newline => true)
3374  * p ec.convpath #=> ["crlf_newline",
3375  * # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
3376  *
3377  * # Conversion path can be specified directly.
3378  * ec = Encoding::Converter.new(["universal_newline", ["EUC-JP", "UTF-8"], ["UTF-8", "UTF-16BE"]])
3379  * p ec.convpath #=> ["universal_newline",
3380  * # [#<Encoding:EUC-JP>, #<Encoding:UTF-8>],
3381  * # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
3382  */
3383 static VALUE
3384 econv_init(int argc, VALUE *argv, VALUE self)
3385 {
3386  VALUE ecopts;
3387  VALUE snamev, dnamev;
3388  const char *sname, *dname;
3389  rb_encoding *senc, *denc;
3390  rb_econv_t *ec;
3391  int ecflags;
3392  VALUE convpath;
3393 
3394  if (rb_check_typeddata(self, &econv_data_type)) {
3395  rb_raise(rb_eTypeError, "already initialized");
3396  }
3397 
3398  if (argc == 1 && !NIL_P(convpath = rb_check_array_type(argv[0]))) {
3399  ec = rb_econv_init_by_convpath(self, convpath, &sname, &dname, &senc, &denc);
3400  ecflags = 0;
3401  ecopts = Qnil;
3402  }
3403  else {
3404  econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
3405  ec = rb_econv_open_opts(sname, dname, ecflags, ecopts);
3406  }
3407 
3408  if (!ec) {
3409  VALUE exc = rb_econv_open_exc(sname, dname, ecflags);
3410  RB_GC_GUARD(snamev);
3411  RB_GC_GUARD(dnamev);
3412  rb_exc_raise(exc);
3413  }
3414 
3415  if (!DECORATOR_P(sname, dname)) {
3416  if (!senc)
3417  senc = make_dummy_encoding(sname);
3418  if (!denc)
3419  denc = make_dummy_encoding(dname);
3420  RB_GC_GUARD(snamev);
3421  RB_GC_GUARD(dnamev);
3422  }
3423 
3424  ec->source_encoding = senc;
3425  ec->destination_encoding = denc;
3426 
3427  DATA_PTR(self) = ec;
3428 
3429  return self;
3430 }
3431 
3432 /*
3433  * call-seq:
3434  * ec.inspect -> string
3435  *
3436  * Returns a printable version of <i>ec</i>
3437  *
3438  * ec = Encoding::Converter.new("iso-8859-1", "utf-8")
3439  * puts ec.inspect #=> #<Encoding::Converter: ISO-8859-1 to UTF-8>
3440  *
3441  */
3442 static VALUE
3443 econv_inspect(VALUE self)
3444 {
3445  const char *cname = rb_obj_classname(self);
3446  rb_econv_t *ec;
3447 
3448  TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);
3449  if (!ec)
3450  return rb_sprintf("#<%s: uninitialized>", cname);
3451  else {
3452  const char *sname = ec->source_encoding_name;
3453  const char *dname = ec->destination_encoding_name;
3454  VALUE str;
3455  str = rb_sprintf("#<%s: ", cname);
3456  econv_description(sname, dname, ec->flags, str);
3457  rb_str_cat2(str, ">");
3458  return str;
3459  }
3460 }
3461 
3462 static rb_econv_t *
3463 check_econv(VALUE self)
3464 {
3465  rb_econv_t *ec;
3466 
3467  TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);
3468  if (!ec) {
3469  rb_raise(rb_eTypeError, "uninitialized encoding converter");
3470  }
3471  return ec;
3472 }
3473 
3474 /*
3475  * call-seq:
3476  * ec.source_encoding -> encoding
3477  *
3478  * Returns the source encoding as an Encoding object.
3479  */
3480 static VALUE
3481 econv_source_encoding(VALUE self)
3482 {
3483  rb_econv_t *ec = check_econv(self);
3484  if (!ec->source_encoding)
3485  return Qnil;
3487 }
3488 
3489 /*
3490  * call-seq:
3491  * ec.destination_encoding -> encoding
3492  *
3493  * Returns the destination encoding as an Encoding object.
3494  */
3495 static VALUE
3496 econv_destination_encoding(VALUE self)
3497 {
3498  rb_econv_t *ec = check_econv(self);
3499  if (!ec->destination_encoding)
3500  return Qnil;
3502 }
3503 
3504 /*
3505  * call-seq:
3506  * ec.convpath -> ary
3507  *
3508  * Returns the conversion path of ec.
3509  *
3510  * The result is an array of conversions.
3511  *
3512  * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP", crlf_newline: true)
3513  * p ec.convpath
3514  * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3515  * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
3516  * # "crlf_newline"]
3517  *
3518  * Each element of the array is a pair of encodings or a string.
3519  * A pair means an encoding conversion.
3520  * A string means a decorator.
3521  *
3522  * In the above example, [#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>] means
3523  * a converter from ISO-8859-1 to UTF-8.
3524  * "crlf_newline" means newline converter from LF to CRLF.
3525  */
3526 static VALUE
3527 econv_convpath(VALUE self)
3528 {
3529  rb_econv_t *ec = check_econv(self);
3530  VALUE result;
3531  int i;
3532 
3533  result = rb_ary_new();
3534  for (i = 0; i < ec->num_trans; i++) {
3535  const rb_transcoder *tr = ec->elems[i].tc->transcoder;
3536  VALUE v;
3537  if (DECORATOR_P(tr->src_encoding, tr->dst_encoding))
3538  v = rb_str_new_cstr(tr->dst_encoding);
3539  else
3540  v = rb_assoc_new(make_encobj(tr->src_encoding), make_encobj(tr->dst_encoding));
3541  rb_ary_push(result, v);
3542  }
3543  return result;
3544 }
3545 
3546 /*
3547  * call-seq:
3548  * ec == other -> true or false
3549  */
3550 static VALUE
3551 econv_equal(VALUE self, VALUE other)
3552 {
3553  rb_econv_t *ec1 = check_econv(self);
3554  rb_econv_t *ec2;
3555  int i;
3556 
3557  if (!rb_typeddata_is_kind_of(other, &econv_data_type)) {
3558  return Qnil;
3559  }
3560  ec2 = DATA_PTR(other);
3561  if (!ec2) return Qfalse;
3562  if (ec1->source_encoding_name != ec2->source_encoding_name &&
3563  strcmp(ec1->source_encoding_name, ec2->source_encoding_name))
3564  return Qfalse;
3567  return Qfalse;
3568  if (ec1->flags != ec2->flags) return Qfalse;
3569  if (ec1->replacement_enc != ec2->replacement_enc &&
3570  strcmp(ec1->replacement_enc, ec2->replacement_enc))
3571  return Qfalse;
3572  if (ec1->replacement_len != ec2->replacement_len) return Qfalse;
3573  if (ec1->replacement_str != ec2->replacement_str &&
3575  return Qfalse;
3576 
3577  if (ec1->num_trans != ec2->num_trans) return Qfalse;
3578  for (i = 0; i < ec1->num_trans; i++) {
3579  if (ec1->elems[i].tc->transcoder != ec2->elems[i].tc->transcoder)
3580  return Qfalse;
3581  }
3582  return Qtrue;
3583 }
3584 
3585 static VALUE
3586 econv_result_to_symbol(rb_econv_result_t res)
3587 {
3588  switch (res) {
3589  case econv_invalid_byte_sequence: return sym_invalid_byte_sequence;
3590  case econv_incomplete_input: return sym_incomplete_input;
3591  case econv_undefined_conversion: return sym_undefined_conversion;
3592  case econv_destination_buffer_full: return sym_destination_buffer_full;
3593  case econv_source_buffer_empty: return sym_source_buffer_empty;
3594  case econv_finished: return sym_finished;
3595  case econv_after_output: return sym_after_output;
3596  default: return INT2NUM(res); /* should not be reached */
3597  }
3598 }
3599 
3600 /*
3601  * call-seq:
3602  * ec.primitive_convert(source_buffer, destination_buffer) -> symbol
3603  * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset) -> symbol
3604  * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize) -> symbol
3605  * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize, opt) -> symbol
3606  *
3607  * possible opt elements:
3608  * hash form:
3609  * :partial_input => true # source buffer may be part of larger source
3610  * :after_output => true # stop conversion after output before input
3611  * integer form:
3612  * Encoding::Converter::PARTIAL_INPUT
3613  * Encoding::Converter::AFTER_OUTPUT
3614  *
3615  * possible results:
3616  * :invalid_byte_sequence
3617  * :incomplete_input
3618  * :undefined_conversion
3619  * :after_output
3620  * :destination_buffer_full
3621  * :source_buffer_empty
3622  * :finished
3623  *
3624  * primitive_convert converts source_buffer into destination_buffer.
3625  *
3626  * source_buffer should be a string or nil.
3627  * nil means an empty string.
3628  *
3629  * destination_buffer should be a string.
3630  *
3631  * destination_byteoffset should be an integer or nil.
3632  * nil means the end of destination_buffer.
3633  * If it is omitted, nil is assumed.
3634  *
3635  * destination_bytesize should be an integer or nil.
3636  * nil means unlimited.
3637  * If it is omitted, nil is assumed.
3638  *
3639  * opt should be nil, a hash or an integer.
3640  * nil means no flags.
3641  * If it is omitted, nil is assumed.
3642  *
3643  * primitive_convert converts the content of source_buffer from beginning
3644  * and store the result into destination_buffer.
3645  *
3646  * destination_byteoffset and destination_bytesize specify the region which
3647  * the converted result is stored.
3648  * destination_byteoffset specifies the start position in destination_buffer in bytes.
3649  * If destination_byteoffset is nil,
3650  * destination_buffer.bytesize is used for appending the result.
3651  * destination_bytesize specifies maximum number of bytes.
3652  * If destination_bytesize is nil,
3653  * destination size is unlimited.
3654  * After conversion, destination_buffer is resized to
3655  * destination_byteoffset + actually produced number of bytes.
3656  * Also destination_buffer's encoding is set to destination_encoding.
3657  *
3658  * primitive_convert drops the converted part of source_buffer.
3659  * the dropped part is converted in destination_buffer or
3660  * buffered in Encoding::Converter object.
3661  *
3662  * primitive_convert stops conversion when one of the following conditions is met.
3663  * - invalid byte sequence found in source buffer (:invalid_byte_sequence)
3664  * +primitive_errinfo+ and +last_error+ methods return the details of the error.
3665  * - unexpected end of source buffer (:incomplete_input)
3666  * this occurs only when :partial_input is not specified.
3667  * +primitive_errinfo+ and +last_error+ methods return the details of the error.
3668  * - character not representable in output encoding (:undefined_conversion)
3669  * +primitive_errinfo+ and +last_error+ methods return the details of the error.
3670  * - after some output is generated, before input is done (:after_output)
3671  * this occurs only when :after_output is specified.
3672  * - destination buffer is full (:destination_buffer_full)
3673  * this occurs only when destination_bytesize is non-nil.
3674  * - source buffer is empty (:source_buffer_empty)
3675  * this occurs only when :partial_input is specified.
3676  * - conversion is finished (:finished)
3677  *
3678  * example:
3679  * ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
3680  * ret = ec.primitive_convert(src="pi", dst="", nil, 100)
3681  * p [ret, src, dst] #=> [:finished, "", "\x00p\x00i"]
3682  *
3683  * ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
3684  * ret = ec.primitive_convert(src="pi", dst="", nil, 1)
3685  * p [ret, src, dst] #=> [:destination_buffer_full, "i", "\x00"]
3686  * ret = ec.primitive_convert(src, dst="", nil, 1)
3687  * p [ret, src, dst] #=> [:destination_buffer_full, "", "p"]
3688  * ret = ec.primitive_convert(src, dst="", nil, 1)
3689  * p [ret, src, dst] #=> [:destination_buffer_full, "", "\x00"]
3690  * ret = ec.primitive_convert(src, dst="", nil, 1)
3691  * p [ret, src, dst] #=> [:finished, "", "i"]
3692  *
3693  */
3694 static VALUE
3695 econv_primitive_convert(int argc, VALUE *argv, VALUE self)
3696 {
3697  VALUE input, output, output_byteoffset_v, output_bytesize_v, opt, flags_v;
3698  rb_econv_t *ec = check_econv(self);
3699  rb_econv_result_t res;
3700  const unsigned char *ip, *is;
3701  unsigned char *op, *os;
3702  long output_byteoffset, output_bytesize;
3703  unsigned long output_byteend;
3704  int flags;
3705 
3706  argc = rb_scan_args(argc, argv, "23:", &input, &output, &output_byteoffset_v, &output_bytesize_v, &flags_v, &opt);
3707 
3708  if (NIL_P(output_byteoffset_v))
3709  output_byteoffset = 0; /* dummy */
3710  else
3711  output_byteoffset = NUM2LONG(output_byteoffset_v);
3712 
3713  if (NIL_P(output_bytesize_v))
3714  output_bytesize = 0; /* dummy */
3715  else
3716  output_bytesize = NUM2LONG(output_bytesize_v);
3717 
3718  if (!NIL_P(flags_v)) {
3719  if (!NIL_P(opt)) {
3720  rb_error_arity(argc + 1, 2, 5);
3721  }
3722  flags = NUM2INT(rb_to_int(flags_v));
3723  }
3724  else if (!NIL_P(opt)) {
3725  VALUE v;
3726  flags = 0;
3727  v = rb_hash_aref(opt, sym_partial_input);
3728  if (RTEST(v))
3729  flags |= ECONV_PARTIAL_INPUT;
3730  v = rb_hash_aref(opt, sym_after_output);
3731  if (RTEST(v))
3732  flags |= ECONV_AFTER_OUTPUT;
3733  }
3734  else {
3735  flags = 0;
3736  }
3737 
3738  StringValue(output);
3739  if (!NIL_P(input))
3740  StringValue(input);
3741  rb_str_modify(output);
3742 
3743  if (NIL_P(output_bytesize_v)) {
3744  output_bytesize = RSTRING_EMBED_LEN_MAX;
3745  if (!NIL_P(input) && output_bytesize < RSTRING_LEN(input))
3746  output_bytesize = RSTRING_LEN(input);
3747  }
3748 
3749  retry:
3750 
3751  if (NIL_P(output_byteoffset_v))
3752  output_byteoffset = RSTRING_LEN(output);
3753 
3754  if (output_byteoffset < 0)
3755  rb_raise(rb_eArgError, "negative output_byteoffset");
3756 
3757  if (RSTRING_LEN(output) < output_byteoffset)
3758  rb_raise(rb_eArgError, "output_byteoffset too big");
3759 
3760  if (output_bytesize < 0)
3761  rb_raise(rb_eArgError, "negative output_bytesize");
3762 
3763  output_byteend = (unsigned long)output_byteoffset +
3764  (unsigned long)output_bytesize;
3765 
3766  if (output_byteend < (unsigned long)output_byteoffset ||
3767  LONG_MAX < output_byteend)
3768  rb_raise(rb_eArgError, "output_byteoffset+output_bytesize too big");
3769 
3770  if (rb_str_capacity(output) < output_byteend)
3771  rb_str_resize(output, output_byteend);
3772 
3773  if (NIL_P(input)) {
3774  ip = is = NULL;
3775  }
3776  else {
3777  ip = (const unsigned char *)RSTRING_PTR(input);
3778  is = ip + RSTRING_LEN(input);
3779  }
3780 
3781  op = (unsigned char *)RSTRING_PTR(output) + output_byteoffset;
3782  os = op + output_bytesize;
3783 
3784  res = rb_econv_convert(ec, &ip, is, &op, os, flags);
3785  rb_str_set_len(output, op-(unsigned char *)RSTRING_PTR(output));
3786  if (!NIL_P(input)) {
3787  OBJ_INFECT_RAW(output, input);
3788  rb_str_drop_bytes(input, ip - (unsigned char *)RSTRING_PTR(input));
3789  }
3790 
3791  if (NIL_P(output_bytesize_v) && res == econv_destination_buffer_full) {
3792  if (LONG_MAX / 2 < output_bytesize)
3793  rb_raise(rb_eArgError, "too long conversion result");
3794  output_bytesize *= 2;
3795  output_byteoffset_v = Qnil;
3796  goto retry;
3797  }
3798 
3799  if (ec->destination_encoding) {
3801  }
3802 
3803  return econv_result_to_symbol(res);
3804 }
3805 
3806 /*
3807  * call-seq:
3808  * ec.convert(source_string) -> destination_string
3809  *
3810  * Convert source_string and return destination_string.
3811  *
3812  * source_string is assumed as a part of source.
3813  * i.e. :partial_input=>true is specified internally.
3814  * finish method should be used last.
3815  *
3816  * ec = Encoding::Converter.new("utf-8", "euc-jp")
3817  * puts ec.convert("\u3042").dump #=> "\xA4\xA2"
3818  * puts ec.finish.dump #=> ""
3819  *
3820  * ec = Encoding::Converter.new("euc-jp", "utf-8")
3821  * puts ec.convert("\xA4").dump #=> ""
3822  * puts ec.convert("\xA2").dump #=> "\xE3\x81\x82"
3823  * puts ec.finish.dump #=> ""
3824  *
3825  * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
3826  * puts ec.convert("\xE3").dump #=> "".force_encoding("ISO-2022-JP")
3827  * puts ec.convert("\x81").dump #=> "".force_encoding("ISO-2022-JP")
3828  * puts ec.convert("\x82").dump #=> "\e$B$\"".force_encoding("ISO-2022-JP")
3829  * puts ec.finish.dump #=> "\e(B".force_encoding("ISO-2022-JP")
3830  *
3831  * If a conversion error occurs,
3832  * Encoding::UndefinedConversionError or
3833  * Encoding::InvalidByteSequenceError is raised.
3834  * Encoding::Converter#convert doesn't supply methods to recover or restart
3835  * from these exceptions.
3836  * When you want to handle these conversion errors,
3837  * use Encoding::Converter#primitive_convert.
3838  *
3839  */
3840 static VALUE
3841 econv_convert(VALUE self, VALUE source_string)
3842 {
3843  VALUE ret, dst;
3844  VALUE av[5];
3845  int ac;
3846  rb_econv_t *ec = check_econv(self);
3847 
3848  StringValue(source_string);
3849 
3850  dst = rb_str_new(NULL, 0);
3851 
3852  av[0] = rb_str_dup(source_string);
3853  av[1] = dst;
3854  av[2] = Qnil;
3855  av[3] = Qnil;
3856  av[4] = INT2NUM(ECONV_PARTIAL_INPUT);
3857  ac = 5;
3858 
3859  ret = econv_primitive_convert(ac, av, self);
3860 
3861  if (ret == sym_invalid_byte_sequence ||
3862  ret == sym_undefined_conversion ||
3863  ret == sym_incomplete_input) {
3864  VALUE exc = make_econv_exception(ec);
3865  rb_exc_raise(exc);
3866  }
3867 
3868  if (ret == sym_finished) {
3869  rb_raise(rb_eArgError, "converter already finished");
3870  }
3871 
3872  if (ret != sym_source_buffer_empty) {
3873  rb_bug("unexpected result of econv_primitive_convert");
3874  }
3875 
3876  return dst;
3877 }
3878 
3879 /*
3880  * call-seq:
3881  * ec.finish -> string
3882  *
3883  * Finishes the converter.
3884  * It returns the last part of the converted string.
3885  *
3886  * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
3887  * p ec.convert("\u3042") #=> "\e$B$\""
3888  * p ec.finish #=> "\e(B"
3889  */
3890 static VALUE
3891 econv_finish(VALUE self)
3892 {
3893  VALUE ret, dst;
3894  VALUE av[5];
3895  int ac;
3896  rb_econv_t *ec = check_econv(self);
3897 
3898  dst = rb_str_new(NULL, 0);
3899 
3900  av[0] = Qnil;
3901  av[1] = dst;
3902  av[2] = Qnil;
3903  av[3] = Qnil;
3904  av[4] = INT2FIX(0);
3905  ac = 5;
3906 
3907  ret = econv_primitive_convert(ac, av, self);
3908 
3909  if (ret == sym_invalid_byte_sequence ||
3910  ret == sym_undefined_conversion ||
3911  ret == sym_incomplete_input) {
3912  VALUE exc = make_econv_exception(ec);
3913  rb_exc_raise(exc);
3914  }
3915 
3916  if (ret != sym_finished) {
3917  rb_bug("unexpected result of econv_primitive_convert");
3918  }
3919 
3920  return dst;
3921 }
3922 
3923 /*
3924  * call-seq:
3925  * ec.primitive_errinfo -> array
3926  *
3927  * primitive_errinfo returns important information regarding the last error
3928  * as a 5-element array:
3929  *
3930  * [result, enc1, enc2, error_bytes, readagain_bytes]
3931  *
3932  * result is the last result of primitive_convert.
3933  *
3934  * Other elements are only meaningful when result is
3935  * :invalid_byte_sequence, :incomplete_input or :undefined_conversion.
3936  *
3937  * enc1 and enc2 indicate a conversion step as a pair of strings.
3938  * For example, a converter from EUC-JP to ISO-8859-1 converts
3939  * a string as follows: EUC-JP -> UTF-8 -> ISO-8859-1.
3940  * So [enc1, enc2] is either ["EUC-JP", "UTF-8"] or ["UTF-8", "ISO-8859-1"].
3941  *
3942  * error_bytes and readagain_bytes indicate the byte sequences which caused the error.
3943  * error_bytes is discarded portion.
3944  * readagain_bytes is buffered portion which is read again on next conversion.
3945  *
3946  * Example:
3947  *
3948  * # \xff is invalid as EUC-JP.
3949  * ec = Encoding::Converter.new("EUC-JP", "Shift_JIS")
3950  * ec.primitive_convert(src="\xff", dst="", nil, 10)
3951  * p ec.primitive_errinfo
3952  * #=> [:invalid_byte_sequence, "EUC-JP", "UTF-8", "\xFF", ""]
3953  *
3954  * # HIRAGANA LETTER A (\xa4\xa2 in EUC-JP) is not representable in ISO-8859-1.
3955  * # Since this error occurs in the UTF-8 to ISO-8859-1 conversion,
3956  * # error_bytes is HIRAGANA LETTER A in UTF-8 (\xE3\x81\x82).
3957  * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
3958  * ec.primitive_convert(src="\xa4\xa2", dst="", nil, 10)
3959  * p ec.primitive_errinfo
3960  * #=> [:undefined_conversion, "UTF-8", "ISO-8859-1", "\xE3\x81\x82", ""]
3961  *
3962  * # partial character is invalid
3963  * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
3964  * ec.primitive_convert(src="\xa4", dst="", nil, 10)
3965  * p ec.primitive_errinfo
3966  * #=> [:incomplete_input, "EUC-JP", "UTF-8", "\xA4", ""]
3967  *
3968  * # Encoding::Converter::PARTIAL_INPUT prevents invalid errors by
3969  * # partial characters.
3970  * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
3971  * ec.primitive_convert(src="\xa4", dst="", nil, 10, Encoding::Converter::PARTIAL_INPUT)
3972  * p ec.primitive_errinfo
3973  * #=> [:source_buffer_empty, nil, nil, nil, nil]
3974  *
3975  * # \xd8\x00\x00@ is invalid as UTF-16BE because
3976  * # no low surrogate after high surrogate (\xd8\x00).
3977  * # It is detected by 3rd byte (\00) which is part of next character.
3978  * # So the high surrogate (\xd8\x00) is discarded and
3979  * # the 3rd byte is read again later.
3980  * # Since the byte is buffered in ec, it is dropped from src.
3981  * ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
3982  * ec.primitive_convert(src="\xd8\x00\x00@", dst="", nil, 10)
3983  * p ec.primitive_errinfo
3984  * #=> [:invalid_byte_sequence, "UTF-16BE", "UTF-8", "\xD8\x00", "\x00"]
3985  * p src
3986  * #=> "@"
3987  *
3988  * # Similar to UTF-16BE, \x00\xd8@\x00 is invalid as UTF-16LE.
3989  * # The problem is detected by 4th byte.
3990  * ec = Encoding::Converter.new("UTF-16LE", "UTF-8")
3991  * ec.primitive_convert(src="\x00\xd8@\x00", dst="", nil, 10)
3992  * p ec.primitive_errinfo
3993  * #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "@\x00"]
3994  * p src
3995  * #=> ""
3996  *
3997  */
3998 static VALUE
3999 econv_primitive_errinfo(VALUE self)
4000 {
4001  rb_econv_t *ec = check_econv(self);
4002 
4003  VALUE ary;
4004 
4005  ary = rb_ary_new2(5);
4006 
4007  rb_ary_store(ary, 0, econv_result_to_symbol(ec->last_error.result));
4008  rb_ary_store(ary, 4, Qnil);
4009 
4010  if (ec->last_error.source_encoding)
4012 
4015 
4016  if (ec->last_error.error_bytes_start) {
4019  }
4020 
4021  return ary;
4022 }
4023 
4024 /*
4025  * call-seq:
4026  * ec.insert_output(string) -> nil
4027  *
4028  * Inserts string into the encoding converter.
4029  * The string will be converted to the destination encoding and
4030  * output on later conversions.
4031  *
4032  * If the destination encoding is stateful,
4033  * string is converted according to the state and the state is updated.
4034  *
4035  * This method should be used only when a conversion error occurs.
4036  *
4037  * ec = Encoding::Converter.new("utf-8", "iso-8859-1")
4038  * src = "HIRAGANA LETTER A is \u{3042}."
4039  * dst = ""
4040  * p ec.primitive_convert(src, dst) #=> :undefined_conversion
4041  * puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is ", "."]
4042  * ec.insert_output("<err>")
4043  * p ec.primitive_convert(src, dst) #=> :finished
4044  * puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is <err>.", ""]
4045  *
4046  * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
4047  * src = "\u{306F 3041 3068 2661 3002}" # U+2661 is not representable in iso-2022-jp
4048  * dst = ""
4049  * p ec.primitive_convert(src, dst) #=> :undefined_conversion
4050  * puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H".force_encoding("ISO-2022-JP"), "\xE3\x80\x82"]
4051  * ec.insert_output "?" # state change required to output "?".
4052  * p ec.primitive_convert(src, dst) #=> :finished
4053  * puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H\e(B?\e$B!#\e(B".force_encoding("ISO-2022-JP"), ""]
4054  *
4055  */
4056 static VALUE
4057 econv_insert_output(VALUE self, VALUE string)
4058 {
4059  const char *insert_enc;
4060 
4061  int ret;
4062 
4063  rb_econv_t *ec = check_econv(self);
4064 
4065  StringValue(string);
4066  insert_enc = rb_econv_encoding_to_insert_output(ec);
4067  string = rb_str_encode(string, rb_enc_from_encoding(rb_enc_find(insert_enc)), 0, Qnil);
4068 
4069  ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(string), RSTRING_LEN(string), insert_enc);
4070  if (ret == -1) {
4071  rb_raise(rb_eArgError, "too big string");
4072  }
4073 
4074  return Qnil;
4075 }
4076 
4077 /*
4078  * call-seq:
4079  * ec.putback -> string
4080  * ec.putback(max_numbytes) -> string
4081  *
4082  * Put back the bytes which will be converted.
4083  *
4084  * The bytes are caused by invalid_byte_sequence error.
4085  * When invalid_byte_sequence error, some bytes are discarded and
4086  * some bytes are buffered to be converted later.
4087  * The latter bytes can be put back.
4088  * It can be observed by
4089  * Encoding::InvalidByteSequenceError#readagain_bytes and
4090  * Encoding::Converter#primitive_errinfo.
4091  *
4092  * ec = Encoding::Converter.new("utf-16le", "iso-8859-1")
4093  * src = "\x00\xd8\x61\x00"
4094  * dst = ""
4095  * p ec.primitive_convert(src, dst) #=> :invalid_byte_sequence
4096  * p ec.primitive_errinfo #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "a\x00"]
4097  * p ec.putback #=> "a\x00"
4098  * p ec.putback #=> "" # no more bytes to put back
4099  *
4100  */
4101 static VALUE
4102 econv_putback(int argc, VALUE *argv, VALUE self)
4103 {
4104  rb_econv_t *ec = check_econv(self);
4105  int n;
4106  int putbackable;
4107  VALUE str, max;
4108 
4109  rb_scan_args(argc, argv, "01", &max);
4110 
4111  if (NIL_P(max))
4112  n = rb_econv_putbackable(ec);
4113  else {
4114  n = NUM2INT(max);
4115  putbackable = rb_econv_putbackable(ec);
4116  if (putbackable < n)
4117  n = putbackable;
4118  }
4119 
4120  str = rb_str_new(NULL, n);
4121  rb_econv_putback(ec, (unsigned char *)RSTRING_PTR(str), n);
4122 
4123  if (ec->source_encoding) {
4125  }
4126 
4127  return str;
4128 }
4129 
4130 /*
4131  * call-seq:
4132  * ec.last_error -> exception or nil
4133  *
4134  * Returns an exception object for the last conversion.
4135  * Returns nil if the last conversion did not produce an error.
4136  *
4137  * "error" means that
4138  * Encoding::InvalidByteSequenceError and Encoding::UndefinedConversionError for
4139  * Encoding::Converter#convert and
4140  * :invalid_byte_sequence, :incomplete_input and :undefined_conversion for
4141  * Encoding::Converter#primitive_convert.
4142  *
4143  * ec = Encoding::Converter.new("utf-8", "iso-8859-1")
4144  * p ec.primitive_convert(src="\xf1abcd", dst="") #=> :invalid_byte_sequence
4145  * p ec.last_error #=> #<Encoding::InvalidByteSequenceError: "\xF1" followed by "a" on UTF-8>
4146  * p ec.primitive_convert(src, dst, nil, 1) #=> :destination_buffer_full
4147  * p ec.last_error #=> nil
4148  *
4149  */
4150 static VALUE
4151 econv_last_error(VALUE self)
4152 {
4153  rb_econv_t *ec = check_econv(self);
4154  VALUE exc;
4155 
4156  exc = make_econv_exception(ec);
4157  if (NIL_P(exc))
4158  return Qnil;
4159  return exc;
4160 }
4161 
4162 /*
4163  * call-seq:
4164  * ec.replacement -> string
4165  *
4166  * Returns the replacement string.
4167  *
4168  * ec = Encoding::Converter.new("euc-jp", "us-ascii")
4169  * p ec.replacement #=> "?"
4170  *
4171  * ec = Encoding::Converter.new("euc-jp", "utf-8")
4172  * p ec.replacement #=> "\uFFFD"
4173  */
4174 static VALUE
4175 econv_get_replacement(VALUE self)
4176 {
4177  rb_econv_t *ec = check_econv(self);
4178  int ret;
4179  rb_encoding *enc;
4180 
4181  ret = make_replacement(ec);
4182  if (ret == -1) {
4183  rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
4184  }
4185 
4186  enc = rb_enc_find(ec->replacement_enc);
4187  return rb_enc_str_new((const char *)ec->replacement_str, (long)ec->replacement_len, enc);
4188 }
4189 
4190 /*
4191  * call-seq:
4192  * ec.replacement = string
4193  *
4194  * Sets the replacement string.
4195  *
4196  * ec = Encoding::Converter.new("utf-8", "us-ascii", :undef => :replace)
4197  * ec.replacement = "<undef>"
4198  * p ec.convert("a \u3042 b") #=> "a <undef> b"
4199  */
4200 static VALUE
4201 econv_set_replacement(VALUE self, VALUE arg)
4202 {
4203  rb_econv_t *ec = check_econv(self);
4204  VALUE string = arg;
4205  int ret;
4206  rb_encoding *enc;
4207 
4208  StringValue(string);
4209  enc = rb_enc_get(string);
4210 
4211  ret = rb_econv_set_replacement(ec,
4212  (const unsigned char *)RSTRING_PTR(string),
4213  RSTRING_LEN(string),
4214  rb_enc_name(enc));
4215 
4216  if (ret == -1) {
4217  /* xxx: rb_eInvalidByteSequenceError? */
4218  rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
4219  }
4220 
4221  return arg;
4222 }
4223 
4224 VALUE
4226 {
4227  return make_econv_exception(ec);
4228 }
4229 
4230 void
4232 {
4233  VALUE exc;
4234 
4235  exc = make_econv_exception(ec);
4236  if (NIL_P(exc))
4237  return;
4238  rb_exc_raise(exc);
4239 }
4240 
4241 /*
4242  * call-seq:
4243  * ecerr.source_encoding_name -> string
4244  *
4245  * Returns the source encoding name as a string.
4246  */
4247 static VALUE
4248 ecerr_source_encoding_name(VALUE self)
4249 {
4250  return rb_attr_get(self, rb_intern("source_encoding_name"));
4251 }
4252 
4253 /*
4254  * call-seq:
4255  * ecerr.source_encoding -> encoding
4256  *
4257  * Returns the source encoding as an encoding object.
4258  *
4259  * Note that the result may not be equal to the source encoding of
4260  * the encoding converter if the conversion has multiple steps.
4261  *
4262  * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP") # ISO-8859-1 -> UTF-8 -> EUC-JP
4263  * begin
4264  * ec.convert("\xa0") # NO-BREAK SPACE, which is available in UTF-8 but not in EUC-JP.
4265  * rescue Encoding::UndefinedConversionError
4266  * p $!.source_encoding #=> #<Encoding:UTF-8>
4267  * p $!.destination_encoding #=> #<Encoding:EUC-JP>
4268  * p $!.source_encoding_name #=> "UTF-8"
4269  * p $!.destination_encoding_name #=> "EUC-JP"
4270  * end
4271  *
4272  */
4273 static VALUE
4274 ecerr_source_encoding(VALUE self)
4275 {
4276  return rb_attr_get(self, rb_intern("source_encoding"));
4277 }
4278 
4279 /*
4280  * call-seq:
4281  * ecerr.destination_encoding_name -> string
4282  *
4283  * Returns the destination encoding name as a string.
4284  */
4285 static VALUE
4286 ecerr_destination_encoding_name(VALUE self)
4287 {
4288  return rb_attr_get(self, rb_intern("destination_encoding_name"));
4289 }
4290 
4291 /*
4292  * call-seq:
4293  * ecerr.destination_encoding -> encoding
4294  *
4295  * Returns the destination encoding as an encoding object.
4296  */
4297 static VALUE
4298 ecerr_destination_encoding(VALUE self)
4299 {
4300  return rb_attr_get(self, rb_intern("destination_encoding"));
4301 }
4302 
4303 /*
4304  * call-seq:
4305  * ecerr.error_char -> string
4306  *
4307  * Returns the one-character string which cause Encoding::UndefinedConversionError.
4308  *
4309  * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP")
4310  * begin
4311  * ec.convert("\xa0")
4312  * rescue Encoding::UndefinedConversionError
4313  * puts $!.error_char.dump #=> "\xC2\xA0"
4314  * p $!.error_char.encoding #=> #<Encoding:UTF-8>
4315  * end
4316  *
4317  */
4318 static VALUE
4319 ecerr_error_char(VALUE self)
4320 {
4321  return rb_attr_get(self, rb_intern("error_char"));
4322 }
4323 
4324 /*
4325  * call-seq:
4326  * ecerr.error_bytes -> string
4327  *
4328  * Returns the discarded bytes when Encoding::InvalidByteSequenceError occurs.
4329  *
4330  * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4331  * begin
4332  * ec.convert("abc\xA1\xFFdef")
4333  * rescue Encoding::InvalidByteSequenceError
4334  * p $! #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "\xFF" on EUC-JP>
4335  * puts $!.error_bytes.dump #=> "\xA1"
4336  * puts $!.readagain_bytes.dump #=> "\xFF"
4337  * end
4338  */
4339 static VALUE
4340 ecerr_error_bytes(VALUE self)
4341 {
4342  return rb_attr_get(self, rb_intern("error_bytes"));
4343 }
4344 
4345 /*
4346  * call-seq:
4347  * ecerr.readagain_bytes -> string
4348  *
4349  * Returns the bytes to be read again when Encoding::InvalidByteSequenceError occurs.
4350  */
4351 static VALUE
4352 ecerr_readagain_bytes(VALUE self)
4353 {
4354  return rb_attr_get(self, rb_intern("readagain_bytes"));
4355 }
4356 
4357 /*
4358  * call-seq:
4359  * ecerr.incomplete_input? -> true or false
4360  *
4361  * Returns true if the invalid byte sequence error is caused by
4362  * premature end of string.
4363  *
4364  * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4365  *
4366  * begin
4367  * ec.convert("abc\xA1z")
4368  * rescue Encoding::InvalidByteSequenceError
4369  * p $! #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "z" on EUC-JP>
4370  * p $!.incomplete_input? #=> false
4371  * end
4372  *
4373  * begin
4374  * ec.convert("abc\xA1")
4375  * ec.finish
4376  * rescue Encoding::InvalidByteSequenceError
4377  * p $! #=> #<Encoding::InvalidByteSequenceError: incomplete "\xA1" on EUC-JP>
4378  * p $!.incomplete_input? #=> true
4379  * end
4380  */
4381 static VALUE
4382 ecerr_incomplete_input(VALUE self)
4383 {
4384  return rb_attr_get(self, rb_intern("incomplete_input"));
4385 }
4386 
4387 /*
4388  * Document-class: Encoding::UndefinedConversionError
4389  *
4390  * Raised by Encoding and String methods when a transcoding operation
4391  * fails.
4392  */
4393 
4394 /*
4395  * Document-class: Encoding::InvalidByteSequenceError
4396  *
4397  * Raised by Encoding and String methods when the string being
4398  * transcoded contains a byte invalid for either the source or
4399  * target encoding.
4400  */
4401 
4402 /*
4403  * Document-class: Encoding::ConverterNotFoundError
4404  *
4405  * Raised by transcoding methods when a named encoding does not
4406  * correspond with a known converter.
4407  */
4408 
4409 #undef rb_intern
4410 void
4412 {
4413  transcoder_table = st_init_strcasetable();
4414 
4415  sym_invalid = ID2SYM(rb_intern("invalid"));
4416  sym_undef = ID2SYM(rb_intern("undef"));
4417  sym_replace = ID2SYM(rb_intern("replace"));
4418  sym_fallback = ID2SYM(rb_intern("fallback"));
4419  sym_aref = ID2SYM(rb_intern("[]"));
4420  sym_xml = ID2SYM(rb_intern("xml"));
4421  sym_text = ID2SYM(rb_intern("text"));
4422  sym_attr = ID2SYM(rb_intern("attr"));
4423 
4424  sym_invalid_byte_sequence = ID2SYM(rb_intern("invalid_byte_sequence"));
4425  sym_undefined_conversion = ID2SYM(rb_intern("undefined_conversion"));
4426  sym_destination_buffer_full = ID2SYM(rb_intern("destination_buffer_full"));
4427  sym_source_buffer_empty = ID2SYM(rb_intern("source_buffer_empty"));
4428  sym_finished = ID2SYM(rb_intern("finished"));
4429  sym_after_output = ID2SYM(rb_intern("after_output"));
4430  sym_incomplete_input = ID2SYM(rb_intern("incomplete_input"));
4431  sym_universal_newline = ID2SYM(rb_intern("universal_newline"));
4432  sym_crlf_newline = ID2SYM(rb_intern("crlf_newline"));
4433  sym_cr_newline = ID2SYM(rb_intern("cr_newline"));
4434  sym_partial_input = ID2SYM(rb_intern("partial_input"));
4435 
4436 #ifdef ENABLE_ECONV_NEWLINE_OPTION
4437  sym_newline = ID2SYM(rb_intern("newline"));
4438  sym_universal = ID2SYM(rb_intern("universal"));
4439  sym_crlf = ID2SYM(rb_intern("crlf"));
4440  sym_cr = ID2SYM(rb_intern("cr"));
4441  sym_lf = ID2SYM(rb_intern("lf"));
4442 #endif
4443 
4444  InitVM(transcode);
4445 }
4446 
4447 void
4449 {
4453 
4454  rb_define_method(rb_cString, "encode", str_encode, -1);
4455  rb_define_method(rb_cString, "encode!", str_encode_bang, -1);
4456 
4458  rb_define_alloc_func(rb_cEncodingConverter, econv_s_allocate);
4459  rb_define_singleton_method(rb_cEncodingConverter, "asciicompat_encoding", econv_s_asciicompat_encoding, 1);
4460  rb_define_singleton_method(rb_cEncodingConverter, "search_convpath", econv_s_search_convpath, -1);
4461  rb_define_method(rb_cEncodingConverter, "initialize", econv_init, -1);
4462  rb_define_method(rb_cEncodingConverter, "inspect", econv_inspect, 0);
4463  rb_define_method(rb_cEncodingConverter, "convpath", econv_convpath, 0);
4464  rb_define_method(rb_cEncodingConverter, "source_encoding", econv_source_encoding, 0);
4465  rb_define_method(rb_cEncodingConverter, "destination_encoding", econv_destination_encoding, 0);
4466  rb_define_method(rb_cEncodingConverter, "primitive_convert", econv_primitive_convert, -1);
4467  rb_define_method(rb_cEncodingConverter, "convert", econv_convert, 1);
4468  rb_define_method(rb_cEncodingConverter, "finish", econv_finish, 0);
4469  rb_define_method(rb_cEncodingConverter, "primitive_errinfo", econv_primitive_errinfo, 0);
4470  rb_define_method(rb_cEncodingConverter, "insert_output", econv_insert_output, 1);
4471  rb_define_method(rb_cEncodingConverter, "putback", econv_putback, -1);
4472  rb_define_method(rb_cEncodingConverter, "last_error", econv_last_error, 0);
4473  rb_define_method(rb_cEncodingConverter, "replacement", econv_get_replacement, 0);
4474  rb_define_method(rb_cEncodingConverter, "replacement=", econv_set_replacement, 1);
4475  rb_define_method(rb_cEncodingConverter, "==", econv_equal, 1);
4476 
4477  /* Document-const: INVALID_MASK
4478  *
4479  * Mask for invalid byte sequences
4480  */
4482 
4483  /* Document-const: INVALID_REPLACE
4484  *
4485  * Replace invalid byte sequences
4486  */
4488 
4489  /* Document-const: UNDEF_MASK
4490  *
4491  * Mask for a valid character in the source encoding but no related
4492  * character(s) in destination encoding.
4493  */
4495 
4496  /* Document-const: UNDEF_REPLACE
4497  *
4498  * Replace byte sequences that are undefined in the destination encoding.
4499  */
4501 
4502  /* Document-const: UNDEF_HEX_CHARREF
4503  *
4504  * Replace byte sequences that are undefined in the destination encoding
4505  * with an XML hexadecimal character reference. This is valid for XML
4506  * conversion.
4507  */
4509 
4510  /* Document-const: PARTIAL_INPUT
4511  *
4512  * Indicates the source may be part of a larger string. See
4513  * primitive_convert for an example.
4514  */
4516 
4517  /* Document-const: AFTER_OUTPUT
4518  *
4519  * Stop converting after some output is complete but before all of the
4520  * input was consumed. See primitive_convert for an example.
4521  */
4523 
4524  /* Document-const: UNIVERSAL_NEWLINE_DECORATOR
4525  *
4526  * Decorator for converting CRLF and CR to LF
4527  */
4529 
4530  /* Document-const: CRLF_NEWLINE_DECORATOR
4531  *
4532  * Decorator for converting LF to CRLF
4533  */
4535 
4536  /* Document-const: CR_NEWLINE_DECORATOR
4537  *
4538  * Decorator for converting LF to CR
4539  */
4541 
4542  /* Document-const: XML_TEXT_DECORATOR
4543  *
4544  * Escape as XML CharData
4545  */
4547 
4548  /* Document-const: XML_ATTR_CONTENT_DECORATOR
4549  *
4550  * Escape as XML AttValue
4551  */
4553 
4554  /* Document-const: XML_ATTR_QUOTE_DECORATOR
4555  *
4556  * Escape as XML AttValue
4557  */
4559 
4560  rb_define_method(rb_eUndefinedConversionError, "source_encoding_name", ecerr_source_encoding_name, 0);
4561  rb_define_method(rb_eUndefinedConversionError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
4562  rb_define_method(rb_eUndefinedConversionError, "source_encoding", ecerr_source_encoding, 0);
4563  rb_define_method(rb_eUndefinedConversionError, "destination_encoding", ecerr_destination_encoding, 0);
4564  rb_define_method(rb_eUndefinedConversionError, "error_char", ecerr_error_char, 0);
4565 
4566  rb_define_method(rb_eInvalidByteSequenceError, "source_encoding_name", ecerr_source_encoding_name, 0);
4567  rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
4568  rb_define_method(rb_eInvalidByteSequenceError, "source_encoding", ecerr_source_encoding, 0);
4569  rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding", ecerr_destination_encoding, 0);
4570  rb_define_method(rb_eInvalidByteSequenceError, "error_bytes", ecerr_error_bytes, 0);
4571  rb_define_method(rb_eInvalidByteSequenceError, "readagain_bytes", ecerr_readagain_bytes, 0);
4572  rb_define_method(rb_eInvalidByteSequenceError, "incomplete_input?", ecerr_incomplete_input, 0);
4573 
4574  Init_newline();
4575 }
RUBY_EXTERN VALUE rb_cString
Definition: ruby.h:1927
#define BL_ACTION(byte)
#define FOURbt
const char * ascii_incompat_name
Definition: transcode.c:1761
unsigned char ary[8]
Definition: transcode.c:65
int rb_econv_prepare_opts(VALUE opthash, VALUE *opts)
Definition: transcode.c:2569
#define ECONV_XML_TEXT_DECORATOR
Definition: encoding.h:399
#define T_SYMBOL
Definition: ruby.h:508
Definition: string.c:6519
#define FUNio
search_path_queue_t * queue
Definition: transcode.c:248
int rb_enc_get_index(VALUE obj)
Definition: encoding.c:773
void rb_econv_check_error(rb_econv_t *ec)
Definition: transcode.c:4231
VALUE next_info
Definition: transcode.c:58
RUBY_EXTERN VALUE rb_cData
Definition: ruby.h:1902
#define MBCLEN_CHARFOUND_P(ret)
Definition: encoding.h:185
#define NOMAP
VALUE rb_eConverterNotFoundError
Definition: transcode.c:21
void rb_bug(const char *fmt,...)
Definition: error.c:521
rb_econv_result_t
Definition: encoding.h:291
VALUE rb_ary_entry(VALUE ary, long offset)
Definition: array.c:1215
#define MBCLEN_CHARFOUND_LEN(ret)
Definition: encoding.h:186
unsigned char * in_buf_end
Definition: transcode.c:123
const unsigned char * error_bytes_start
Definition: transcode.c:137
#define RARRAY_LEN(a)
Definition: ruby.h:1019
rb_econv_result_t last_result
Definition: transcode.c:106
#define rb_enc_mbc_to_codepoint(p, e, enc)
Definition: encoding.h:202
VALUE rb_econv_make_exception(rb_econv_t *ec)
Definition: transcode.c:4225
#define FALSE
Definition: nkf.h:174
#define RUBY_TYPED_FREE_IMMEDIATELY
Definition: ruby.h:1138
const char * dst_encoding
rb_econv_result_t result
Definition: transcode.c:133
long rb_str_coderange_scan_restartable(const char *, const char *, rb_encoding *, int *)
Definition: string.c:531
size_t strlen(const char *)
#define INT2NUM(x)
Definition: ruby.h:1538
struct search_path_queue_tag search_path_queue_t
#define DECORATOR_P(sname, dname)
Definition: transcode.c:152
VALUE rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
Definition: string.c:9645
Definition: st.h:79
#define GB4bt
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **input_ptr, const unsigned char *input_stop, unsigned char **output_ptr, unsigned char *output_stop, int flags)
Definition: transcode.c:1442
VALUE rb_econv_open_exc(const char *sname, const char *dname, int ecflags)
Definition: transcode.c:2033
Definition: st.h:99
VALUE rb_cEncoding
Definition: encoding.c:45
VALUE rb_econv_append(rb_econv_t *ec, const char *ss, long len, VALUE dst, int flags)
Definition: transcode.c:1809
#define NUM2INT(x)
Definition: ruby.h:684
#define ZERObt
void rb_define_singleton_method(VALUE obj, const char *name, VALUE(*func)(ANYARGS), int argc)
Defines a singleton method for obj.
Definition: class.c:1716
VALUE rb_eInvalidByteSequenceError
Definition: transcode.c:20
#define ECONV_XML_ATTR_CONTENT_DECORATOR
Definition: encoding.h:400
#define getGB4bt1(a)
#define FL_TAINT
Definition: ruby.h:1213
void rb_econv_binmode(rb_econv_t *ec)
Definition: transcode.c:1939
void rb_raise(VALUE exc, const char *fmt,...)
Definition: error.c:2284
ssize_t writebuf_len
Definition: transcode.c:70
#define InitVM(ext)
Definition: ruby.h:2164
#define st_foreach
Definition: regint.h:186
rb_encoding * source_encoding
Definition: transcode.c:144
#define Qtrue
Definition: ruby.h:437
unsigned char * out_data_start
Definition: transcode.c:103
void Init_newline(void)
#define TypedData_Wrap_Struct(klass, data_type, sval)
Definition: ruby.h:1162
#define MAX_ECFLAGS_DECORATORS
Definition: transcode.c:1022
#define ENC_CODERANGE_SET(obj, cr)
Definition: encoding.h:106
Definition: st.h:99
#define TypedData_Get_Struct(obj, type, data_type, sval)
Definition: ruby.h:1183
int rb_econv_prepare_options(VALUE opthash, VALUE *opts, int ecflags)
Definition: transcode.c:2524
#define OBJ_FREEZE(x)
Definition: ruby.h:1306
unsigned char * in_data_start
Definition: transcode.c:121
#define ECONV_ERROR_HANDLER_MASK
Definition: encoding.h:386
int rb_econv_decorate_at_last(rb_econv_t *ec, const char *decorator_name)
Definition: transcode.c:1922
rb_encoding * rb_to_encoding(VALUE enc)
Definition: encoding.c:246
#define getBT3(a)
rb_encoding * destination_encoding
Definition: transcode.c:145
#define ECONV_XML_ATTR_QUOTE_DECORATOR
Definition: encoding.h:402
struct rb_transcoding * tc
Definition: transcode.c:101
#define SUSPEND(ret, num)
VALUE rb_enc_from_encoding(rb_encoding *encoding)
Definition: encoding.c:117
#define rb_check_arity
Definition: intern.h:298
VALUE rb_econv_substr_append(rb_econv_t *ec, VALUE src, long off, long len, VALUE dst, int flags)
Definition: transcode.c:1852
VALUE rb_ary_push(VALUE ary, VALUE item)
Definition: array.c:924
VALUE rb_obj_is_method(VALUE)
Definition: proc.c:1338
#define UNDEF
struct rb_transcoding * error_tc
Definition: transcode.c:134
void rb_str_set_len(VALUE, long)
Definition: string.c:2627
#define RBASIC_SET_CLASS(obj, cls)
Definition: internal.h:1471
int rb_enc_str_coderange(VALUE)
Definition: string.c:621
VALUE rb_define_class_under(VALUE outer, const char *name, VALUE super)
Defines a class under the namespace of outer.
Definition: class.c:693
ssize_t(* func_sio)(void *, const unsigned char *, size_t, VALUE, unsigned char *, size_t)
VALUE rb_enc_associate(VALUE obj, rb_encoding *enc)
Definition: encoding.c:854
unsigned int conv_tree_start
VALUE rb_exc_new_str(VALUE etype, VALUE str)
Definition: error.c:848
#define RB_GC_GUARD(v)
Definition: ruby.h:552
void rb_define_alloc_func(VALUE, rb_alloc_func_t)
#define T_HASH
Definition: ruby.h:499
const char * lib
Definition: transcode.c:157
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Definition: transcode.c:2884
#define THREEbt
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE opthash)
Definition: transcode.c:2575
#define STR1
#define DATA_PTR(dta)
Definition: ruby.h:1106
const rb_transcoder * transcoder
Definition: transcode.c:158
#define FL_UNSET(x, f)
Definition: ruby.h:1290
#define T_ARRAY
Definition: ruby.h:498
#define st_lookup
Definition: regint.h:185
VALUE(* func_ii)(void *, VALUE)
const char * dname
Definition: transcode.c:156
int(* state_init_func)(void *)
ssize_t(* func_so)(void *, const unsigned char *, size_t, unsigned char *, size_t)
rb_encoding * rb_utf8_encoding(void)
Definition: encoding.c:1320
VALUE rb_eEncodingError
Definition: error.c:807
VALUE rb_str_tmp_new(long)
Definition: string.c:1310
VALUE(* func_si)(void *, const unsigned char *, size_t)
unsigned char * in_buf_start
Definition: transcode.c:120
const char * enc
Definition: transcode.c:243
void rb_econv_putback(rb_econv_t *ec, unsigned char *p, int n)
Definition: transcode.c:1749
#define FUNsio
#define ENC_CODERANGE_7BIT
Definition: encoding.h:100
size_t error_bytes_len
Definition: transcode.c:138
const char * rb_obj_classname(VALUE)
Definition: variable.c:459
#define rb_ary_new2
Definition: intern.h:90
#define getGB4bt2(a)
VALUE rb_eArgError
Definition: error.c:802
int rb_typeddata_is_kind_of(VALUE obj, const rb_data_type_t *data_type)
Definition: error.c:759
ssize_t(* finish_func)(void *, unsigned char *, size_t)
RUBY_SYMBOL_EXPORT_BEGIN typedef unsigned long st_data_t
Definition: st.h:22
st_table * st_init_strcasetable(void)
Definition: st.c:640
#define FUNii
st_table * visited
Definition: transcode.c:247
VALUE rb_obj_class(VALUE)
call-seq: obj.class -> class
Definition: object.c:277
#define RB_TYPE_P(obj, type)
Definition: ruby.h:527
int rb_econv_has_convpath_p(const char *from_encoding, const char *to_encoding)
Definition: transcode.c:3172
#define fail()
int rb_to_encoding_index(VALUE enc)
Definition: encoding.c:198
ssize_t readagain_len
Definition: transcode.c:63
unsigned int output_index
Definition: transcode.c:60
unsigned int input
Definition: nkf.c:4312
#define TRANSCODING_READBUF(tc)
Definition: transcode.c:82
#define ALLOC_N(type, n)
Definition: ruby.h:1587
void Init_transcode(void)
Definition: transcode.c:4411
VALUE rb_hash_aset(VALUE hash, VALUE key, VALUE val)
Definition: hash.c:1616
unsigned char * in_data_end
Definition: transcode.c:122
Definition: transcode.c:154
int num_finished
Definition: transcode.c:128
const char * destination_encoding
Definition: transcode.c:136
#define val
int resume_position
Definition: transcode.c:56
#define ECONV_INVALID_MASK
Definition: encoding.h:387
#define RSTRING_END(str)
Definition: ruby.h:979
struct rb_econv_t rb_econv_t
Definition: encoding.h:301
#define SUSPEND_AFTER_OUTPUT(num)
#define getGB4bt3(a)
VALUE rb_str_cat2(VALUE, const char *)
#define ECONV_INVALID_REPLACE
Definition: encoding.h:388
void rb_econv_close(rb_econv_t *ec)
Definition: transcode.c:1698
VALUE rb_ary_new(void)
Definition: array.c:499
int(* state_fini_func)(void *)
#define dp(v)
Definition: vm_debug.h:21
#define ECONV_PARTIAL_INPUT
Definition: encoding.h:409
#define ECONV_AFTER_OUTPUT
Definition: encoding.h:410
#define snprintf
Definition: subst.h:6
#define NIL_P(v)
Definition: ruby.h:451
void rb_define_const(VALUE, const char *, VALUE)
Definition: variable.c:2691
void rb_ary_store(VALUE ary, long idx, VALUE val)
Definition: array.c:815
VALUE rb_econv_str_append(rb_econv_t *ec, VALUE src, VALUE dst, int flags)
Definition: transcode.c:1862
int rb_econv_insert_output(rb_econv_t *ec, const unsigned char *str, size_t len, const char *str_encoding)
Definition: transcode.c:1583
const char * sname
Definition: transcode.c:155
int argc
Definition: ruby.c:187
#define Qfalse
Definition: ruby.h:436
VALUE rb_cEncodingConverter
Definition: transcode.c:23
VALUE rb_require_safe(VALUE, int)
Definition: load.c:1045
#define ALLOCA_N(type, n)
Definition: ruby.h:1593
#define TRANSCODING_STATE(tc)
Definition: transcode.c:95
#define LONG_MAX
Definition: ruby.h:189
#define MEMCPY(p1, p2, type, n)
Definition: ruby.h:1661
ssize_t(* func_io)(void *, VALUE, const unsigned char *, size_t)
#define ENC_CODERANGE_BROKEN
Definition: encoding.h:102
VALUE rb_enc_associate_index(VALUE obj, int idx)
Definition: encoding.c:826
#define rb_ary_new4
Definition: intern.h:92
#define rb_str_new2
Definition: intern.h:835
int err
Definition: win32.c:135
rb_transcoder_asciicompat_type_t asciicompat_type
void rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib)
Definition: transcode.c:231
#define PRIdPTRDIFF
Definition: ruby.h:159
int rb_econv_decorate_at_first(rb_econv_t *ec, const char *decorator_name)
Definition: transcode.c:1905
#define ENC_CODERANGE_VALID
Definition: encoding.h:101
#define ECONV_UNDEF_MASK
Definition: encoding.h:389
#define ALLOC(type)
Definition: ruby.h:1588
#define SUSPEND_OBUF(num)
VALUE rb_str_resize(VALUE, long)
Definition: string.c:2644
void rb_register_transcoder(const rb_transcoder *tr)
Definition: transcode.c:203
union rb_transcoding::@120 readbuf
unsigned char * out_buf_start
Definition: transcode.c:102
#define getGB4bt0(a)
ssize_t recognized_len
Definition: transcode.c:62
int num_trans
Definition: transcode.c:127
#define FUNso
#define RSTRING_LEN(str)
Definition: ruby.h:971
int num_additional
Definition: transcode.c:956
#define REALLOC_N(var, type, n)
Definition: ruby.h:1591
#define TRUE
Definition: nkf.h:175
VALUE rb_obj_is_proc(VALUE)
Definition: proc.c:116
search_path_queue_t ** queue_last_ptr
Definition: transcode.c:249
VALUE rb_sprintf(const char *format,...)
Definition: sprintf.c:1452
int rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc)
Definition: encoding.c:1020
int rb_econv_putbackable(rb_econv_t *ec)
Definition: transcode.c:1738
#define rb_enc_name(enc)
Definition: encoding.h:171
unsigned char * out_buf_end
Definition: transcode.c:105
unsigned char next_byte
Definition: transcode.c:59
int rb_econv_set_replacement(rb_econv_t *ec, const unsigned char *str, size_t len, const char *encname)
Definition: transcode.c:2195
struct rb_transcoding * last_tc
Definition: transcode.c:129
#define MEMMOVE(p1, p2, type, n)
Definition: ruby.h:1662
#define STR1_BYTEINDEX(w)
VALUE rb_hash_new(void)
Definition: hash.c:424
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Definition: class.c:1908
const char * base_enc
Definition: transcode.c:250
VALUE rb_ivar_set(VALUE, ID, VALUE)
Definition: variable.c:1315
VALUE rb_check_hash_type(VALUE hash)
Definition: hash.c:722
unsigned char buf[MIME_BUF_SIZE]
Definition: nkf.c:4309
VALUE rb_assoc_new(VALUE car, VALUE cdr)
Definition: array.c:639
#define PRIsVALUE
Definition: ruby.h:135
char ary[sizeof(double) > sizeof(void *) ? sizeof(double) :sizeof(void *)]
Definition: transcode.c:78
#define ECONV_CRLF_NEWLINE_DECORATOR
Definition: encoding.h:397
const char * source_encoding
Definition: transcode.c:135
#define Qnil
Definition: ruby.h:438
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition: eval.c:615
int rb_define_dummy_encoding(const char *name)
Definition: encoding.c:466
VALUE rb_econv_substr_convert(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, int flags)
Definition: transcode.c:1868
unsigned long VALUE
Definition: ruby.h:85
union rb_transcoding::rb_transcoding_state_t state
VALUE rb_eTypeError
Definition: error.c:801
#define ECONV_NEWLINE_DECORATOR_MASK
Definition: encoding.h:393
const char * src_encoding
VALUE rb_obj_encoding(VALUE obj)
Definition: encoding.c:992
#define ECONV_UNDEF_HEX_CHARREF
Definition: encoding.h:391
#define getBT1(a)
#define rb_enc_asciicompat(enc)
Definition: encoding.h:239
VALUE rb_str_new_cstr(const char *)
Definition: string.c:771
int memcmp(const void *s1, const void *s2, size_t len)
Definition: memcmp.c:7
VALUE rb_str_dump(VALUE)
Definition: string.c:5920
VALUE rb_proc_call(VALUE, VALUE)
Definition: proc.c:872
const char * ascii_compat_name
Definition: transcode.c:1760
unsigned char * ptr
Definition: transcode.c:66
#define ECONV_CR_NEWLINE_DECORATOR
Definition: encoding.h:398
#define RARRAY_LENINT(ary)
Definition: ruby.h:1020
VALUE rb_str_dup(VALUE)
Definition: string.c:1488
struct rb_econv_t::@122 last_error
VALUE rb_hash_freeze(VALUE hash)
Definition: hash.c:77
#define FUNsi
#define INVALID
#define BL_MIN_BYTE
int rb_respond_to(VALUE, ID)
Definition: vm_method.c:1994
VALUE rb_method_call(int, const VALUE *, VALUE)
Definition: proc.c:2075
register unsigned int len
Definition: zonetab.h:51
#define StringValueCStr(v)
Definition: ruby.h:571
#define RSTRING_PTR(str)
Definition: ruby.h:975
#define ONEbt
#define rb_exc_new3
Definition: intern.h:244
#define ECONV_UNDEF_REPLACE
Definition: encoding.h:390
void rb_str_modify(VALUE)
Definition: string.c:2046
rb_encoding * rb_enc_get(VALUE obj)
Definition: encoding.c:860
int size
Definition: encoding.c:57
#define f
#define INT2FIX(i)
Definition: ruby.h:232
int rb_safe_level(void)
Definition: safe.c:35
#define RARRAY_AREF(a, i)
Definition: ruby.h:1033
unsigned char * out_data_end
Definition: transcode.c:104
#define xmalloc
Definition: defines.h:183
#define SIZE_MAX
Definition: ruby.h:276
size_t rb_econv_memsize(rb_econv_t *ec)
Definition: transcode.c:1716
rb_econv_t * rb_econv_open(const char *sname, const char *dname, int ecflags)
Definition: transcode.c:1063
#define TRANSCODING_WRITEBUF(tc)
Definition: transcode.c:86
VALUE rb_eRuntimeError
Definition: error.c:800
VALUE rb_check_array_type(VALUE ary)
Definition: array.c:651
VALUE rb_hash_aref(VALUE hash, VALUE key)
Definition: hash.c:831
void rb_error_arity(int argc, int min, int max)
VALUE rb_str_catf(VALUE str, const char *format,...)
Definition: sprintf.c:1492
#define rb_funcall3
Definition: ruby.h:1792
void rb_str_shared_replace(VALUE, VALUE)
Definition: string.c:1358
#define RTEST(v)
Definition: ruby.h:450
unsigned int next_table
Definition: transcode.c:57
size_t readagain_len
Definition: transcode.c:139
size_t rb_str_capacity(VALUE str)
Definition: string.c:675
#define st_add_direct
Definition: regint.h:187
#define getBT2(a)
#define OBJ_FROZEN(x)
Definition: ruby.h:1304
void InitVM_transcode(void)
Definition: transcode.c:4448
int num_allocated
Definition: transcode.c:126
#define BYTE_ADDR(index)
const char * destination_encoding_name
Definition: transcode.c:114
VALUE rb_enc_default_internal(void)
Definition: encoding.c:1519
const char * rb_econv_asciicompat_encoding(const char *ascii_incompat_name)
Definition: transcode.c:1782
VALUE rb_enc_str_new(const char *, long, rb_encoding *)
Definition: string.c:759
const char * rb_econv_encoding_to_insert_output(rb_econv_t *ec)
Definition: transcode.c:1499
VALUE rb_econv_str_convert(rb_econv_t *ec, VALUE src, int flags)
Definition: transcode.c:1874
#define hash_fallback
Definition: transcode.c:2242
const char * name
Definition: nkf.c:208
#define xrealloc
Definition: defines.h:186
#define ID2SYM(x)
Definition: ruby.h:383
VALUE rb_eUndefinedConversionError
Definition: transcode.c:19
int started
Definition: transcode.c:111
rb_econv_elem_t * elems
Definition: transcode.c:124
const char * replacement_enc
Definition: transcode.c:118
VALUE rb_str_new_frozen(VALUE)
Definition: string.c:1158
VALUE rb_str_drop_bytes(VALUE, long)
Definition: string.c:4473
const char * source_encoding_name
Definition: transcode.c:113
#define st_free_table
Definition: regint.h:188
size_t replacement_len
Definition: transcode.c:117
int replacement_allocated
Definition: transcode.c:125
#define BL_MAX_BYTE
struct search_path_queue_tag * next
Definition: transcode.c:242
int rb_enc_find_index(const char *name)
Definition: encoding.c:704
#define rb_check_frozen(obj)
Definition: intern.h:271
union rb_transcoding::@121 writebuf
#define getBT0(a)
void void xfree(void *)
const rb_transcoder * transcoder
Definition: transcode.c:52
#define rb_intern(str)
ssize_t writebuf_off
Definition: transcode.c:69
VALUE rb_str_buf_new(long)
Definition: string.c:1282
#define SYMBOL_P(x)
Definition: ruby.h:382
#define TWObt
#define OBJ_INFECT_RAW(x, s)
Definition: ruby.h:1301
#define NULL
Definition: _sdbm.c:102
struct rb_transcoding rb_transcoding
#define Qundef
Definition: ruby.h:439
st_index_t num_entries
Definition: st.h:86
void rb_define_method(VALUE klass, const char *name, VALUE(*func)(ANYARGS), int argc)
Definition: class.c:1515
const unsigned char * replacement_str
Definition: transcode.c:116
#define bp()
Definition: vm_debug.h:25
#define STR1_LENGTH(byte_addr)
#define encoding_equal(enc1, enc2)
Definition: transcode.c:239
#define TRANSCODING_WRITEBUF_SIZE(tc)
Definition: transcode.c:90
#define ECONV_UNIVERSAL_NEWLINE_DECORATOR
Definition: encoding.h:396
rb_encoding * rb_enc_find(const char *name)
Definition: encoding.c:728
#define NUM2LONG(x)
Definition: ruby.h:648
transcoder_entry_t ** entries
Definition: transcode.c:955
VALUE rb_to_int(VALUE)
Converts val into Integer.
Definition: object.c:3084
VALUE rb_attr_get(VALUE, ID)
Definition: variable.c:1224
void * rb_check_typeddata(VALUE obj, const rb_data_type_t *data_type)
Definition: error.c:769
char ** argv
Definition: ruby.c:188
#define StringValue(v)
Definition: ruby.h:569
rb_encoding * rb_enc_from_index(int index)
Definition: encoding.c:616
#define rb_sym2str(sym)
Definition: console.c:107
VALUE rb_str_new(const char *, long)
Definition: string.c:737