Ruby 2.5.0dev (2017-10-22 revision 60238)
string.c
Go to the documentation of this file.
1 /**********************************************************************
2 
3  string.c -
4 
5  $Author$
6  created at: Mon Aug 9 17:12:58 JST 1993
7 
8  Copyright (C) 1993-2007 Yukihiro Matsumoto
9  Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10  Copyright (C) 2000 Information-technology Promotion Agency, Japan
11 
12 **********************************************************************/
13 
14 #include "internal.h"
15 #include "ruby/re.h"
16 #include "encindex.h"
17 #include "probes.h"
18 #include "gc.h"
19 #include "ruby_assert.h"
20 #include "id.h"
21 #include "debug_counter.h"
22 
23 #define BEG(no) (regs->beg[(no)])
24 #define END(no) (regs->end[(no)])
25 
26 #include <math.h>
27 #include <ctype.h>
28 
29 #ifdef HAVE_UNISTD_H
30 #include <unistd.h>
31 #endif
32 
33 #if defined HAVE_CRYPT_R
34 # if defined HAVE_CRYPT_H
35 # include <crypt.h>
36 # endif
37 #elif !defined HAVE_CRYPT
38 # include "missing/crypt.h"
39 # define HAVE_CRYPT_R 1
40 #endif
41 
42 #define STRING_ENUMERATORS_WANTARRAY 0 /* next major */
43 
44 #undef rb_str_new
45 #undef rb_usascii_str_new
46 #undef rb_utf8_str_new
47 #undef rb_enc_str_new
48 #undef rb_str_new_cstr
49 #undef rb_tainted_str_new_cstr
50 #undef rb_usascii_str_new_cstr
51 #undef rb_utf8_str_new_cstr
52 #undef rb_enc_str_new_cstr
53 #undef rb_external_str_new_cstr
54 #undef rb_locale_str_new_cstr
55 #undef rb_str_dup_frozen
56 #undef rb_str_buf_new_cstr
57 #undef rb_str_buf_cat
58 #undef rb_str_buf_cat2
59 #undef rb_str_cat2
60 #undef rb_str_cat_cstr
61 #undef rb_fstring_cstr
62 #undef rb_fstring_enc_cstr
63 
64 static VALUE rb_str_clear(VALUE str);
65 
68 
69 /* FLAGS of RString
70  *
71  * 1: RSTRING_NOEMBED
72  * 2: STR_SHARED (== ELTS_SHARED)
73  * 2-6: RSTRING_EMBED_LEN (5 bits == 32)
74  * 6: STR_IS_SHARED_M (shared, when RSTRING_NOEMBED==1 && klass==0)
75  * 7: STR_TMPLOCK
76  * 8-9: ENC_CODERANGE (2 bits)
77  * 10-16: ENCODING (7 bits == 128)
78  * 17: RSTRING_FSTR
79  * 18: STR_NOFREE
80  * 19: STR_FAKESTR
81  */
82 
83 #define RUBY_MAX_CHAR_LEN 16
84 #define STR_IS_SHARED_M FL_USER6
85 #define STR_TMPLOCK FL_USER7
86 #define STR_NOFREE FL_USER18
87 #define STR_FAKESTR FL_USER19
88 
89 #define STR_SET_NOEMBED(str) do {\
90  FL_SET((str), STR_NOEMBED);\
91  STR_SET_EMBED_LEN((str), 0);\
92 } while (0)
93 #define STR_SET_EMBED(str) FL_UNSET((str), (STR_NOEMBED|STR_NOFREE))
94 #define STR_SET_EMBED_LEN(str, n) do { \
95  long tmp_n = (n);\
96  RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
97  RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
98 } while (0)
99 
100 #define STR_SET_LEN(str, n) do { \
101  if (STR_EMBED_P(str)) {\
102  STR_SET_EMBED_LEN((str), (n));\
103  }\
104  else {\
105  RSTRING(str)->as.heap.len = (n);\
106  }\
107 } while (0)
108 
109 #define STR_DEC_LEN(str) do {\
110  if (STR_EMBED_P(str)) {\
111  long n = RSTRING_LEN(str);\
112  n--;\
113  STR_SET_EMBED_LEN((str), n);\
114  }\
115  else {\
116  RSTRING(str)->as.heap.len--;\
117  }\
118 } while (0)
119 
120 #define TERM_LEN(str) rb_enc_mbminlen(rb_enc_get(str))
121 #define TERM_FILL(ptr, termlen) do {\
122  char *const term_fill_ptr = (ptr);\
123  const int term_fill_len = (termlen);\
124  *term_fill_ptr = '\0';\
125  if (UNLIKELY(term_fill_len > 1))\
126  memset(term_fill_ptr, 0, term_fill_len);\
127 } while (0)
128 
129 #define RESIZE_CAPA(str,capacity) do {\
130  const int termlen = TERM_LEN(str);\
131  RESIZE_CAPA_TERM(str,capacity,termlen);\
132 } while (0)
133 #define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
134  if (STR_EMBED_P(str)) {\
135  if (!STR_EMBEDDABLE_P(capacity, termlen)) {\
136  char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
137  const long tlen = RSTRING_LEN(str);\
138  memcpy(tmp, RSTRING_PTR(str), tlen);\
139  RSTRING(str)->as.heap.ptr = tmp;\
140  RSTRING(str)->as.heap.len = tlen;\
141  STR_SET_NOEMBED(str);\
142  RSTRING(str)->as.heap.aux.capa = (capacity);\
143  }\
144  }\
145  else {\
146  assert(!FL_TEST((str), STR_SHARED)); \
147  REALLOC_N(RSTRING(str)->as.heap.ptr, char, (size_t)(capacity) + (termlen));\
148  RSTRING(str)->as.heap.aux.capa = (capacity);\
149  }\
150 } while (0)
151 
152 #define STR_SET_SHARED(str, shared_str) do { \
153  if (!FL_TEST(str, STR_FAKESTR)) { \
154  RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
155  FL_SET((str), STR_SHARED); \
156  if (RBASIC_CLASS((shared_str)) == 0) /* for CoW-friendliness */ \
157  FL_SET_RAW((shared_str), STR_IS_SHARED_M); \
158  } \
159 } while (0)
160 
161 #define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
162 #define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
163 
164 #define STR_ENC_GET(str) get_encoding(str)
165 
166 #if !defined SHARABLE_MIDDLE_SUBSTRING
167 # define SHARABLE_MIDDLE_SUBSTRING 0
168 #endif
169 #if !SHARABLE_MIDDLE_SUBSTRING
170 #define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
171 #else
172 #define SHARABLE_SUBSTRING_P(beg, len, end) 1
173 #endif
174 
175 #define STR_EMBEDDABLE_P(len, termlen) \
176  ((len) <= RSTRING_EMBED_LEN_MAX + 1 - (termlen))
177 
178 static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
179 static VALUE str_new_shared(VALUE klass, VALUE str);
180 static VALUE str_new_frozen(VALUE klass, VALUE orig);
181 static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
182 static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
183 static inline void str_modifiable(VALUE str);
184 static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
185 
186 static inline void
187 str_make_independent(VALUE str)
188 {
189  long len = RSTRING_LEN(str);
190  int termlen = TERM_LEN(str);
191  str_make_independent_expand((str), len, 0L, termlen);
192 }
193 
194 /* symbols for [up|down|swap]case/capitalize options */
195 static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
196 
197 static rb_encoding *
198 get_actual_encoding(const int encidx, VALUE str)
199 {
200  const unsigned char *q;
201 
202  switch (encidx) {
203  case ENCINDEX_UTF_16:
204  if (RSTRING_LEN(str) < 2) break;
205  q = (const unsigned char *)RSTRING_PTR(str);
206  if (q[0] == 0xFE && q[1] == 0xFF) {
208  }
209  if (q[0] == 0xFF && q[1] == 0xFE) {
211  }
212  return rb_ascii8bit_encoding();
213  case ENCINDEX_UTF_32:
214  if (RSTRING_LEN(str) < 4) break;
215  q = (const unsigned char *)RSTRING_PTR(str);
216  if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF) {
218  }
219  if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF) {
221  }
222  return rb_ascii8bit_encoding();
223  }
224  return rb_enc_from_index(encidx);
225 }
226 
227 static rb_encoding *
228 get_encoding(VALUE str)
229 {
230  return get_actual_encoding(ENCODING_GET(str), str);
231 }
232 
233 static void
234 mustnot_broken(VALUE str)
235 {
236  if (is_broken_string(str)) {
237  rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
238  }
239 }
240 
241 static void
242 mustnot_wchar(VALUE str)
243 {
244  rb_encoding *enc = STR_ENC_GET(str);
245  if (rb_enc_mbminlen(enc) > 1) {
246  rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
247  }
248 }
249 
250 static int fstring_cmp(VALUE a, VALUE b);
251 
252 static VALUE register_fstring(VALUE str);
253 
255  fstring_cmp,
256  rb_str_hash,
257 };
258 
259 #define BARE_STRING_P(str) (!FL_ANY_RAW(str, FL_TAINT|FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString)
260 
261 static int
262 fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t arg, int existing)
263 {
264  VALUE *fstr = (VALUE *)arg;
265  VALUE str = (VALUE)*key;
266 
267  if (existing) {
268  /* because of lazy sweep, str may be unmarked already and swept
269  * at next time */
270 
271  if (rb_objspace_garbage_object_p(str)) {
272  *fstr = Qundef;
273  return ST_DELETE;
274  }
275 
276  *fstr = str;
277  return ST_STOP;
278  }
279  else {
280  if (FL_TEST_RAW(str, STR_FAKESTR)) {
281  str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
282  RSTRING(str)->as.heap.len,
283  ENCODING_GET(str));
284  OBJ_FREEZE_RAW(str);
285  }
286  else {
287  str = str_new_frozen(rb_cString, str);
288  if (STR_SHARED_P(str)) { /* str should not be shared */
289  /* shared substring */
290  str_make_independent(str);
291  assert(OBJ_FROZEN(str));
292  }
293  if (!BARE_STRING_P(str)) {
294  str = str_new_frozen(rb_cString, str);
295  }
296  }
297  RBASIC(str)->flags |= RSTRING_FSTR;
298 
299  *key = *value = *fstr = str;
300  return ST_CONTINUE;
301  }
302 }
303 
305 VALUE
307 {
308  VALUE fstr;
309  int bare;
310 
311  Check_Type(str, T_STRING);
312 
313  if (FL_TEST(str, RSTRING_FSTR))
314  return str;
315 
316  bare = BARE_STRING_P(str);
317  if (STR_EMBED_P(str) && !bare) {
318  OBJ_FREEZE_RAW(str);
319  return str;
320  }
321 
322  fstr = register_fstring(str);
323 
324  if (!bare) {
325  str_replace_shared_without_enc(str, fstr);
326  OBJ_FREEZE_RAW(str);
327  return str;
328  }
329  return fstr;
330 }
331 
332 static VALUE
333 register_fstring(VALUE str)
334 {
335  VALUE ret;
336  st_table *frozen_strings = rb_vm_fstring_table();
337 
338  do {
339  ret = str;
340  st_update(frozen_strings, (st_data_t)str,
341  fstr_update_callback, (st_data_t)&ret);
342  } while (ret == Qundef);
343 
344  assert(OBJ_FROZEN(ret));
346  assert(!FL_TEST_RAW(ret, FL_EXIVAR));
347  assert(!FL_TEST_RAW(ret, FL_TAINT));
348  assert(RBASIC_CLASS(ret) == rb_cString);
349  return ret;
350 }
351 
352 static VALUE
353 setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
354 {
356  /* SHARED to be allocated by the callback */
357 
358  ENCODING_SET_INLINED((VALUE)fake_str, encidx);
359 
361  fake_str->as.heap.len = len;
362  fake_str->as.heap.ptr = (char *)name;
363  fake_str->as.heap.aux.capa = len;
364  return (VALUE)fake_str;
365 }
366 
367 VALUE
368 rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
369 {
370  return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
371 }
372 
373 VALUE
374 rb_fstring_new(const char *ptr, long len)
375 {
376  struct RString fake_str;
377  return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII));
378 }
379 
380 VALUE
381 rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
382 {
383  struct RString fake_str;
384  return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc));
385 }
386 
387 VALUE
388 rb_fstring_cstr(const char *ptr)
389 {
390  return rb_fstring_new(ptr, strlen(ptr));
391 }
392 
393 VALUE
395 {
396  return rb_fstring_enc_new(ptr, strlen(ptr), enc);
397 }
398 
399 static int
400 fstring_set_class_i(st_data_t key, st_data_t val, st_data_t arg)
401 {
402  RBASIC_SET_CLASS((VALUE)key, (VALUE)arg);
403  return ST_CONTINUE;
404 }
405 
406 static int
407 fstring_cmp(VALUE a, VALUE b)
408 {
409  long alen, blen;
410  const char *aptr, *bptr;
411  RSTRING_GETMEM(a, aptr, alen);
412  RSTRING_GETMEM(b, bptr, blen);
413  return (alen != blen ||
414  ENCODING_GET(a) != ENCODING_GET(b) ||
415  memcmp(aptr, bptr, alen) != 0);
416 }
417 
418 static inline int
419 single_byte_optimizable(VALUE str)
420 {
421  rb_encoding *enc;
422 
423  /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
424  if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
425  return 1;
426 
427  enc = STR_ENC_GET(str);
428  if (rb_enc_mbmaxlen(enc) == 1)
429  return 1;
430 
431  /* Conservative. Possibly single byte.
432  * "\xa1" in Shift_JIS for example. */
433  return 0;
434 }
435 
437 
438 static inline const char *
439 search_nonascii(const char *p, const char *e)
440 {
441  const uintptr_t *s, *t;
442 #if SIZEOF_VOIDP == 8
443 # define NONASCII_MASK 0x8080808080808080ULL
444 #elif SIZEOF_VOIDP == 4
445 # define NONASCII_MASK 0x80808080UL
446 #endif
447 
448  if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
449 #if !UNALIGNED_WORD_ACCESS
450  if ((uintptr_t)p % SIZEOF_VOIDP) {
451  int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
452  p += l;
453  switch (l) {
454  default: UNREACHABLE;
455 #if SIZEOF_VOIDP > 4
456  case 7: if (p[-7]&0x80) return p-7;
457  case 6: if (p[-6]&0x80) return p-6;
458  case 5: if (p[-5]&0x80) return p-5;
459  case 4: if (p[-4]&0x80) return p-4;
460 #endif
461  case 3: if (p[-3]&0x80) return p-3;
462  case 2: if (p[-2]&0x80) return p-2;
463  case 1: if (p[-1]&0x80) return p-1;
464  case 0: break;
465  }
466  }
467 #endif
468  s = (const uintptr_t *)p;
469  t = (const uintptr_t *)(e - (SIZEOF_VOIDP-1));
470  for (;s < t; s++) {
471  if (*s & NONASCII_MASK) {
472 #ifdef WORDS_BIGENDIAN
473  return (const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
474 #else
475  return (const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
476 #endif
477  }
478  }
479  p = (const char *)s;
480  }
481 
482  switch (e - p) {
483  default: UNREACHABLE;
484 #if SIZEOF_VOIDP > 4
485  case 7: if (e[-7]&0x80) return e-7;
486  case 6: if (e[-6]&0x80) return e-6;
487  case 5: if (e[-5]&0x80) return e-5;
488  case 4: if (e[-4]&0x80) return e-4;
489 #endif
490  case 3: if (e[-3]&0x80) return e-3;
491  case 2: if (e[-2]&0x80) return e-2;
492  case 1: if (e[-1]&0x80) return e-1;
493  case 0: return NULL;
494  }
495 }
496 
497 static int
498 coderange_scan(const char *p, long len, rb_encoding *enc)
499 {
500  const char *e = p + len;
501 
502  if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
503  /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
504  p = search_nonascii(p, e);
506  }
507 
508  if (rb_enc_asciicompat(enc)) {
509  p = search_nonascii(p, e);
510  if (!p) return ENC_CODERANGE_7BIT;
511  for (;;) {
512  int ret = rb_enc_precise_mbclen(p, e, enc);
513  if (!MBCLEN_CHARFOUND_P(ret)) return ENC_CODERANGE_BROKEN;
514  p += MBCLEN_CHARFOUND_LEN(ret);
515  if (p == e) break;
516  p = search_nonascii(p, e);
517  if (!p) break;
518  }
519  }
520  else {
521  while (p < e) {
522  int ret = rb_enc_precise_mbclen(p, e, enc);
523  if (!MBCLEN_CHARFOUND_P(ret)) return ENC_CODERANGE_BROKEN;
524  p += MBCLEN_CHARFOUND_LEN(ret);
525  }
526  }
527  return ENC_CODERANGE_VALID;
528 }
529 
530 long
531 rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
532 {
533  const char *p = s;
534 
535  if (*cr == ENC_CODERANGE_BROKEN)
536  return e - s;
537 
538  if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
539  /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
540  if (*cr == ENC_CODERANGE_VALID) return e - s;
541  p = search_nonascii(p, e);
543  return e - s;
544  }
545  else if (rb_enc_asciicompat(enc)) {
546  p = search_nonascii(p, e);
547  if (!p) {
548  if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
549  return e - s;
550  }
551  for (;;) {
552  int ret = rb_enc_precise_mbclen(p, e, enc);
553  if (!MBCLEN_CHARFOUND_P(ret)) {
555  return p - s;
556  }
557  p += MBCLEN_CHARFOUND_LEN(ret);
558  if (p == e) break;
559  p = search_nonascii(p, e);
560  if (!p) break;
561  }
562  }
563  else {
564  while (p < e) {
565  int ret = rb_enc_precise_mbclen(p, e, enc);
566  if (!MBCLEN_CHARFOUND_P(ret)) {
568  return p - s;
569  }
570  p += MBCLEN_CHARFOUND_LEN(ret);
571  }
572  }
573  *cr = ENC_CODERANGE_VALID;
574  return e - s;
575 }
576 
577 static inline void
578 str_enc_copy(VALUE str1, VALUE str2)
579 {
580  rb_enc_set_index(str1, ENCODING_GET(str2));
581 }
582 
583 static void
584 rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
585 {
586  /* this function is designed for copying encoding and coderange
587  * from src to new string "dest" which is made from the part of src.
588  */
589  str_enc_copy(dest, src);
590  if (RSTRING_LEN(dest) == 0) {
591  if (!rb_enc_asciicompat(STR_ENC_GET(src)))
593  else
595  return;
596  }
597  switch (ENC_CODERANGE(src)) {
598  case ENC_CODERANGE_7BIT:
600  break;
601  case ENC_CODERANGE_VALID:
602  if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
603  search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
605  else
607  break;
608  default:
609  break;
610  }
611 }
612 
613 static void
614 rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
615 {
616  str_enc_copy(dest, src);
617  ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
618 }
619 
620 int
622 {
623  int cr = ENC_CODERANGE(str);
624 
625  if (cr == ENC_CODERANGE_UNKNOWN) {
626  int encidx = ENCODING_GET(str);
627  rb_encoding *enc = rb_enc_from_index(encidx);
628  if (rb_enc_mbminlen(enc) > 1 && rb_enc_dummy_p(enc)) {
630  }
631  else {
632  cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str),
633  get_actual_encoding(encidx, str));
634  }
635  ENC_CODERANGE_SET(str, cr);
636  }
637  return cr;
638 }
639 
640 int
642 {
643  rb_encoding *enc = STR_ENC_GET(str);
644 
645  if (!rb_enc_asciicompat(enc))
646  return FALSE;
647  else if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
648  return TRUE;
649  return FALSE;
650 }
651 
652 static inline void
653 str_mod_check(VALUE s, const char *p, long len)
654 {
655  if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
656  rb_raise(rb_eRuntimeError, "string modified");
657  }
658 }
659 
660 static size_t
661 str_capacity(VALUE str, const int termlen)
662 {
663  if (STR_EMBED_P(str)) {
664  return (RSTRING_EMBED_LEN_MAX + 1 - termlen);
665  }
666  else if (FL_TEST(str, STR_SHARED|STR_NOFREE)) {
667  return RSTRING(str)->as.heap.len;
668  }
669  else {
670  return RSTRING(str)->as.heap.aux.capa;
671  }
672 }
673 
674 size_t
676 {
677  return str_capacity(str, TERM_LEN(str));
678 }
679 
680 static inline void
681 must_not_null(const char *ptr)
682 {
683  if (!ptr) {
684  rb_raise(rb_eArgError, "NULL pointer given");
685  }
686 }
687 
688 static inline VALUE
689 str_alloc(VALUE klass)
690 {
692  return (VALUE)str;
693 }
694 
695 static inline VALUE
696 empty_str_alloc(VALUE klass)
697 {
698  RUBY_DTRACE_CREATE_HOOK(STRING, 0);
699  return str_alloc(klass);
700 }
701 
702 static VALUE
703 str_new0(VALUE klass, const char *ptr, long len, int termlen)
704 {
705  VALUE str;
706 
707  if (len < 0) {
708  rb_raise(rb_eArgError, "negative string size (or size too big)");
709  }
710 
711  RUBY_DTRACE_CREATE_HOOK(STRING, len);
712 
713  str = str_alloc(klass);
714  if (!STR_EMBEDDABLE_P(len, termlen)) {
715  RSTRING(str)->as.heap.aux.capa = len;
716  RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)len + termlen);
717  STR_SET_NOEMBED(str);
718  }
719  else if (len == 0) {
721  }
722  if (ptr) {
723  memcpy(RSTRING_PTR(str), ptr, len);
724  }
725  STR_SET_LEN(str, len);
726  TERM_FILL(RSTRING_PTR(str) + len, termlen);
727  return str;
728 }
729 
730 static VALUE
731 str_new(VALUE klass, const char *ptr, long len)
732 {
733  return str_new0(klass, ptr, len, 1);
734 }
735 
736 VALUE
737 rb_str_new(const char *ptr, long len)
738 {
739  return str_new(rb_cString, ptr, len);
740 }
741 
742 VALUE
743 rb_usascii_str_new(const char *ptr, long len)
744 {
745  VALUE str = rb_str_new(ptr, len);
747  return str;
748 }
749 
750 VALUE
751 rb_utf8_str_new(const char *ptr, long len)
752 {
753  VALUE str = str_new(rb_cString, ptr, len);
755  return str;
756 }
757 
758 VALUE
759 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
760 {
761  VALUE str;
762 
763  if (!enc) return rb_str_new(ptr, len);
764 
765  str = str_new0(rb_cString, ptr, len, rb_enc_mbminlen(enc));
766  rb_enc_associate(str, enc);
767  return str;
768 }
769 
770 VALUE
771 rb_str_new_cstr(const char *ptr)
772 {
773  must_not_null(ptr);
774  return rb_str_new(ptr, strlen(ptr));
775 }
776 
777 VALUE
778 rb_usascii_str_new_cstr(const char *ptr)
779 {
780  VALUE str = rb_str_new_cstr(ptr);
782  return str;
783 }
784 
785 VALUE
786 rb_utf8_str_new_cstr(const char *ptr)
787 {
788  VALUE str = rb_str_new_cstr(ptr);
790  return str;
791 }
792 
793 VALUE
794 rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
795 {
796  must_not_null(ptr);
797  if (rb_enc_mbminlen(enc) != 1) {
798  rb_raise(rb_eArgError, "wchar encoding given");
799  }
800  return rb_enc_str_new(ptr, strlen(ptr), enc);
801 }
802 
803 static VALUE
804 str_new_static(VALUE klass, const char *ptr, long len, int encindex)
805 {
806  VALUE str;
807 
808  if (len < 0) {
809  rb_raise(rb_eArgError, "negative string size (or size too big)");
810  }
811 
812  if (!ptr) {
813  rb_encoding *enc = rb_enc_get_from_index(encindex);
814  str = str_new0(klass, ptr, len, rb_enc_mbminlen(enc));
815  }
816  else {
817  RUBY_DTRACE_CREATE_HOOK(STRING, len);
818  str = str_alloc(klass);
819  RSTRING(str)->as.heap.len = len;
820  RSTRING(str)->as.heap.ptr = (char *)ptr;
821  RSTRING(str)->as.heap.aux.capa = len;
822  STR_SET_NOEMBED(str);
823  RBASIC(str)->flags |= STR_NOFREE;
824  }
825  rb_enc_associate_index(str, encindex);
826  return str;
827 }
828 
829 VALUE
830 rb_str_new_static(const char *ptr, long len)
831 {
832  return str_new_static(rb_cString, ptr, len, 0);
833 }
834 
835 VALUE
836 rb_usascii_str_new_static(const char *ptr, long len)
837 {
838  return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
839 }
840 
841 VALUE
842 rb_utf8_str_new_static(const char *ptr, long len)
843 {
844  return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
845 }
846 
847 VALUE
848 rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
849 {
850  return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
851 }
852 
853 VALUE
854 rb_tainted_str_new(const char *ptr, long len)
855 {
856  VALUE str = rb_str_new(ptr, len);
857 
858  OBJ_TAINT(str);
859  return str;
860 }
861 
862 static VALUE
863 rb_tainted_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
864 {
865  VALUE str = rb_enc_str_new(ptr, len, enc);
866 
867  OBJ_TAINT(str);
868  return str;
869 }
870 
871 VALUE
872 rb_tainted_str_new_cstr(const char *ptr)
873 {
874  VALUE str = rb_str_new_cstr(ptr);
875 
876  OBJ_TAINT(str);
877  return str;
878 }
879 
880 static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
881  rb_encoding *from, rb_encoding *to,
882  int ecflags, VALUE ecopts);
883 
884 VALUE
885 rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
886 {
887  long len;
888  const char *ptr;
889  VALUE newstr;
890 
891  if (!to) return str;
892  if (!from) from = rb_enc_get(str);
893  if (from == to) return str;
894  if ((rb_enc_asciicompat(to) && is_ascii_string(str)) ||
895  to == rb_ascii8bit_encoding()) {
896  if (STR_ENC_GET(str) != to) {
897  str = rb_str_dup(str);
898  rb_enc_associate(str, to);
899  }
900  return str;
901  }
902 
903  RSTRING_GETMEM(str, ptr, len);
904  newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
905  from, to, ecflags, ecopts);
906  if (NIL_P(newstr)) {
907  /* some error, return original */
908  return str;
909  }
910  OBJ_INFECT(newstr, str);
911  return newstr;
912 }
913 
914 VALUE
915 rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
916  rb_encoding *from, int ecflags, VALUE ecopts)
917 {
918  long olen;
919 
920  olen = RSTRING_LEN(newstr);
921  if (ofs < -olen || olen < ofs)
922  rb_raise(rb_eIndexError, "index %ld out of string", ofs);
923  if (ofs < 0) ofs += olen;
924  if (!from) {
925  STR_SET_LEN(newstr, ofs);
926  return rb_str_cat(newstr, ptr, len);
927  }
928 
929  rb_str_modify(newstr);
930  return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
931  rb_enc_get(newstr),
932  ecflags, ecopts);
933 }
934 
935 VALUE
936 rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
937 {
938  STR_SET_LEN(str, 0);
939  rb_enc_associate(str, enc);
940  rb_str_cat(str, ptr, len);
941  return str;
942 }
943 
944 static VALUE
945 str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
946  rb_encoding *from, rb_encoding *to,
947  int ecflags, VALUE ecopts)
948 {
949  rb_econv_t *ec;
950  rb_econv_result_t ret;
951  long olen;
952  VALUE econv_wrapper;
953  const unsigned char *start, *sp;
954  unsigned char *dest, *dp;
955  size_t converted_output = (size_t)ofs;
956 
957  olen = rb_str_capacity(newstr);
958 
959  econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
960  RBASIC_CLEAR_CLASS(econv_wrapper);
961  ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
962  if (!ec) return Qnil;
963  DATA_PTR(econv_wrapper) = ec;
964 
965  sp = (unsigned char*)ptr;
966  start = sp;
967  while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
968  (dp = dest + converted_output),
969  (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
971  /* destination buffer short */
972  size_t converted_input = sp - start;
973  size_t rest = len - converted_input;
974  converted_output = dp - dest;
975  rb_str_set_len(newstr, converted_output);
976  if (converted_input && converted_output &&
977  rest < (LONG_MAX / converted_output)) {
978  rest = (rest * converted_output) / converted_input;
979  }
980  else {
981  rest = olen;
982  }
983  olen += rest < 2 ? 2 : rest;
984  rb_str_resize(newstr, olen);
985  }
986  DATA_PTR(econv_wrapper) = 0;
987  rb_econv_close(ec);
988  rb_gc_force_recycle(econv_wrapper);
989  switch (ret) {
990  case econv_finished:
991  len = dp - (unsigned char*)RSTRING_PTR(newstr);
992  rb_str_set_len(newstr, len);
993  rb_enc_associate(newstr, to);
994  return newstr;
995 
996  default:
997  return Qnil;
998  }
999 }
1000 
1001 VALUE
1003 {
1004  return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1005 }
1006 
1007 VALUE
1008 rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc)
1009 {
1010  rb_encoding *ienc;
1011  VALUE str;
1012  const int eidx = rb_enc_to_index(eenc);
1013 
1014  /* ASCII-8BIT case, no conversion */
1015  if ((eidx == rb_ascii8bit_encindex()) ||
1016  (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1017  return rb_tainted_str_new(ptr, len);
1018  }
1019  /* no default_internal or same encoding, no conversion */
1021  if (!ienc || eenc == ienc) {
1022  return rb_tainted_str_new_with_enc(ptr, len, eenc);
1023  }
1024  /* ASCII compatible, and ASCII only string, no conversion in
1025  * default_internal */
1026  if ((eidx == rb_ascii8bit_encindex()) ||
1027  (eidx == rb_usascii_encindex()) ||
1028  (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1029  return rb_tainted_str_new_with_enc(ptr, len, ienc);
1030  }
1031  /* convert from the given encoding to default_internal */
1032  str = rb_tainted_str_new_with_enc(NULL, 0, ienc);
1033  /* when the conversion failed for some reason, just ignore the
1034  * default_internal and result in the given encoding as-is. */
1035  if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1036  rb_str_initialize(str, ptr, len, eenc);
1037  }
1038  return str;
1039 }
1040 
1041 VALUE
1043 {
1044  int eidx = rb_enc_to_index(eenc);
1045  if (eidx == rb_usascii_encindex() &&
1048  return str;
1049  }
1050  rb_enc_associate_index(str, eidx);
1051  return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1052 }
1053 
1054 VALUE
1055 rb_external_str_new(const char *ptr, long len)
1056 {
1058 }
1059 
1060 VALUE
1062 {
1064 }
1065 
1066 VALUE
1067 rb_locale_str_new(const char *ptr, long len)
1068 {
1070 }
1071 
1072 VALUE
1073 rb_locale_str_new_cstr(const char *ptr)
1074 {
1076 }
1077 
1078 VALUE
1079 rb_filesystem_str_new(const char *ptr, long len)
1080 {
1082 }
1083 
1084 VALUE
1086 {
1088 }
1089 
1090 VALUE
1092 {
1094 }
1095 
1096 VALUE
1098 {
1099  return rb_str_conv_enc(str, STR_ENC_GET(str), rb_locale_encoding());
1100 }
1101 
1102 VALUE
1104 {
1105  return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1106 }
1107 
1108 static VALUE
1109 str_replace_shared_without_enc(VALUE str2, VALUE str)
1110 {
1111  const int termlen = TERM_LEN(str);
1112  char *ptr;
1113  long len;
1114 
1115  RSTRING_GETMEM(str, ptr, len);
1116  if (STR_EMBEDDABLE_P(len, termlen)) {
1117  char *ptr2 = RSTRING(str2)->as.ary;
1118  STR_SET_EMBED(str2);
1119  memcpy(ptr2, RSTRING_PTR(str), len);
1120  STR_SET_EMBED_LEN(str2, len);
1121  TERM_FILL(ptr2+len, termlen);
1122  }
1123  else {
1124  str = rb_str_new_frozen(str);
1125  FL_SET(str2, STR_NOEMBED);
1126  RSTRING_GETMEM(str, ptr, len);
1127  RSTRING(str2)->as.heap.len = len;
1128  RSTRING(str2)->as.heap.ptr = ptr;
1129  STR_SET_SHARED(str2, str);
1130  }
1131  return str2;
1132 }
1133 
1134 static VALUE
1135 str_replace_shared(VALUE str2, VALUE str)
1136 {
1137  str_replace_shared_without_enc(str2, str);
1138  rb_enc_cr_str_exact_copy(str2, str);
1139  return str2;
1140 }
1141 
1142 static VALUE
1143 str_new_shared(VALUE klass, VALUE str)
1144 {
1145  return str_replace_shared(str_alloc(klass), str);
1146 }
1147 
1148 VALUE
1150 {
1151  VALUE str2 = str_new_shared(rb_obj_class(str), str);
1152 
1153  OBJ_INFECT(str2, str);
1154  return str2;
1155 }
1156 
1157 VALUE
1159 {
1160  VALUE str;
1161 
1162  if (OBJ_FROZEN(orig)) return orig;
1163 
1164  str = str_new_frozen(rb_obj_class(orig), orig);
1165  OBJ_INFECT(str, orig);
1166  return str;
1167 }
1168 
1169 VALUE
1171 {
1172  VALUE tmp;
1173 
1174  if (OBJ_FROZEN_RAW(orig)) return orig;
1175 
1176  tmp = str_new_frozen(0, orig);
1177  OBJ_INFECT(tmp, orig);
1178 
1179  return tmp;
1180 }
1181 
1182 void
1184 {
1185  if (RBASIC_CLASS(tmp) != 0)
1186  return;
1187 
1188  if (STR_EMBED_P(tmp)) {
1189  assert(OBJ_FROZEN_RAW(tmp));
1190  rb_gc_force_recycle(tmp);
1191  }
1192  else if (FL_TEST_RAW(orig, STR_SHARED) &&
1194  VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1195 
1196  if (shared == tmp && !FL_TEST_RAW(tmp, STR_IS_SHARED_M)) {
1197  FL_UNSET_RAW(orig, STR_SHARED);
1198  assert(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1199  assert(RSTRING(orig)->as.heap.len == RSTRING(tmp)->as.heap.len);
1200  RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1201  RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1202  assert(OBJ_FROZEN_RAW(tmp));
1203  rb_gc_force_recycle(tmp);
1204  }
1205  }
1206 }
1207 
1208 static VALUE
1209 str_new_frozen(VALUE klass, VALUE orig)
1210 {
1211  VALUE str;
1212 
1213  if (STR_EMBED_P(orig)) {
1214  str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
1215  }
1216  else {
1217  if (FL_TEST_RAW(orig, STR_SHARED)) {
1218  VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1219  long ofs = RSTRING(orig)->as.heap.ptr - RSTRING(shared)->as.heap.ptr;
1220  long rest = RSTRING(shared)->as.heap.len - ofs - RSTRING(orig)->as.heap.len;
1221  assert(!STR_EMBED_P(shared));
1222  assert(OBJ_FROZEN(shared));
1223 
1224  if ((ofs > 0) || (rest > 0) ||
1225  (klass != RBASIC(shared)->klass) ||
1226  ((RBASIC(shared)->flags ^ RBASIC(orig)->flags) & FL_TAINT) ||
1227  ENCODING_GET(shared) != ENCODING_GET(orig)) {
1228  str = str_new_shared(klass, shared);
1229  RSTRING(str)->as.heap.ptr += ofs;
1230  RSTRING(str)->as.heap.len -= ofs + rest;
1231  }
1232  else {
1233  if (RBASIC_CLASS(shared) == 0)
1234  FL_SET_RAW(shared, STR_IS_SHARED_M);
1235  return shared;
1236  }
1237  }
1238  else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1239  str = str_alloc(klass);
1240  STR_SET_EMBED(str);
1241  memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1242  STR_SET_EMBED_LEN(str, RSTRING_LEN(orig));
1243  TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
1244  }
1245  else {
1246  str = str_alloc(klass);
1247  STR_SET_NOEMBED(str);
1248  RSTRING(str)->as.heap.len = RSTRING_LEN(orig);
1249  RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1250  RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
1251  RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1252  RBASIC(orig)->flags &= ~STR_NOFREE;
1253  STR_SET_SHARED(orig, str);
1254  if (klass == 0)
1256  }
1257  }
1258 
1259  rb_enc_cr_str_exact_copy(str, orig);
1260  OBJ_FREEZE(str);
1261  return str;
1262 }
1263 
1264 VALUE
1265 rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1266 {
1267  return str_new0(rb_obj_class(obj), ptr, len, TERM_LEN(obj));
1268 }
1269 
1270 static VALUE
1271 str_new_empty(VALUE str)
1272 {
1273  VALUE v = rb_str_new_with_class(str, 0, 0);
1274  rb_enc_copy(v, str);
1275  OBJ_INFECT(v, str);
1276  return v;
1277 }
1278 
1279 #define STR_BUF_MIN_SIZE 127
1280 
/*
 * Allocate a non-embedded String (class rb_cString) with a heap buffer of
 * at least STR_BUF_MIN_SIZE bytes of capacity plus one terminator byte.
 * Only the first byte of the buffer is initialised (to NUL); the length
 * field is not assigned here.
 *
 * NOTE(review): the declarator line (listing line 1282) is absent from
 * this copy.  The body uses `capa`, and rb_str_buf_new_cstr calls
 * rb_str_buf_new(len), so this is presumably rb_str_buf_new(long capa) --
 * confirm against upstream.
 */
1281 VALUE
1283 {
1284  VALUE str = str_alloc(rb_cString);
1285 
/* clamp small requests up to the minimum buffer size */
1286  if (capa < STR_BUF_MIN_SIZE) {
1287  capa = STR_BUF_MIN_SIZE;
1288  }
1289  FL_SET(str, STR_NOEMBED);
1290  RSTRING(str)->as.heap.aux.capa = capa;
/* +1 leaves room for the terminating NUL written on the next line */
1291  RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1292  RSTRING(str)->as.heap.ptr[0] = '\0';
1293 
1294  return str;
1295 }
1296 
1297 VALUE
1298 rb_str_buf_new_cstr(const char *ptr)
1299 {
1300  VALUE str;
1301  long len = strlen(ptr);
1302 
1303  str = rb_str_buf_new(len);
1304  rb_str_buf_cat(str, ptr, len);
1305 
1306  return str;
1307 }
1308 
/*
 * Return str_new(0, 0, len): a string created with a zero class argument,
 * no source pointer, and length `len`.
 *
 * NOTE(review): the declarator line (listing line 1310) is absent from
 * this copy; the function name and the type of `len` cannot be read from
 * here -- confirm against upstream.
 */
1309 VALUE
1311 {
1312  return str_new(0, 0, len);
1313 }
1314 
/*
 * Release bookkeeping for a string object `str`: remove it from the VM
 * fstring table when it carries RSTRING_FSTR, and bump the debug counter
 * matching its storage kind (embedded / shared-or-nofree / own pointer).
 *
 * NOTE(review): the declarator (listing line 1316) and listing line 1333
 * inside the final else-branch are absent from this copy, so the statement
 * that actually releases an owned buffer is not visible here -- confirm
 * against upstream.
 */
1315 void
1317 {
1318  if (FL_TEST(str, RSTRING_FSTR)) {
/* st_delete takes the key by address, hence the local copy */
1319  st_data_t fstr = (st_data_t)str;
1320  st_delete(rb_vm_fstring_table(), &fstr, NULL);
1321  RB_DEBUG_COUNTER_INC(obj_str_fstr);
1322  }
1323 
1324  if (STR_EMBED_P(str)) {
1325  RB_DEBUG_COUNTER_INC(obj_str_embed);
1326  }
1327  else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
/* both flags are counted under the same obj_str_shared counter */
1328  (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1329  (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1330  }
1331  else {
1332  RB_DEBUG_COUNTER_INC(obj_str_ptr);
1334  }
1335 }
1336 
/*
 * Report a size_t for `str`: STR_HEAP_SIZE(str) in one branch, 0 in the
 * other.
 *
 * NOTE(review): the declarator (listing line 1338) and the opening `if`
 * condition (listing line 1340) are absent from this copy, so the
 * condition that selects between the two results is not visible and the
 * braces below are unbalanced as shown -- confirm against upstream.
 */
1337 RUBY_FUNC_EXPORTED size_t
1339 {
1341  return STR_HEAP_SIZE(str);
1342  }
1343  else {
1344  return 0;
1345  }
1346 }
1347 
/*
 * Convert `str` to a T_STRING via rb_convert_type_with_id using the
 * to_str method id (idTo_str); "String" is the type name handed to the
 * converter.
 *
 * NOTE(review): the declarator line (listing line 1349) is absent from
 * this copy.  rb_string_value calls rb_str_to_str(s) for non-strings, so
 * this is presumably rb_str_to_str(VALUE str) -- confirm against upstream.
 */
1348 VALUE
1350 {
1351  return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1352 }
1353 
1354 static inline void str_discard(VALUE str);
1355 static void str_shared_replace(VALUE str, VALUE str2);
1356 
/*
 * Guarded wrapper around str_shared_replace(): does nothing when `str`
 * and `str2` are the same object (str_shared_replace asserts they differ).
 *
 * NOTE(review): the declarator line (listing line 1358) is absent from
 * this copy; the name is not visible here -- confirm against upstream.
 */
1357 void
1359 {
1360  if (str != str2) str_shared_replace(str, str2);
1361 }
1362 
1363 static void
1364 str_shared_replace(VALUE str, VALUE str2)
1365 {
1366  rb_encoding *enc;
1367  int cr;
1368  int termlen;
1369 
1370  RUBY_ASSERT(str2 != str);
1371  enc = STR_ENC_GET(str2);
1372  cr = ENC_CODERANGE(str2);
1373  str_discard(str);
1374  OBJ_INFECT(str, str2);
1375  termlen = rb_enc_mbminlen(enc);
1376 
1377  if (STR_EMBEDDABLE_P(RSTRING_LEN(str2), termlen)) {
1378  STR_SET_EMBED(str);
1379  memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
1380  STR_SET_EMBED_LEN(str, RSTRING_LEN(str2));
1381  rb_enc_associate(str, enc);
1382  ENC_CODERANGE_SET(str, cr);
1383  }
1384  else {
1385  STR_SET_NOEMBED(str);
1386  FL_UNSET(str, STR_SHARED);
1387  RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1388  RSTRING(str)->as.heap.len = RSTRING_LEN(str2);
1389 
1390  if (FL_TEST(str2, STR_SHARED)) {
1391  VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1392  STR_SET_SHARED(str, shared);
1393  }
1394  else {
1395  RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
1396  }
1397 
1398  /* abandon str2 */
1399  STR_SET_EMBED(str2);
1400  RSTRING_PTR(str2)[0] = 0;
1401  STR_SET_EMBED_LEN(str2, 0);
1402  rb_enc_associate(str, enc);
1403  ENC_CODERANGE_SET(str, cr);
1404  }
1405 }
1406 
1408 
/*
 * Return `obj` unchanged when it already is a T_STRING; otherwise call its
 * to_s method (idTo_s) and pass the result through
 * rb_obj_as_string_result().
 *
 * NOTE(review): the declarator line (listing line 1410) is absent from
 * this copy; the name is not visible here -- confirm against upstream.
 */
1409 VALUE
1411 {
1412  VALUE str;
1413 
1414  if (RB_TYPE_P(obj, T_STRING)) {
1415  return obj;
1416  }
1417  str = rb_funcall(obj, idTo_s, 0);
1418  return rb_obj_as_string_result(str, obj);
1419 }
1420 
/*
 * Post-process the value `str` that was produced for `obj`:
 *  - if `str` is not a T_STRING, fall back to rb_any_to_s(obj);
 *  - otherwise apply OBJ_INFECT_RAW from obj, unless `str` is flagged
 *    RSTRING_FSTR or obj is not FL_ABLE.
 *
 * NOTE(review): the declarator line (listing line 1422) is absent from
 * this copy.  The caller above invokes rb_obj_as_string_result(str, obj),
 * so this is presumably that function -- confirm against upstream.
 */
1421 VALUE
1423 {
1424  if (!RB_TYPE_P(str, T_STRING))
1425  return rb_any_to_s(obj);
1426  if (!FL_TEST_RAW(str, RSTRING_FSTR) && FL_ABLE(obj))
1427  /* fstring must not be tainted, at least */
1428  OBJ_INFECT_RAW(str, obj);
1429  return str;
1430 }
1431 
1432 static VALUE
1433 str_replace(VALUE str, VALUE str2)
1434 {
1435  long len;
1436 
1437  len = RSTRING_LEN(str2);
1438  if (STR_SHARED_P(str2)) {
1439  VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1440  assert(OBJ_FROZEN(shared));
1441  STR_SET_NOEMBED(str);
1442  RSTRING(str)->as.heap.len = len;
1443  RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1444  STR_SET_SHARED(str, shared);
1445  rb_enc_cr_str_exact_copy(str, str2);
1446  }
1447  else {
1448  str_replace_shared(str, str2);
1449  }
1450 
1451  OBJ_INFECT(str, str2);
1452  return str;
1453 }
1454 
/*
 * Allocate a new string of class `klass` that duplicates `str`.
 *
 * The embedded-array bytes of str are copied first.  If str is not
 * embedded (STR_NOEMBED) and not frozen, str is replaced by the result of
 * str_new_frozen(); if that result is still non-embedded the duplicate
 * records it in aux.shared and gets STR_SHARED, otherwise the embedded
 * bytes are copied again from the replacement.  The duplicate receives the
 * selected flags with FL_FREEZE removed.
 *
 * NOTE(review): listing lines 1460-1462 (the initializer of flag_mask) are
 * absent from this copy, so the exact set of flags carried over is not
 * visible here -- confirm against upstream.
 */
1455 static inline VALUE
1456 str_duplicate(VALUE klass, VALUE str)
1457 {
1458  enum {embed_size = RSTRING_EMBED_LEN_MAX + 1};
1459  const VALUE flag_mask =
1463  ;
1464  VALUE flags = FL_TEST_RAW(str, flag_mask);
1465  VALUE dup = str_alloc(klass);
/* raw copy of the as.ary member of the source */
1466  MEMCPY(RSTRING(dup)->as.ary, RSTRING(str)->as.ary,
1467  char, embed_size);
1468  if (flags & STR_NOEMBED) {
1469  if (UNLIKELY(!(flags & FL_FREEZE))) {
/* from here on `str` names the frozen replacement, with taint carried over */
1470  str = str_new_frozen(klass, str);
1471  FL_SET_RAW(str, flags & FL_TAINT);
1472  flags = FL_TEST_RAW(str, flag_mask);
1473  }
1474  if (flags & STR_NOEMBED) {
1475  RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, str);
1476  flags |= STR_SHARED;
1477  }
1478  else {
/* the replacement turned out to be embedded: copy its bytes instead */
1479  MEMCPY(RSTRING(dup)->as.ary, RSTRING(str)->as.ary,
1480  char, embed_size);
1481  }
1482  }
/* the duplicate is never created frozen */
1483  FL_SET_RAW(dup, flags & ~FL_FREEZE);
1484  return dup;
1485 }
1486 
/*
 * Duplicate `str` into a new string of str's own class
 * (rb_obj_class(str)) via str_duplicate().
 *
 * NOTE(review): the declarator line (listing line 1488) is absent from
 * this copy; the name is not visible here (other code in this file calls
 * rb_str_dup(str)) -- confirm against upstream.
 */
1487 VALUE
1489 {
1490  return str_duplicate(rb_obj_class(str), str);
1491 }
1492 
/*
 * Fire the STRING creation DTrace hook with str's byte length, then
 * duplicate `str` into a new string whose class is always rb_cString.
 *
 * NOTE(review): the declarator line (listing line 1494) is absent from
 * this copy; the name is not visible here -- confirm against upstream.
 */
1493 VALUE
1495 {
1496  RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1497  return str_duplicate(rb_cString, str);
1498 }
1499 
1500 /*
1501  * call-seq:
1502  * String.new(str="") -> new_str
1503  * String.new(str="", encoding: enc) -> new_str
1504  * String.new(str="", capacity: size) -> new_str
1505  *
1506  * Returns a new string object containing a copy of <i>str</i>.
1507  *
1508  * The optional <i>encoding</i> keyword argument specifies the encoding
1509  * of the new string.
1510  * If not specified, the encoding of <i>str</i> is used
1511  * (or ASCII-8BIT, if <i>str</i> is not specified).
1512  *
1513  * The optional <i>capacity</i> keyword argument specifies the size
1514  * of the internal buffer.
1515  * This may improve performance, when the string will be concatenated many
1516  * times (causing many realloc calls).
1517  */
1518 
1519 static VALUE
1520 rb_str_init(int argc, VALUE *argv, VALUE str)
1521 {
1522  static ID keyword_ids[2];
1523  VALUE orig, opt, venc, vcapa;
1524  VALUE kwargs[2];
1525  rb_encoding *enc = 0;
1526  int n;
1527 
1528  if (!keyword_ids[0]) {
1529  keyword_ids[0] = rb_id_encoding();
1530  CONST_ID(keyword_ids[1], "capacity");
1531  }
1532 
1533  n = rb_scan_args(argc, argv, "01:", &orig, &opt);
1534  if (!NIL_P(opt)) {
1535  rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
1536  venc = kwargs[0];
1537  vcapa = kwargs[1];
1538  if (venc != Qundef && !NIL_P(venc)) {
1539  enc = rb_to_encoding(venc);
1540  }
1541  if (vcapa != Qundef && !NIL_P(vcapa)) {
1542  long capa = NUM2LONG(vcapa);
1543  long len = 0;
1544  int termlen = enc ? rb_enc_mbminlen(enc) : 1;
1545 
1546  if (capa < STR_BUF_MIN_SIZE) {
1547  capa = STR_BUF_MIN_SIZE;
1548  }
1549  if (n == 1) {
1550  StringValue(orig);
1551  len = RSTRING_LEN(orig);
1552  if (capa < len) {
1553  capa = len;
1554  }
1555  if (orig == str) n = 0;
1556  }
1557  str_modifiable(str);
1558  if (STR_EMBED_P(str)) { /* make noembed always */
1559  RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + termlen);
1560  }
1561  else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
1562  REALLOC_N(RSTRING(str)->as.heap.ptr, char, (size_t)capa + termlen);
1563  }
1564  RSTRING(str)->as.heap.len = len;
1565  TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
1566  if (n == 1) {
1567  memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
1568  rb_enc_cr_str_exact_copy(str, orig);
1569  }
1570  FL_SET(str, STR_NOEMBED);
1571  RSTRING(str)->as.heap.aux.capa = capa;
1572  }
1573  else if (n == 1) {
1574  rb_str_replace(str, orig);
1575  }
1576  if (enc) {
1577  rb_enc_associate(str, enc);
1578  ENC_CODERANGE_CLEAR(str);
1579  }
1580  }
1581  else if (n == 1) {
1582  rb_str_replace(str, orig);
1583  }
1584  return str;
1585 }
1586 
1587 #ifdef NONASCII_MASK
1588 #define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
1589 
1590 /*
1591  * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
1592  * bit representation. (see http://en.wikipedia.org/wiki/UTF-8)
1593  * Therefore, the following pseudocode can detect UTF-8 leading bytes.
1594  *
1595  * if (!(byte & 0x80))
1596  * byte |= 0x40; // turn on bit6
1597  * return ((byte>>6) & 1); // bit6 represent whether this byte is leading or not.
1598  *
1599  * This function calculates whether a byte is leading or not for all bytes
1600  * in the argument word by concurrently using the above logic, and then
1601  * adds up the number of leading bytes in the word.
1602  */
/*
 * Count the UTF-8 leading bytes contained in the machine word at *s.
 * Returns the count as a uintptr_t.
 *
 * NOTE(review): assumes NONASCII_MASK (defined outside this chunk) has the
 * top bit set in every byte, so that NONASCII_MASK >> 7 selects bit 0 of
 * each byte -- confirm.
 */
1603 static inline uintptr_t
1604 count_utf8_lead_bytes_with_word(const uintptr_t *s)
1605 {
1606  uintptr_t d = *s;
1607 
1608  /* Transform so that bit0 indicates whether we have a UTF-8 leading byte or not. */
1609  d = (d>>6) | (~d>>7);
1610  d &= NONASCII_MASK >> 7;
1611 
1612  /* Gather all bytes. */
1613 #if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
1614  /* use only if it can use POPCNT */
1615  return rb_popcount_intptr(d);
1616 #else
/* fold the per-byte 0/1 values together by repeated shift-and-add */
1617  d += (d>>8);
1618  d += (d>>16);
1619 # if SIZEOF_VOIDP == 8
1620  d += (d>>32);
1621 # endif
/* the total is read from the low four bits */
1622  return (d&0xF);
1623 #endif
1624 }
1625 #endif
1626 
/*
 * Count the characters in the byte range [p, e) for encoding `enc`, using
 * the code range hint `cr` to pick a strategy:
 *
 *  1. fixed-width encodings (mbmaxlen == mbminlen): pure arithmetic;
 *  2. (only with NONASCII_MASK) valid UTF-8: count leading bytes, a word
 *     at a time once p has been advanced to a word boundary;
 *  3. ASCII-compatible encodings: skip ASCII runs with search_nonascii()
 *     and step over each other character with rb_enc_fast_mbclen() (clean
 *     code range) or rb_enc_mbclen();
 *  4. everything else: step with rb_enc_mbclen() one character at a time.
 */
1627 static inline long
1628 enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
1629 {
1630  long c;
1631  const char *q;
1632 
1633  if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
1634  long diff = (long)(e - p);
/* a trailing partial unit (non-zero remainder) counts as one more character */
1635  return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
1636  }
1637 #ifdef NONASCII_MASK
1638  else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
1639  uintptr_t len = 0;
/* the word-wise path is only taken for ranges longer than two words */
1640  if ((int)sizeof(uintptr_t) * 2 < e - p) {
1641  const uintptr_t *s, *t;
1642  const uintptr_t lowbits = sizeof(uintptr_t) - 1;
/* s: p rounded up to a word boundary; t: e rounded down to one */
1643  s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
1644  t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
1645  while (p < (const char *)s) {
1646  if (is_utf8_lead_byte(*p)) len++;
1647  p++;
1648  }
1649  while (s < t) {
1650  len += count_utf8_lead_bytes_with_word(s);
1651  s++;
1652  }
1653  p = (const char *)s;
1654  }
/* remaining tail bytes (or the whole range when it was short) */
1655  while (p < e) {
1656  if (is_utf8_lead_byte(*p)) len++;
1657  p++;
1658  }
1659  return (long)len;
1660  }
1661 #endif
1662  else if (rb_enc_asciicompat(enc)) {
1663  c = 0;
1664  if (ENC_CODERANGE_CLEAN_P(cr)) {
1665  while (p < e) {
1666  if (ISASCII(*p)) {
1667  q = search_nonascii(p, e);
/* a null result is treated as "the rest is ASCII": one character per byte */
1668  if (!q)
1669  return c + (e - p);
1670  c += q - p;
1671  p = q;
1672  }
1673  p += rb_enc_fast_mbclen(p, e, enc);
1674  c++;
1675  }
1676  }
1677  else {
/* same loop as above, but using rb_enc_mbclen instead of the fast variant */
1678  while (p < e) {
1679  if (ISASCII(*p)) {
1680  q = search_nonascii(p, e);
1681  if (!q)
1682  return c + (e - p);
1683  c += q - p;
1684  p = q;
1685  }
1686  p += rb_enc_mbclen(p, e, enc);
1687  c++;
1688  }
1689  }
1690  return c;
1691  }
1692 
1693  for (c=0; p<e; c++) {
1694  p += rb_enc_mbclen(p, e, enc);
1695  }
1696  return c;
1697 }
1698 
1699 long
1700 rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
1701 {
1702  return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
1703 }
1704 
1705 /* To get strlen with cr
1706  * Note that given cr is not used.
1707  */
/*
 * Count the characters in [p, e) for `enc` and report the code range
 * observed during the scan through *cr.
 *
 * *cr is an output only: it is reset to 0 on entry.  For fixed-width
 * encodings the function returns early and leaves *cr at 0.  Otherwise
 * ENC_CODERANGE_VALID is or'ed in for every well-formed character,
 * ENC_CODERANGE_BROKEN is assigned when rb_enc_precise_mbclen() rejects a
 * sequence, and ENC_CODERANGE_7BIT is stored if nothing else was recorded.
 */
1708 long
1709 rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
1710 {
1711  long c;
1712  const char *q;
1713  int ret;
1714 
1715  *cr = 0;
1716  if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
1717  long diff = (long)(e - p);
/* a trailing partial unit counts as one more character */
1718  return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
1719  }
1720  else if (rb_enc_asciicompat(enc)) {
1721  c = 0;
1722  while (p < e) {
1723  if (ISASCII(*p)) {
1724  q = search_nonascii(p, e);
1725  if (!q) {
/* a null result is treated as "the rest is ASCII"; only 7BIT if nothing was recorded before */
1726  if (!*cr) *cr = ENC_CODERANGE_7BIT;
1727  return c + (e - p);
1728  }
1729  c += q - p;
1730  p = q;
1731  }
1732  ret = rb_enc_precise_mbclen(p, e, enc);
1733  if (MBCLEN_CHARFOUND_P(ret)) {
1734  *cr |= ENC_CODERANGE_VALID;
1735  p += MBCLEN_CHARFOUND_LEN(ret);
1736  }
1737  else {
/* invalid sequence: mark broken and resynchronise one byte further on */
1738  *cr = ENC_CODERANGE_BROKEN;
1739  p++;
1740  }
1741  c++;
1742  }
1743  if (!*cr) *cr = ENC_CODERANGE_7BIT;
1744  return c;
1745  }
1746 
1747  for (c=0; p<e; c++) {
1748  ret = rb_enc_precise_mbclen(p, e, enc);
1749  if (MBCLEN_CHARFOUND_P(ret)) {
1750  *cr |= ENC_CODERANGE_VALID;
1751  p += MBCLEN_CHARFOUND_LEN(ret);
1752  }
1753  else {
1754  *cr = ENC_CODERANGE_BROKEN;
/* skip one minimum-width unit, but never step past e */
1755  if (p + rb_enc_mbminlen(enc) <= e)
1756  p += rb_enc_mbminlen(enc);
1757  else
1758  p = e;
1759  }
1760  }
1761  if (!*cr) *cr = ENC_CODERANGE_7BIT;
1762  return c;
1763 }
1764 
1765 /* enc must be str's enc or rb_enc_check(str, str2) */
1766 static long
1767 str_strlen(VALUE str, rb_encoding *enc)
1768 {
1769  const char *p, *e;
1770  int cr;
1771 
1772  if (single_byte_optimizable(str)) return RSTRING_LEN(str);
1773  if (!enc) enc = STR_ENC_GET(str);
1774  p = RSTRING_PTR(str);
1775  e = RSTRING_END(str);
1776  cr = ENC_CODERANGE(str);
1777 
1778  if (cr == ENC_CODERANGE_UNKNOWN) {
1779  long n = rb_enc_strlen_cr(p, e, enc, &cr);
1780  if (cr) ENC_CODERANGE_SET(str, cr);
1781  return n;
1782  }
1783  else {
1784  return enc_strlen(p, e, enc, cr);
1785  }
1786 }
1787 
/*
 * Character length of `str` as a C long, computed by str_strlen() with a
 * NULL encoding argument (so str's own encoding is used).
 *
 * NOTE(review): the declarator line (listing line 1789) is absent from
 * this copy; the name is not visible here -- confirm against upstream.
 */
1788 long
1790 {
1791  return str_strlen(str, NULL);
1792 }
1793 
1794 /*
1795  * call-seq:
1796  * str.length -> integer
1797  * str.size -> integer
1798  *
1799  * Returns the character length of <i>str</i>.
1800  */
1801 
/*
 * NOTE(review): the declarator line (listing line 1803) is absent from
 * this copy; judging by the call-seq comment above, this implements
 * String#length / String#size -- confirm the C name against upstream.
 */
1802 VALUE
1804 {
/* character count from str_strlen (str's own encoding), converted with LONG2NUM */
1805  return LONG2NUM(str_strlen(str, NULL));
1806 }
1807 
1808 /*
1809  * call-seq:
1810  * str.bytesize -> integer
1811  *
1812  * Returns the length of +str+ in bytes.
1813  *
1814  * "\x80\u3042".bytesize #=> 4
1815  * "hello".bytesize #=> 5
1816  */
1817 
1818 static VALUE
1819 rb_str_bytesize(VALUE str)
1820 {
1821  return LONG2NUM(RSTRING_LEN(str));
1822 }
1823 
1824 /*
1825  * call-seq:
1826  * str.empty? -> true or false
1827  *
1828  * Returns <code>true</code> if <i>str</i> has a length of zero.
1829  *
1830  * "hello".empty? #=> false
1831  * " ".empty? #=> false
1832  * "".empty? #=> true
1833  */
1834 
1835 static VALUE
1836 rb_str_empty(VALUE str)
1837 {
1838  if (RSTRING_LEN(str) == 0)
1839  return Qtrue;
1840  return Qfalse;
1841 }
1842 
1843 /*
1844  * call-seq:
1845  * str + other_str -> new_str
1846  *
1847  * Concatenation---Returns a new <code>String</code> containing
1848  * <i>other_str</i> concatenated to <i>str</i>.
1849  *
1850  * "Hello from " + self.to_s #=> "Hello from main"
1851  */
1852 
/*
 * Concatenate str1 and str2 into a newly allocated rb_cString.
 * str2 is coerced with StringValue; the encodings must be compatible
 * (rb_enc_check_str).  Raises ArgumentError ("string size too big") when
 * the combined length would exceed LONG_MAX.
 *
 * NOTE(review): the declarator (listing line 1854) and listing lines
 * 1877-1878 are absent from this copy, so the name and whatever happens
 * between the taint propagation and the GC guards are not visible here --
 * confirm against upstream.
 */
1853 VALUE
1855 {
1856  VALUE str3;
1857  rb_encoding *enc;
1858  char *ptr1, *ptr2, *ptr3;
1859  long len1, len2;
1860  int termlen;
1861 
1862  StringValue(str2);
1863  enc = rb_enc_check_str(str1, str2);
1864  RSTRING_GETMEM(str1, ptr1, len1);
1865  RSTRING_GETMEM(str2, ptr2, len2);
1866  termlen = rb_enc_mbminlen(enc);
/* overflow check written so that the addition itself cannot overflow */
1867  if (len1 > LONG_MAX - len2) {
1868  rb_raise(rb_eArgError, "string size too big");
1869  }
1870  str3 = str_new0(rb_cString, 0, len1+len2, termlen);
1871  ptr3 = RSTRING_PTR(str3);
1872  memcpy(ptr3, ptr1, len1);
1873  memcpy(ptr3+len1, ptr2, len2);
1874  TERM_FILL(&ptr3[len1+len2], termlen);
1875 
/* the result is tainted if either operand is */
1876  FL_SET_RAW(str3, OBJ_TAINTED_RAW(str1) | OBJ_TAINTED_RAW(str2));
/* ptr1/ptr2 point into the operands, which are guarded here after the copies */
1879  RB_GC_GUARD(str1);
1880  RB_GC_GUARD(str2);
1881  return str3;
1882 }
1883 
1884 /*
1885  * call-seq:
1886  * str * integer -> new_str
1887  *
1888  * Copy --- Returns a new String containing +integer+ copies of the receiver.
1889  * +integer+ must be greater than or equal to 0.
1890  *
1891  * "Ho! " * 3 #=> "Ho! Ho! Ho! "
1892  * "Ho! " * 0 #=> ""
1893  */
1894 
/*
 * Build a string consisting of `times` copies of `str`.
 *
 * Fast paths: times == 1 returns rb_str_dup(str); times == 0 returns a
 * freshly allocated string of str's class; a one-byte NUL string is
 * produced from zero-filled storage without copying.  Raises
 * ArgumentError for a negative count ("negative argument") or when the
 * resulting length would exceed LONG_MAX ("argument too big").
 *
 * NOTE(review): the declarator line (listing line 1896) is absent from
 * this copy; judging by the call-seq comment above this implements
 * String#* -- confirm the C name against upstream.
 */
1895 VALUE
1897 {
1898  VALUE str2;
1899  long n, len;
1900  char *ptr2;
1901  int termlen;
1902 
1903  if (times == INT2FIX(1)) {
1904  return rb_str_dup(str);
1905  }
1906  if (times == INT2FIX(0)) {
1907  str2 = str_alloc(rb_obj_class(str));
1908  rb_enc_copy(str2, str);
1909  OBJ_INFECT(str2, str);
1910  return str2;
1911  }
1912  len = NUM2LONG(times);
1913  if (len < 0) {
1914  rb_raise(rb_eArgError, "negative argument");
1915  }
/* special case: str is a single NUL byte, so the result is `len` zero bytes */
1916  if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
1917  str2 = str_alloc(rb_obj_class(str));
1918  if (!STR_EMBEDDABLE_P(len, 1)) {
1919  RSTRING(str2)->as.heap.aux.capa = len;
/* ZALLOC_N: presumably zero-initialised storage -- confirm */
1920  RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
1921  STR_SET_NOEMBED(str2);
1922  }
1923  STR_SET_LEN(str2, len);
1924  rb_enc_copy(str2, str);
1925  OBJ_INFECT(str2, str);
1926  return str2;
1927  }
/* guard the multiplication below against overflow */
1928  if (len && LONG_MAX/len < RSTRING_LEN(str)) {
1929  rb_raise(rb_eArgError, "argument too big");
1930  }
1931 
/* from here on `len` is the result's byte length, not the repeat count */
1932  len *= RSTRING_LEN(str);
1933  termlen = TERM_LEN(str);
1934  str2 = str_new0(rb_obj_class(str), 0, len, termlen);
1935  ptr2 = RSTRING_PTR(str2);
1936  if (len) {
1937  n = RSTRING_LEN(str);
1938  memcpy(ptr2, RSTRING_PTR(str), n);
/* double the filled prefix while it still fits in half of the result */
1939  while (n <= len/2) {
1940  memcpy(ptr2 + n, ptr2, n);
1941  n *= 2;
1942  }
/* copy the remaining len - n bytes from the start of the filled prefix */
1943  memcpy(ptr2 + n, ptr2, len-n);
1944  }
1945  STR_SET_LEN(str2, len);
1946  TERM_FILL(&ptr2[len], termlen);
1947  OBJ_INFECT(str2, str);
1948  rb_enc_cr_str_copy_for_substr(str2, str);
1949 
1950  return str2;
1951 }
1952 
1953 /*
1954  * call-seq:
1955  * str % arg -> new_str
1956  *
1957  * Format---Uses <i>str</i> as a format specification, and returns the result
1958  * of applying it to <i>arg</i>. If the format specification contains more than
1959  * one substitution, then <i>arg</i> must be an <code>Array</code> or <code>Hash</code>
1960  * containing the values to be substituted. See <code>Kernel::sprintf</code> for
1961  * details of the format string.
1962  *
1963  * "%05d" % 123 #=> "00123"
1964  * "%-5s: %08x" % [ "ID", self.object_id ] #=> "ID : 200e14d6"
1965  * "foo = %{foo}" % { :foo => 'bar' } #=> "foo = bar"
1966  */
1967 
1968 static VALUE
1969 rb_str_format_m(VALUE str, VALUE arg)
1970 {
1971  VALUE tmp = rb_check_array_type(arg);
1972 
1973  if (!NIL_P(tmp)) {
1974  VALUE rv = rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
1975  RB_GC_GUARD(tmp);
1976  return rv;
1977  }
1978  return rb_str_format(1, &arg, str);
1979 }
1980 
1981 static inline void
1982 rb_check_lockedtmp(VALUE str)
1983 {
1984  if (FL_TEST(str, STR_TMPLOCK)) {
1985  rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
1986  }
1987 }
1988 
1989 static inline void
1990 str_modifiable(VALUE str)
1991 {
1992  rb_check_lockedtmp(str);
1993  rb_check_frozen(str);
1994 }
1995 
1996 static inline int
1997 str_dependent_p(VALUE str)
1998 {
1999  if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2000  return 0;
2001  }
2002  else {
2003  return 1;
2004  }
2005 }
2006 
2007 static inline int
2008 str_independent(VALUE str)
2009 {
2010  str_modifiable(str);
2011  return !str_dependent_p(str);
2012 }
2013 
/*
 * Give str its own storage holding the first `len` bytes of its current
 * contents, with room for `len + expand` bytes plus a `termlen`-byte
 * terminator.
 *
 * If str is currently on the heap and the requested capacity fits the
 * embedded buffer, the bytes are moved into the embedded array.  Otherwise
 * a new heap buffer is allocated, filled and installed.
 *
 * NOTE(review): listing line 2038 (between STR_SET_NOEMBED and TERM_FILL)
 * is absent from this copy -- confirm against upstream.
 */
2014 static void
2015 str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
2016 {
2017  char *ptr;
2018  const char *oldptr;
2019  long capa = len + expand;
2020 
/* if the new capacity is smaller than len, truncate to it */
2021  if (len > capa) len = capa;
2022 
2023  if (!STR_EMBED_P(str) && STR_EMBEDDABLE_P(capa, termlen)) {
/* read the heap pointer before STR_SET_EMBED switches the representation */
2024  ptr = RSTRING(str)->as.heap.ptr;
2025  STR_SET_EMBED(str);
2026  memcpy(RSTRING(str)->as.ary, ptr, len);
2027  TERM_FILL(RSTRING(str)->as.ary + len, termlen);
2028  STR_SET_EMBED_LEN(str, len);
2029  return;
2030  }
2031 
2032  ptr = ALLOC_N(char, (size_t)capa + termlen);
2033  oldptr = RSTRING_PTR(str);
/* a null old pointer means there is nothing to copy */
2034  if (oldptr) {
2035  memcpy(ptr, oldptr, len);
2036  }
2037  STR_SET_NOEMBED(str);
2039  TERM_FILL(ptr + len, termlen);
2040  RSTRING(str)->as.heap.ptr = ptr;
2041  RSTRING(str)->as.heap.len = len;
2042  RSTRING(str)->as.heap.aux.capa = capa;
2043 }
2044 
/*
 * Prepare `str` for in-place modification: run the modifiability checks
 * (via str_independent), make the string independent if it is not, and
 * clear its code range.
 *
 * NOTE(review): the declarator line (listing line 2046) is absent from
 * this copy; the name is not visible here (a later comment in this file
 * refers to rb_str_modify()) -- confirm against upstream.
 */
2045 void
2047 {
2048  if (!str_independent(str))
2049  str_make_independent(str);
2050  ENC_CODERANGE_CLEAR(str);
2051 }
2052 
2053 void
2054 rb_str_modify_expand(VALUE str, long expand)
2055 {
2056  int termlen = TERM_LEN(str);
2057  long len = RSTRING_LEN(str);
2058 
2059  if (expand < 0) {
2060  rb_raise(rb_eArgError, "negative expanding string size");
2061  }
2062  if (expand > LONG_MAX - len) {
2063  rb_raise(rb_eArgError, "string size too big");
2064  }
2065 
2066  if (!str_independent(str)) {
2067  str_make_independent_expand(str, len, expand, termlen);
2068  }
2069  else if (expand > 0) {
2070  RESIZE_CAPA_TERM(str, len + expand, termlen);
2071  }
2072  ENC_CODERANGE_CLEAR(str);
2073 }
2074 
2075 /* As rb_str_modify(), but don't clear coderange */
2076 static void
2077 str_modify_keep_cr(VALUE str)
2078 {
2079  if (!str_independent(str))
2080  str_make_independent(str);
2081  if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN)
2082  /* Force re-scan later */
2083  ENC_CODERANGE_CLEAR(str);
2084 }
2085 
/*
 * Drop the current contents of `str` after checking that it may be
 * modified.  For a heap string that is neither STR_SHARED nor STR_NOFREE
 * the pointer and length fields are reset to 0.
 *
 * NOTE(review): listing line 2091 (first statement inside the `if`) is
 * absent from this copy, so the statement that releases the old buffer is
 * not visible here -- confirm against upstream.
 */
2086 static inline void
2087 str_discard(VALUE str)
2088 {
2089  str_modifiable(str);
2090  if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2092  RSTRING(str)->as.heap.ptr = 0;
2093  RSTRING(str)->as.heap.len = 0;
2094  }
2095 }
2096 
/*
 * Raise Encoding::CompatibilityError (rb_eEncCompatError) naming the
 * encoding when str's encoding is not ASCII-compatible; otherwise do
 * nothing.
 *
 * NOTE(review): the declarator line (listing line 2098) is absent from
 * this copy; the name is not visible here -- confirm against upstream.
 */
2097 void
2099 {
2100  rb_encoding *enc = rb_enc_get(str);
2101  if (!rb_enc_asciicompat(enc)) {
2102  rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2103  }
2104 }
2105 
2106 VALUE
2107 rb_string_value(volatile VALUE *ptr)
2108 {
2109  VALUE s = *ptr;
2110  if (!RB_TYPE_P(s, T_STRING)) {
2111  s = rb_str_to_str(s);
2112  *ptr = s;
2113  }
2114  return s;
2115 }
2116 
/*
 * Coerce *ptr to a String with rb_string_value() (which may replace *ptr)
 * and return that string's byte pointer (RSTRING_PTR).
 *
 * NOTE(review): the declarator line (listing line 2118) is absent from
 * this copy; the name is not visible here -- confirm against upstream.
 */
2117 char *
2119 {
2120  VALUE str = rb_string_value(ptr);
2121  return RSTRING_PTR(str);
2122 }
2123 
/*
 * Return 1 when the first n bytes at s are all zero, 0 otherwise.
 * A non-positive n reads nothing and yields 1.
 */
static int
zero_filled(const char *s, int n)
{
    int i;

    for (i = 0; i < n; i++) {
        if (s[i] != 0) return 0;
    }
    return 1;
}
2132 
2133 static const char *
2134 str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2135 {
2136  const char *e = s + len;
2137 
2138  for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2139  if (zero_filled(s, minlen)) return s;
2140  }
2141  return 0;
2142 }
2143 
2144 static char *
2145 str_fill_term(VALUE str, char *s, long len, int termlen)
2146 {
2147  /* This function assumes that (capa + termlen) bytes of memory
2148  * is allocated, like many other functions in this file.
2149  */
2150  if (str_dependent_p(str)) {
2151  if (!zero_filled(s + len, termlen))
2152  str_make_independent_expand(str, len, 0L, termlen);
2153  }
2154  else {
2155  TERM_FILL(s + len, termlen);
2156  return s;
2157  }
2158  return RSTRING_PTR(str);
2159 }
2160 
2161 void
2162 rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
2163 {
2164  long capa = str_capacity(str, oldtermlen) + oldtermlen;
2165  long len = RSTRING_LEN(str);
2166 
2167  assert(capa >= len);
2168  if (capa - len < termlen) {
2169  rb_check_lockedtmp(str);
2170  str_make_independent_expand(str, len, 0L, termlen);
2171  }
2172  else if (str_dependent_p(str)) {
2173  if (termlen > oldtermlen)
2174  str_make_independent_expand(str, len, 0L, termlen);
2175  }
2176  else {
2177  if (!STR_EMBED_P(str)) {
2178  /* modify capa instead of realloc */
2179  assert(!FL_TEST((str), STR_SHARED));
2180  RSTRING(str)->as.heap.aux.capa = capa - termlen;
2181  }
2182  if (termlen > oldtermlen) {
2183  TERM_FILL(RSTRING_PTR(str) + len, termlen);
2184  }
2185  }
2186 
2187  return;
2188 }
2189 
2190 static char *
2191 str_null_check(VALUE str, int *w)
2192 {
2193  char *s = RSTRING_PTR(str);
2194  long len = RSTRING_LEN(str);
2195  rb_encoding *enc = rb_enc_get(str);
2196  const int minlen = rb_enc_mbminlen(enc);
2197 
2198  if (minlen > 1) {
2199  *w = 1;
2200  if (str_null_char(s, len, minlen, enc)) {
2201  return NULL;
2202  }
2203  return str_fill_term(str, s, len, minlen);
2204  }
2205  *w = 0;
2206  if (!s || memchr(s, 0, len)) {
2207  return NULL;
2208  }
2209  if (s[len]) {
2210  s = str_fill_term(str, s, len, minlen);
2211  }
2212  return s;
2213 }
2214 
/*
 * Return the result of str_null_check() for `str`: a terminated C string,
 * or NULL when the check fails.  The wide-character indicator written by
 * str_null_check() is discarded.
 *
 * NOTE(review): the declarator line (listing line 2216) is absent from
 * this copy; the name is not visible here -- confirm against upstream.
 */
2215 char *
2217 {
2218  int w;
2219  return str_null_check(str, &w);
2220 }
2221 
/*
 * Coerce *ptr to a String (rb_string_value may replace *ptr) and return
 * its contents as a terminated C string.  Raises ArgumentError when
 * str_null_check() returns NULL: "string contains null char" if it set
 * the wide-character indicator, "string contains null byte" otherwise.
 *
 * NOTE(review): the declarator line (listing line 2223) is absent from
 * this copy; the name is not visible here -- confirm against upstream.
 */
2222 char *
2224 {
2225  VALUE str = rb_string_value(ptr);
2226  int w;
2227  char *s = str_null_check(str, &w);
2228  if (!s) {
2229  if (w) {
2230  rb_raise(rb_eArgError, "string contains null char");
2231  }
2232  rb_raise(rb_eArgError, "string contains null byte");
2233  }
2234  return s;
2235 }
2236 
2237 char *
2238 rb_str_fill_terminator(VALUE str, const int newminlen)
2239 {
2240  char *s = RSTRING_PTR(str);
2241  long len = RSTRING_LEN(str);
2242  return str_fill_term(str, s, len, newminlen);
2243 }
2244 
/*
 * Try to convert `str` to a T_STRING with rb_check_convert_type_with_id
 * using the to_str method id (idTo_str), and return the converter's
 * result as is.
 *
 * NOTE(review): the declarator line (listing line 2246) is absent from
 * this copy.  String.try_convert below calls rb_check_string_type(str),
 * so this is presumably that function -- confirm against upstream.
 */
2245 VALUE
2247 {
2248  str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2249  return str;
2250 }
2251 
2252 /*
2253  * call-seq:
2254  * String.try_convert(obj) -> string or nil
2255  *
2256  * Try to convert <i>obj</i> into a String, using to_str method.
2257  * Returns converted string or nil if <i>obj</i> cannot be converted
2258  * for any reason.
2259  *
2260  * String.try_convert("str") #=> "str"
2261  * String.try_convert(/re/) #=> nil
2262  */
2263 static VALUE
2264 rb_str_s_try_convert(VALUE dummy, VALUE str)
2265 {
2266  return rb_check_string_type(str);
2267 }
2268 
/*
 * Advance from p by *nthp characters of encoding `enc`, never going past
 * e, and return the resulting position.  On return *nthp holds the count
 * that remained when the walk stopped.
 *
 * Strategies: single-byte and fixed-width encodings use pointer
 * arithmetic; ASCII-compatible encodings skip ASCII runs with
 * search_nonascii(); all others step with rb_enc_mbclen().
 */
2269 static char*
2270 str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
2271 {
2272  long nth = *nthp;
2273  if (rb_enc_mbmaxlen(enc) == 1) {
2274  p += nth;
2275  }
2276  else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2277  p += nth * rb_enc_mbmaxlen(enc);
2278  }
2279  else if (rb_enc_asciicompat(enc)) {
2280  const char *p2, *e2;
2281  int n;
2282 
2283  while (p < e && 0 < nth) {
/* e2: where we would land if every remaining character were one byte */
2284  e2 = p + nth;
2285  if (e < e2) {
/* fewer bytes are left than characters wanted: report the end */
2286  *nthp = nth;
2287  return (char *)e;
2288  }
2289  if (ISASCII(*p)) {
2290  p2 = search_nonascii(p, e2);
2291  if (!p2) {
/* a null result is treated as an all-ASCII run up to e2, consuming e2 - p characters */
2292  nth -= e2 - p;
2293  *nthp = nth;
2294  return (char *)e2;
2295  }
2296  nth -= p2 - p;
2297  p = p2;
2298  }
2299  n = rb_enc_mbclen(p, e, enc);
2300  p += n;
2301  nth--;
2302  }
2303  *nthp = nth;
2304  if (nth != 0) {
2305  return (char *)e;
2306  }
2307  return (char *)p;
2308  }
2309  else {
2310  while (p < e && nth--) {
2311  p += rb_enc_mbclen(p, e, enc);
2312  }
2313  }
/* shared tail of the non-ASCII-compatible branches: clamp to e */
2314  if (p > e) p = e;
2315  *nthp = nth;
2316  return (char*)p;
2317 }
2318 
2319 char*
2320 rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
2321 {
2322  return str_nth_len(p, e, &nth, enc);
2323 }
2324 
2325 static char*
2326 str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2327 {
2328  if (singlebyte)
2329  p += nth;
2330  else {
2331  p = str_nth_len(p, e, &nth, enc);
2332  }
2333  if (!p) return 0;
2334  if (p > e) p = e;
2335  return (char *)p;
2336 }
2337 
2338 /* char offset to byte offset */
2339 static long
2340 str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2341 {
2342  const char *pp = str_nth(p, e, nth, enc, singlebyte);
2343  if (!pp) return e - p;
2344  return pp - p;
2345 }
2346 
2347 long
2348 rb_str_offset(VALUE str, long pos)
2349 {
2350  return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
2351  STR_ENC_GET(str), single_byte_optimizable(str));
2352 }
2353 
2354 #ifdef NONASCII_MASK
/*
 * UTF-8 variant of the "advance by N characters" walk: starting at p,
 * count leading bytes until *nthp of them have been passed or e is
 * reached.  Returns the stopping position and stores the remaining count
 * in *nthp.
 *
 * NOTE(review): the word-wise part does not validate the bytes; the
 * visible caller only uses this function when the code range is
 * ENC_CODERANGE_VALID and the encoding is UTF-8.
 */
2355 static char *
2356 str_utf8_nth(const char *p, const char *e, long *nthp)
2357 {
2358  long nth = *nthp;
/* word-at-a-time only when both the range and the count exceed two words */
2359  if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
2360  const uintptr_t *s, *t;
2361  const uintptr_t lowbits = SIZEOF_VOIDP - 1;
/* s: p rounded up to a word boundary; t: e rounded down to one */
2362  s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2363  t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2364  while (p < (const char *)s) {
2365  if (is_utf8_lead_byte(*p)) nth--;
2366  p++;
2367  }
/* stop while at least a word's worth of characters is still wanted, so
 * a full word cannot overshoot the target */
2368  do {
2369  nth -= count_utf8_lead_bytes_with_word(s);
2370  s++;
2371  } while (s < t && (int)SIZEOF_VOIDP <= nth);
2372  p = (char *)s;
2373  }
/* finish byte by byte; stop on the leading byte seen once nth reaches 0 */
2374  while (p < e) {
2375  if (is_utf8_lead_byte(*p)) {
2376  if (nth == 0) break;
2377  nth--;
2378  }
2379  p++;
2380  }
2381  *nthp = nth;
2382  return (char *)p;
2383 }
2384 
/*
 * Byte distance from p to the position nth UTF-8 characters further on
 * (bounded by e).  The leftover count from str_utf8_nth() is discarded.
 */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    long remaining = nth;
    const char *stop = str_utf8_nth(p, e, &remaining);

    return stop - p;
}
2391 #endif
2392 
2393 /* byte offset to char offset */
2394 long
2395 rb_str_sublen(VALUE str, long pos)
2396 {
2397  if (single_byte_optimizable(str) || pos < 0)
2398  return pos;
2399  else {
2400  char *p = RSTRING_PTR(str);
2401  return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
2402  }
2403 }
2404 
2405 VALUE
2406 rb_str_subseq(VALUE str, long beg, long len)
2407 {
2408  VALUE str2;
2409 
2410  if (!STR_EMBEDDABLE_P(len, TERM_LEN(str)) &&
2411  SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str))) {
2412  long olen;
2413  str2 = rb_str_new_shared(rb_str_new_frozen(str));
2414  RSTRING(str2)->as.heap.ptr += beg;
2415  olen = RSTRING(str2)->as.heap.len;
2416  if (olen > len) RSTRING(str2)->as.heap.len = len;
2417  }
2418  else {
2419  str2 = rb_str_new_with_class(str, RSTRING_PTR(str)+beg, len);
2420  RB_GC_GUARD(str);
2421  }
2422 
2423  rb_enc_cr_str_copy_for_substr(str2, str);
2424  OBJ_INFECT(str2, str);
2425 
2426  return str2;
2427 }
2428 
/*
 * Translate a character-based (beg, *lenp) substring request on `str`
 * into a byte pointer and byte length.
 *
 * Returns the pointer to the first byte of the substring and stores its
 * byte length in *lenp, or returns 0 (leaving *lenp untouched) when the
 * request is out of range or *lenp is negative.  A negative beg counts
 * from the end of the string.
 */
2429 char *
2430 rb_str_subpos(VALUE str, long beg, long *lenp)
2431 {
2432  long len = *lenp;
2433  long slen = -1L;
2434  long blen = RSTRING_LEN(str);
2435  rb_encoding *enc = STR_ENC_GET(str);
2436  char *p, *s = RSTRING_PTR(str), *e = s + blen;
2437 
2438  if (len < 0) return 0;
2439  if (!blen) {
2440  len = 0;
2441  }
/* single-byte case: characters and bytes coincide, plain arithmetic */
2442  if (single_byte_optimizable(str)) {
2443  if (beg > blen) return 0;
2444  if (beg < 0) {
2445  beg += blen;
2446  if (beg < 0) return 0;
2447  }
2448  if (len > blen - beg)
2449  len = blen - beg;
2450  if (len < 0) return 0;
2451  p = s + beg;
2452  goto end;
2453  }
2454  if (beg < 0) {
/* the substring cannot extend past the end of the string */
2455  if (len > -beg) len = -beg;
/* when the start is close to the end, walk backwards from e instead of
 * counting the whole string */
2456  if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
2457  beg = -beg;
/* first move e back to the end of the substring ... */
2458  while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
2459  p = e;
2460  if (!p) return 0;
/* ... then move p back over the substring's len characters */
2461  while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
2462  if (!p) return 0;
2463  len = e - p;
2464  goto end;
2465  }
2466  else {
/* otherwise convert to a non-negative character index */
2467  slen = str_strlen(str, enc);
2468  beg += slen;
2469  if (beg < 0) return 0;
2470  p = s + beg;
2471  if (len == 0) goto end;
2472  }
2473  }
/* a character index larger than the byte length is certainly out of range */
2474  else if (beg > 0 && beg > RSTRING_LEN(str)) {
2475  return 0;
2476  }
2477  if (len == 0) {
2478  if (beg > str_strlen(str, enc)) return 0; /* str's enc */
2479  p = s + beg;
2480  }
2481 #ifdef NONASCII_MASK
2482  else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
2483  enc == rb_utf8_encoding()) {
/* valid UTF-8: use the leading-byte counting helpers */
2484  p = str_utf8_nth(s, e, &beg);
2485  if (beg > 0) return 0;
2486  len = str_utf8_offset(p, e, len);
2487  }
2488 #endif
2489  else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
/* fixed-width encoding: scale character counts by the character size */
2490  int char_sz = rb_enc_mbmaxlen(enc);
2491 
2492  p = s + beg * char_sz;
2493  if (p > e) {
2494  return 0;
2495  }
2496  else if (len * char_sz > e - p)
2497  len = e - p;
2498  else
2499  len *= char_sz;
2500  }
/* general case: walk forward beg characters; landing on e yields an empty
 * result, unless characters were still owed (out of range) */
2501  else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
2502  if (beg > 0) return 0;
2503  len = 0;
2504  }
2505  else {
2506  len = str_offset(p, e, len, enc, 0);
2507  }
2508  end:
2509  *lenp = len;
2510  RB_GC_GUARD(str);
2511  return p;
2512 }
2513 
2514 static VALUE str_substr(VALUE str, long beg, long len, int empty);
2515 
2516 VALUE
2517 rb_str_substr(VALUE str, long beg, long len)
2518 {
2519  return str_substr(str, beg, len, TRUE);
2520 }
2521 
2522 static VALUE
2523 str_substr(VALUE str, long beg, long len, int empty)
2524 {
2525  VALUE str2;
2526  char *p = rb_str_subpos(str, beg, &len);
2527 
2528  if (!p) return Qnil;
2529  if (!STR_EMBEDDABLE_P(len, TERM_LEN(str)) &&
2530  SHARABLE_SUBSTRING_P(p, len, RSTRING_END(str))) {
2531  long ofs = p - RSTRING_PTR(str);
2532  str2 = rb_str_new_frozen(str);
2533  str2 = str_new_shared(rb_obj_class(str2), str2);
2534  RSTRING(str2)->as.heap.ptr += ofs;
2535  RSTRING(str2)->as.heap.len = len;
2536  }
2537  else {
2538  if (!len && !empty) return Qnil;
2539  str2 = rb_str_new_with_class(str, p, len);
2540  OBJ_INFECT(str2, str);
2541  RB_GC_GUARD(str);
2542  }
2543  rb_enc_cr_str_copy_for_substr(str2, str);
2544 
2545  return str2;
2546 }
2547 
/*
 * Freeze `str` and return it.  An already frozen string is returned as
 * is; otherwise the string is first resized to its current length with
 * rb_str_resize() and then frozen with rb_obj_freeze().
 *
 * NOTE(review): the declarator line (listing line 2549) is absent from
 * this copy; the name is not visible here -- confirm against upstream.
 */
2548 VALUE
2550 {
2551  if (OBJ_FROZEN(str)) return str;
2552  rb_str_resize(str, RSTRING_LEN(str));
2553  return rb_obj_freeze(str);
2554 }
2555 
2556 
2557 /*
2558  * call-seq:
2559  * +str -> str (mutable)
2560  *
2561  * If the string is frozen, then return duplicated mutable string.
2562  *
2563  * If the string is not frozen, then return the string itself.
2564  */
2565 static VALUE
2566 str_uplus(VALUE str)
2567 {
2568  if (OBJ_FROZEN(str)) {
2569  return rb_str_dup(str);
2570  }
2571  else {
2572  return str;
2573  }
2574 }
2575 
2576 /*
2577  * call-seq:
2578  * -str -> str (frozen)
2579  *
2580  * If the string is frozen, then return the string itself.
2581  *
2582  * If the string is not frozen, return a frozen, possibly pre-existing
2583  * copy of it.
2584  */
2585 static VALUE
2586 str_uminus(VALUE str)
2587 {
2588  if (OBJ_FROZEN(str)) {
2589  return str;
2590  }
2591  else {
2592  return rb_fstring(str);
2593  }
2594 }
2595 
2597 #define rb_str_dup_frozen rb_str_new_frozen
2598 
2599 VALUE
2600 rb_str_locktmp(VALUE str)
2601 {
2602  if (FL_TEST(str, STR_TMPLOCK)) {
2603  rb_raise(rb_eRuntimeError, "temporal locking already locked string");
2604  }
2605  FL_SET(str, STR_TMPLOCK);
2606  return str;
2607 }
2608 
/*
 * NOTE(review): the line with this function's name and parameter list
 * (listing line 2610) is missing from this extract.
 *
 * Clears the STR_TMPLOCK flag on `str` and returns `str`.  Raises
 * RuntimeError when the flag is not currently set.
 */
2609 VALUE
2611 {
2612  if (!FL_TEST(str, STR_TMPLOCK)) {
2613  rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
2614  }
2615  FL_UNSET(str, STR_TMPLOCK);
2616  return str;
2617 }
2618 
/*
 * NOTE(review): the return-type and signature lines (listing lines
 * 2619-2620) are missing from this extract; the body uses `str`, `func`
 * and `arg`.
 *
 * Locks `str` with rb_str_locktmp(), then calls rb_ensure() with
 * `func`/`arg` as the body and rb_str_unlocktmp/`str` as the ensure
 * handler, returning rb_ensure()'s result.
 */
2621 {
2622  rb_str_locktmp(str);
2623  return rb_ensure(func, arg, rb_str_unlocktmp, str);
2624 }
2625 
2626 void
2627 rb_str_set_len(VALUE str, long len)
2628 {
2629  long capa;
2630  const int termlen = TERM_LEN(str);
2631 
2632  str_modifiable(str);
2633  if (STR_SHARED_P(str)) {
2634  rb_raise(rb_eRuntimeError, "can't set length of shared string");
2635  }
2636  if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
2637  rb_bug("probable buffer overflow: %ld for %ld", len, capa);
2638  }
2639  STR_SET_LEN(str, len);
2640  TERM_FILL(&RSTRING_PTR(str)[len], termlen);
2641 }
2642 
/*
 * Changes the byte length of `str` to `len` and returns `str`.
 *
 * Raises ArgumentError for a negative `len`.  The cached code range is
 * cleared on every call that gets past that check.  Depending on the
 * current representation the string stays embedded, is converted back to
 * embedded storage, is made independent and expanded, or has its heap
 * buffer reallocated.  Every path that changes the length ends by writing
 * the terminator at the new end with TERM_FILL.
 */
2643 VALUE
2644 rb_str_resize(VALUE str, long len)
2645 {
2646  long slen;
2647  int independent;
2648 
2649  if (len < 0) {
2650  rb_raise(rb_eArgError, "negative string size (or size too big)");
2651  }
2652 
/* Recorded up front: decides below whether the heap buffer may be freed or
 * reallocated in place. */
2653  independent = str_independent(str);
2654  ENC_CODERANGE_CLEAR(str);
2655  slen = RSTRING_LEN(str);
2656 
2657  {
2658  long capa;
2659  const int termlen = TERM_LEN(str);
/* Case 1: currently embedded. */
2660  if (STR_EMBED_P(str)) {
2661  if (len == slen) return str;
2662  if (STR_EMBEDDABLE_P(len, termlen)) {
2663  STR_SET_EMBED_LEN(str, len);
2664  TERM_FILL(RSTRING(str)->as.ary + len, termlen);
2665  return str;
2666  }
/* No longer fits: expand, then fall through to the shared tail below. */
2667  str_make_independent_expand(str, slen, len - slen, termlen);
2668  }
/* Case 2: heap string whose new length fits the embedded area.  Copy back
 * at most `len` of the old bytes. */
2669  else if (STR_EMBEDDABLE_P(len, termlen)) {
2670  char *ptr = STR_HEAP_PTR(str);
2671  STR_SET_EMBED(str);
2672  if (slen > len) slen = len;
2673  if (slen > 0) MEMCPY(RSTRING(str)->as.ary, ptr, char, slen);
2674  TERM_FILL(RSTRING(str)->as.ary + len, termlen);
2675  STR_SET_EMBED_LEN(str, len);
/* The old heap buffer is freed only when str_independent() reported true. */
2676  if (independent) ruby_xfree(ptr);
2677  return str;
2678  }
/* Case 3: heap string that is not independent. */
2679  else if (!independent) {
2680  if (len == slen) return str;
2681  str_make_independent_expand(str, slen, len - slen, termlen);
2682  }
/* Case 4: independent heap string.  Reallocate when the buffer is too small
 * or when the slack would exceed min(len, 1024). */
2683  else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
2684  (capa - len) > (len < 1024 ? len : 1024)) {
2685  REALLOC_N(RSTRING(str)->as.heap.ptr, char, (size_t)len + termlen);
2686  RSTRING(str)->as.heap.aux.capa = len;
2687  }
2688  else if (len == slen) return str;
/* Shared tail for every heap path that did not return above. */
2689  RSTRING(str)->as.heap.len = len;
2690  TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
2691  }
2692  return str;
2693 }
2694 
2695 static VALUE
2696 str_buf_cat(VALUE str, const char *ptr, long len)
2697 {
2698  long capa, total, olen, off = -1;
2699  char *sptr;
2700  const int termlen = TERM_LEN(str);
2701  assert(termlen < RSTRING_EMBED_LEN_MAX + 1); /* < (LONG_MAX/2) */
2702 
2703  RSTRING_GETMEM(str, sptr, olen);
2704  if (ptr >= sptr && ptr <= sptr + olen) {
2705  off = ptr - sptr;
2706  }
2707  rb_str_modify(str);
2708  if (len == 0) return 0;
2709  if (STR_EMBED_P(str)) {
2710  capa = RSTRING_EMBED_LEN_MAX + 1 - termlen;
2711  sptr = RSTRING(str)->as.ary;
2712  olen = RSTRING_EMBED_LEN(str);
2713  }
2714  else {
2715  capa = RSTRING(str)->as.heap.aux.capa;
2716  sptr = RSTRING(str)->as.heap.ptr;
2717  olen = RSTRING(str)->as.heap.len;
2718  }
2719  if (olen > LONG_MAX - len) {
2720  rb_raise(rb_eArgError, "string sizes too big");
2721  }
2722  total = olen + len;
2723  if (capa < total) {
2724  if (total >= LONG_MAX / 2) {
2725  capa = total;
2726  }
2727  while (total > capa) {
2728  capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
2729  }
2730  RESIZE_CAPA_TERM(str, capa, termlen);
2731  sptr = RSTRING_PTR(str);
2732  }
2733  if (off != -1) {
2734  ptr = sptr + off;
2735  }
2736  memcpy(sptr + olen, ptr, len);
2737  STR_SET_LEN(str, total);
2738  TERM_FILL(sptr + total, termlen); /* sentinel */
2739 
2740  return str;
2741 }
2742 
2743 #define str_buf_cat2(str, ptr) str_buf_cat((str), (ptr), strlen(ptr))
2744 
2745 VALUE
2746 rb_str_cat(VALUE str, const char *ptr, long len)
2747 {
2748  if (len == 0) return str;
2749  if (len < 0) {
2750  rb_raise(rb_eArgError, "negative string size (or size too big)");
2751  }
2752  return str_buf_cat(str, ptr, len);
2753 }
2754 
2755 VALUE
2756 rb_str_cat_cstr(VALUE str, const char *ptr)
2757 {
2758  must_not_null(ptr);
2759  return rb_str_buf_cat(str, ptr, strlen(ptr));
2760 }
2761 
/*
 * Compatibility names built with RUBY_ALIAS_FUNCTION (defined outside this
 * chunk).  Judging by the arguments, each line presumably defines the first
 * prototype as a forwarder to the named target with the given argument
 * list: rb_str_buf_cat -> rb_str_cat, rb_str_buf_cat2 and rb_str_cat2 ->
 * rb_str_cat_cstr -- TODO confirm against the macro definition.
 */
2762 RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
2763 RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
2764 RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
2765 
/*
 * Appends `len` bytes at `ptr` to `str`, reconciling encodings and code
 * ranges, and returns `str`.
 *
 * ptr_encindex - encoding index of the bytes being appended
 * ptr_cr       - their code range; when ENC_CODERANGE_UNKNOWN it is scanned
 *                here in the cases where the result depends on it
 * ptr_cr_ret   - when non-NULL, receives the (possibly freshly scanned)
 *                code range of the appended bytes; not written on the early
 *                returns in the non-ASCII-compatible branch
 *
 * Raises Encoding::CompatibilityError (rb_eEncCompatError) when the two
 * encodings differ and either (a) one of them is not ASCII-compatible while
 * both sides are non-empty, or (b) neither side is 7bit.  Raises
 * ArgumentError for a negative `len`.
 */
2766 static VALUE
2767 rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
2768  int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
2769 {
2770  int str_encindex = ENCODING_GET(str);
2771  int res_encindex;
2772  int str_cr, res_cr;
2773  rb_encoding *str_enc, *ptr_enc;
2774 
/* An empty receiver is treated as 7bit regardless of its cached range. */
2775  str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;
2776 
2777  if (str_encindex == ptr_encindex) {
/* Same encoding: scan the new bytes only if the receiver's range is known. */
2778  if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
2779  ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
2780  }
2781  }
2782  else {
2783  str_enc = rb_enc_from_index(str_encindex);
2784  ptr_enc = rb_enc_from_index(ptr_encindex);
/* At least one side is not ASCII-compatible: acceptable only when one side
 * is empty.  An empty receiver adopts the appended bytes' encoding. */
2785  if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
2786  if (len == 0)
2787  return str;
2788  if (RSTRING_LEN(str) == 0) {
2789  rb_str_buf_cat(str, ptr, len);
2790  ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
2791  return str;
2792  }
2793  goto incompatible;
2794  }
2795  if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
2796  ptr_cr = coderange_scan(ptr, len, ptr_enc);
2797  }
2798  if (str_cr == ENC_CODERANGE_UNKNOWN) {
2799  if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
2800  str_cr = rb_enc_str_coderange(str);
2801  }
2802  }
2803  }
2804  if (ptr_cr_ret)
2805  *ptr_cr_ret = ptr_cr;
2806 
/* Different encodings are compatible only if at least one side is 7bit. */
2807  if (str_encindex != ptr_encindex &&
2808  str_cr != ENC_CODERANGE_7BIT &&
2809  ptr_cr != ENC_CODERANGE_7BIT) {
2810  str_enc = rb_enc_from_index(str_encindex);
2811  ptr_enc = rb_enc_from_index(ptr_encindex);
/* Also reached by the goto above, where str_enc/ptr_enc are already set. */
2812  incompatible:
2813  rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
2814  rb_enc_name(str_enc), rb_enc_name(ptr_enc));
2815  }
2816 
/* Choose the resulting encoding index and code range from the two ranges. */
2817  if (str_cr == ENC_CODERANGE_UNKNOWN) {
2818  res_encindex = str_encindex;
2819  res_cr = ENC_CODERANGE_UNKNOWN;
2820  }
2821  else if (str_cr == ENC_CODERANGE_7BIT) {
2822  if (ptr_cr == ENC_CODERANGE_7BIT) {
2823  res_encindex = str_encindex;
2824  res_cr = ENC_CODERANGE_7BIT;
2825  }
2826  else {
/* 7bit receiver + non-7bit data: the result takes the data's encoding. */
2827  res_encindex = ptr_encindex;
2828  res_cr = ptr_cr;
2829  }
2830  }
2831  else if (str_cr == ENC_CODERANGE_VALID) {
2832  res_encindex = str_encindex;
2833  if (ENC_CODERANGE_CLEAN_P(ptr_cr))
2834  res_cr = str_cr;
2835  else
2836  res_cr = ptr_cr;
2837  }
2838  else { /* str_cr == ENC_CODERANGE_BROKEN */
2839  res_encindex = str_encindex;
2840  res_cr = str_cr;
/* Appending anything to a broken string resets the range to UNKNOWN. */
2841  if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
2842  }
2843 
/* NOTE(review): this check runs after `len` may already have been passed to
 * coderange_scan() above. */
2844  if (len < 0) {
2845  rb_raise(rb_eArgError, "negative string size (or size too big)");
2846  }
2847  str_buf_cat(str, ptr, len);
2848  ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
2849  return str;
2850 }
2851 
/*
 * Appends `len` bytes at `ptr`, whose encoding is `ptr_enc`, to `str` by
 * delegating to rb_enc_cr_str_buf_cat().
 *
 * NOTE(review): listing line 2856, which holds the remaining arguments of
 * the call below, is missing from this extract.
 */
2852 VALUE
2853 rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
2854 {
2855  return rb_enc_cr_str_buf_cat(str, ptr, len,
2857 }
2858 
2859 VALUE
2860 rb_str_buf_cat_ascii(VALUE str, const char *ptr)
2861 {
2862  /* ptr must reference NUL terminated ASCII string. */
2863  int encindex = ENCODING_GET(str);
2864  rb_encoding *enc = rb_enc_from_index(encindex);
2865  if (rb_enc_asciicompat(enc)) {
2866  return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
2867  encindex, ENC_CODERANGE_7BIT, 0);
2868  }
2869  else {
2870  char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
2871  while (*ptr) {
2872  unsigned int c = (unsigned char)*ptr;
2873  int len = rb_enc_codelen(c, enc);
2874  rb_enc_mbcput(c, buf, enc);
2875  rb_enc_cr_str_buf_cat(str, buf, len,
2876  encindex, ENC_CODERANGE_VALID, 0);
2877  ptr++;
2878  }
2879  return str;
2880  }
2881 }
2882 
/*
 * NOTE(review): the line with this function's name and parameter list
 * (listing line 2884) is missing from this extract; the body uses `str`
 * and `str2`.
 *
 * Appends the bytes of `str2` to `str` through rb_enc_cr_str_buf_cat(),
 * passing str2's encoding index and cached code range.  Afterwards `str`
 * is infected from `str2`, and the code range reported back by the append
 * is stored on `str2`.  Returns `str`.
 */
2883 VALUE
2885 {
2886  int str2_cr;
2887 
2888  str2_cr = ENC_CODERANGE(str2);
2889 
2890  rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
2891  ENCODING_GET(str2), str2_cr, &str2_cr);
2892 
2893  OBJ_INFECT(str, str2);
/* Cache on str2 whatever code range the append determined for it. */
2894  ENC_CODERANGE_SET(str2, str2_cr);
2895 
2896  return str;
2897 }
2898 
/*
 * NOTE(review): the line with this function's name and parameter list
 * (listing line 2900) is missing from this extract; the body uses `str`
 * and `str2`.
 *
 * Applies StringValue() to `str2`, then returns
 * rb_str_buf_append(str, str2).
 */
2899 VALUE
2901 {
2902  StringValue(str2);
2903  return rb_str_buf_append(str, str2);
2904 }
2905 
2906 #define MIN_PRE_ALLOC_SIZE 48
2907 
/*
 * Concatenates the `num` strings in `strary` into one string and returns it.
 *
 * With no elements an empty string is returned; with one element the result
 * is rb_str_resurrect(strary[0]).  Otherwise the byte lengths are summed
 * (starting from 1): below MIN_PRE_ALLOC_SIZE the result starts from
 * rb_str_resurrect(strary[0]) and the rest is appended; at or above it a
 * buffer of that size is allocated first, given strary[0]'s encoding, and
 * every element is appended.
 */
2908 VALUE
2909 rb_str_concat_literals(size_t num, const VALUE *strary)
2910 {
2911  VALUE str;
2912  size_t i, s;
2913  long len = 1;
2914 
2915  if (UNLIKELY(!num)) return rb_str_new(0, 0);
2916  if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
2917 
2918  for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
2919  if (LIKELY(len < MIN_PRE_ALLOC_SIZE)) {
2920  str = rb_str_resurrect(strary[0]);
/* strary[0] is already in `str`; start appending at index 1. */
2921  s = 1;
2922  }
2923  else {
2924  str = rb_str_buf_new(len);
2925  rb_enc_copy(str, strary[0]);
2926  s = 0;
2927  }
2928 
2929  for (i = s; i < num; ++i) {
2930  const VALUE v = strary[i];
2931  int encidx = ENCODING_GET(v);
2932 
2933  rb_enc_cr_str_buf_cat(str, RSTRING_PTR(v), RSTRING_LEN(v),
2934  encidx, ENC_CODERANGE(v), NULL);
2935  OBJ_INFECT_RAW(str, v);
/* NOTE(review): listing line 2937 is missing between the next two lines, so
 * the exact condition guarding rb_enc_set_index() cannot be read here. */
2936  if (encidx != ENCINDEX_US_ASCII) {
2938  rb_enc_set_index(str, encidx);
2939  }
2940  }
2941  return str;
2942 }
2943 
2944 /*
2945  * call-seq:
2946  * str.concat(obj1, obj2,...) -> str
2947  * str.concat(integer1, integer2,...) -> str
2948  *
2949  * Concatenates the given object(s) to <i>str</i>. If an object is an
2950  * <code>Integer</code>, it is considered a codepoint and converted
2951  * to a character before concatenation.
2952  *
2953  * +concat+ can take multiple arguments, and all the arguments are concatenated
2954  * in order. See String#<<, which takes a single argument.
2955  *
2956  * a = "hello "
2957  * a.concat("world", 33) #=> "hello world!"
2958  *
2959  * b = "sn"
2960  * b.concat("_", b, "_", b) #=> "sn_sn_sn"
2961  */
2962 static VALUE
2963 rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
2964 {
2965  str_modifiable(str);
2966 
2967  if (argc == 1) {
2968  return rb_str_concat(str, argv[0]);
2969  }
2970  else if (argc > 1) {
2971  int i;
2972  VALUE arg_str = rb_str_tmp_new(0);
2973  rb_enc_copy(arg_str, str);
2974  for (i = 0; i < argc; i++) {
2975  rb_str_concat(arg_str, argv[i]);
2976  }
2977  rb_str_buf_append(str, arg_str);
2978  }
2979 
2980  return str;
2981 }
2982 
2983 /*
2984  * call-seq:
2985  * str << obj -> str
2986  * str << integer -> str
2987  *
2988  * Append the given object to <i>str</i>. If the object is an
2989  * <code>Integer</code>, it is considered a codepoint and converted
2990  * to a character before being appended.
2991  *
2992  * See String#concat, which takes multiple arguments.
2993  *
2994  * a = "hello "
2995  * a << "world" #=> "hello world"
2996  * a << 33 #=> "hello world!"
2997  */
2998 VALUE
3000 {
/*
 * NOTE(review): several listing lines are missing from this extract: 2999
 * (function name and parameters; the body uses `str1` and `str2`),
 * 3029-3030 (body of the US-ASCII `code > 127` branch) and 3040/3043
 * (labels inside the switch below).
 *
 * An Integer `str2` is converted to an unsigned code point and appended as
 * one character in str1's encoding; RangeError is raised when it does not
 * fit.  Any other `str2` is handed to rb_str_append().  Returns `str1`.
 */
3001  unsigned int code;
3002  rb_encoding *enc = STR_ENC_GET(str1);
3003  int encidx;
3004 
3005  if (RB_INTEGER_TYPE_P(str2)) {
/* A zero result means the conversion succeeded; nothing more to do here. */
3006  if (rb_num_to_uint(str2, &code) == 0) {
3007  }
3008  else if (FIXNUM_P(str2)) {
3009  rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
3010  }
3011  else {
3012  rb_raise(rb_eRangeError, "bignum out of char range");
3013  }
3014  }
3015  else {
3016  return rb_str_append(str1, str2);
3017  }
3018 
3019  encidx = rb_enc_to_index(enc);
/* Single-byte path: one byte is appended; codes above 0xFF are rejected. */
3020  if (encidx == ENCINDEX_ASCII || encidx == ENCINDEX_US_ASCII) {
3021  /* US-ASCII automatically extended to ASCII-8BIT */
3022  char buf[1];
3023  buf[0] = (char)code;
3024  if (code > 0xFF) {
3025  rb_raise(rb_eRangeError, "%u out of char range", code);
3026  }
3027  rb_str_cat(str1, buf, 1);
3028  if (encidx == ENCINDEX_US_ASCII && code > 127) {
3031  }
3032  }
3033  else {
3034  long pos = RSTRING_LEN(str1);
3035  int cr = ENC_CODERANGE(str1);
3036  int len;
3037  char *buf;
3038 
3039  switch (len = rb_enc_codelen(code, enc)) {
3041  rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3042  break;
3044  case 0:
3045  rb_raise(rb_eRangeError, "%u out of char range", code);
3046  break;
3047  }
3048  buf = ALLOCA_N(char, len + 1);
3049  rb_enc_mbcput(code, buf, enc);
/* Re-check the encoded bytes: they must form one character of `len` bytes. */
3050  if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
3051  rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3052  }
/* Grow the receiver by `len` bytes and copy the character into the tail. */
3053  rb_str_resize(str1, pos+len);
3054  memcpy(RSTRING_PTR(str1) + pos, buf, len);
/* rb_str_resize() cleared the code range; restore the value saved in `cr`,
 * promoted from 7bit to valid when a non-ASCII code was added. */
3055  if (cr == ENC_CODERANGE_7BIT && code > 127)
3056  cr = ENC_CODERANGE_VALID;
3057  ENC_CODERANGE_SET(str1, cr);
3058  }
3059  return str1;
3060 }
3061 
3062 /*
3063  * call-seq:
3064  * str.prepend(other_str1, other_str2,...) -> str
3065  *
3066  * Prepend---Prepend the given strings to <i>str</i>.
3067  *
3068  * a = "!"
3069  * a.prepend("hello ", "world") #=> "hello world!"
3070  * a #=> "hello world!"
3071  *
3072  * See also String#concat.
3073  */
3074 
3075 static VALUE
3076 rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
3077 {
3078  str_modifiable(str);
3079 
3080  if (argc > 0) {
3081  int i;
3082  VALUE arg_str = rb_str_tmp_new(0);
3083  rb_enc_copy(arg_str, str);
3084  for (i = 0; i < argc; i++) {
3085  rb_str_append(arg_str, argv[i]);
3086  }
3087  rb_str_update(str, 0L, 0L, arg_str);
3088  }
3089 
3090  return str;
3091 }
3092 
/*
 * NOTE(review): the line with this function's name and parameter list
 * (listing line 3094) is missing from this extract; the body uses `str`.
 *
 * Returns rb_memhash() of the string's bytes XORed with its encoding index.
 * The index is replaced by 0 when the string's code range is 7bit, so the
 * encoding does not contribute to the hash of a 7bit string.
 */
3093 st_index_t
3095 {
3096  int e = ENCODING_GET(str);
3097  if (e && rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
3098  e = 0;
3099  }
3100  return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e;
3101 }
3102 
/*
 * NOTE(review): the line with this function's name and parameter list
 * (listing line 3104) is missing from this extract; the body uses `str1`
 * and `str2`.
 *
 * Returns non-zero when the two strings differ and 0 when they are equal.
 * They differ when their byte lengths differ, when rb_str_comparable()
 * rejects the pair, or when their bytes differ.
 */
3103 int
3105 {
3106  long len1, len2;
3107  const char *ptr1, *ptr2;
3108  RSTRING_GETMEM(str1, ptr1, len1);
3109  RSTRING_GETMEM(str2, ptr2, len2);
3110  return (len1 != len2 ||
3111  !rb_str_comparable(str1, str2) ||
3112  memcmp(ptr1, ptr2, len1) != 0);
3113 }
3114 
3115 /*
3116  * call-seq:
3117  * str.hash -> integer
3118  *
3119  * Return a hash based on the string's length, content and encoding.
3120  *
3121  * See also Object#hash.
3122  */
3123 
3124 static VALUE
3125 rb_str_hash_m(VALUE str)
3126 {
3127  st_index_t hval = rb_str_hash(str);
3128  return ST2FIX(hval);
3129 }
3130 
3131 #define lesser(a,b) (((a)>(b))?(b):(a))
3132 
/*
 * NOTE(review): three listing lines are missing from this extract: 3134
 * (function name and parameters; the body uses `str1` and `str2`) and
 * 3148 / 3152, each of which sits directly before a `return TRUE` and
 * presumably holds a condition guarding it -- confirm against the full
 * source.
 *
 * Visible behaviour: returns TRUE when either string is empty, when both
 * have the same encoding index, or when both code ranges are 7bit; returns
 * FALSE when neither code range is 7bit.
 */
3133 int
3135 {
3136  int idx1, idx2;
3137  int rc1, rc2;
3138 
3139  if (RSTRING_LEN(str1) == 0) return TRUE;
3140  if (RSTRING_LEN(str2) == 0) return TRUE;
3141  idx1 = ENCODING_GET(str1);
3142  idx2 = ENCODING_GET(str2);
3143  if (idx1 == idx2) return TRUE;
3144  rc1 = rb_enc_str_coderange(str1);
3145  rc2 = rb_enc_str_coderange(str2);
3146  if (rc1 == ENC_CODERANGE_7BIT) {
3147  if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
3149  return TRUE;
3150  }
3151  if (rc2 == ENC_CODERANGE_7BIT) {
3153  return TRUE;
3154  }
3155  return FALSE;
3156 }
3157 
/*
 * NOTE(review): the line with this function's name and parameter list
 * (listing line 3159) is missing from this extract; the body uses `str1`
 * and `str2`.
 *
 * Three-way comparison returning -1, 0 or 1.  The common prefix (the
 * shorter of the two byte lengths) is compared with memcmp(); on a tie the
 * longer string is greater.  When bytes and lengths are equal but
 * rb_str_comparable() rejects the pair, the encoding indexes decide.
 */
3158 int
3160 {
3161  long len1, len2;
3162  const char *ptr1, *ptr2;
3163  int retval;
3164 
3165  if (str1 == str2) return 0;
3166  RSTRING_GETMEM(str1, ptr1, len1);
3167  RSTRING_GETMEM(str2, ptr2, len2);
/* Identical buffers skip the memcmp(); `retval` is then left unset but is
 * not read on that path. */
3168  if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
3169  if (len1 == len2) {
3170  if (!rb_str_comparable(str1, str2)) {
3171  if (ENCODING_GET(str1) > ENCODING_GET(str2))
3172  return 1;
3173  return -1;
3174  }
3175  return 0;
3176  }
3177  if (len1 > len2) return 1;
3178  return -1;
3179  }
3180  if (retval > 0) return 1;
3181  return -1;
3182 }
3183 
3184 /* expect tail call optimization */
3185 static VALUE
3186 str_eql(const VALUE str1, const VALUE str2)
3187 {
3188  const long len = RSTRING_LEN(str1);
3189  const char *ptr1, *ptr2;
3190 
3191  if (len != RSTRING_LEN(str2)) return Qfalse;
3192  if (!rb_str_comparable(str1, str2)) return Qfalse;
3193  if ((ptr1 = RSTRING_PTR(str1)) == (ptr2 = RSTRING_PTR(str2)))
3194  return Qtrue;
3195  if (memcmp(ptr1, ptr2, len) == 0)
3196  return Qtrue;
3197  return Qfalse;
3198 }
3199 
3200 /*
3201  * call-seq:
3202  * str == obj -> true or false
3203  * str === obj -> true or false
3204  *
3205  * Equality---Returns whether +str+ == +obj+, similar to Object#==.
3206  *
3207  * If +obj+ is not an instance of String but responds to +to_str+, then the
3208  * two strings are compared using <code>obj.==</code>.
3209  *
3210  * Otherwise, returns similarly to String#eql?, comparing length and content.
3211  */
3212 
3213 VALUE
3215 {
/*
 * NOTE(review): the line with this function's name and parameter list
 * (listing line 3214) is missing from this extract; the body uses `str1`
 * and `str2`.
 *
 * Identical objects are equal.  A non-String `str2` is equal only if it
 * responds to to_str and rb_equal(str2, str1) says so (the comparison is
 * delegated to `str2`).  Two Strings are compared with str_eql().
 */
3216  if (str1 == str2) return Qtrue;
3217  if (!RB_TYPE_P(str2, T_STRING)) {
3218  if (!rb_respond_to(str2, idTo_str)) {
3219  return Qfalse;
3220  }
3221  return rb_equal(str2, str1);
3222  }
3223  return str_eql(str1, str2);
3224 }
3225 
3226 /*
3227  * call-seq:
3228  * str.eql?(other) -> true or false
3229  *
3230  * Two strings are equal if they have the same length and content.
3231  */
3232 
3233 VALUE
3235 {
/*
 * NOTE(review): the line with this function's name and parameter list
 * (listing line 3234) is missing from this extract; the body uses `str1`
 * and `str2`.
 *
 * Identical objects are equal; a non-String `str2` never is; two Strings
 * are compared with str_eql().
 */
3236  if (str1 == str2) return Qtrue;
3237  if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
3238  return str_eql(str1, str2);
3239 }
3240 
3241 /*
3242  * call-seq:
3243  * string <=> other_string -> -1, 0, +1, or nil
3244  *
3245  * Comparison---Returns -1, 0, +1, or +nil+ depending on whether +string+ is
3246  * less than, equal to, or greater than +other_string+.
3247  *
3248  * +nil+ is returned if the two values are incomparable.
3249  *
3250  * If the strings are of different lengths, and the strings are equal when
3251  * compared up to the shortest length, then the longer string is considered
3252  * greater than the shorter one.
3253  *
3254  * <code><=></code> is the basis for the methods <code><</code>,
3255  * <code><=</code>, <code>></code>, <code>>=</code>, and
3256  * <code>between?</code>, included from module Comparable. The method
3257  * String#== does not use Comparable#==.
3258  *
3259  * "abcdef" <=> "abcde" #=> 1
3260  * "abcdef" <=> "abcdef" #=> 0
3261  * "abcdef" <=> "abcdefg" #=> -1
3262  * "abcdef" <=> "ABCDEF" #=> 1
3263  * "abcdef" <=> 1 #=> nil
3264  */
3265 
3266 static VALUE
3267 rb_str_cmp_m(VALUE str1, VALUE str2)
3268 {
3269  int result;
3270  VALUE s = rb_check_string_type(str2);
3271  if (NIL_P(s)) {
3272  return rb_invcmp(str1, str2);
3273  }
3274  result = rb_str_cmp(str1, s);
3275  return INT2FIX(result);
3276 }
3277 
3278 static VALUE str_casecmp(VALUE str1, VALUE str2);
3279 static VALUE str_casecmp_p(VALUE str1, VALUE str2);
3280 
3281 /*
3282  * call-seq:
3283  * str.casecmp(other_str) -> -1, 0, +1, or nil
3284  *
3285  * Case-insensitive version of <code>String#<=></code>.
3286  * Currently, case-insensitivity only works on characters A-Z/a-z,
3287  * not all of Unicode. This is different from String#casecmp?.
3288  *
3289  * "aBcDeF".casecmp("abcde") #=> 1
3290  * "aBcDeF".casecmp("abcdef") #=> 0
3291  * "aBcDeF".casecmp("abcdefg") #=> -1
3292  * "abcdef".casecmp("ABCDEF") #=> 0
3293  *
3294  * +nil+ is returned if the two strings have incompatible encodings,
3295  * or if +other_str+ is not a string.
3296  *
3297  * "foo".casecmp(2) #=> nil
3298  * "\u{e4 f6 fc}".encode("ISO-8859-1").casecmp("\u{c4 d6 dc}") #=> nil
3299  */
3300 
3301 static VALUE
3302 rb_str_casecmp(VALUE str1, VALUE str2)
3303 {
3304  VALUE s = rb_check_string_type(str2);
3305  if (NIL_P(s)) {
3306  return Qnil;
3307  }
3308  return str_casecmp(str1, s);
3309 }
3310 
3311 static VALUE
3312 str_casecmp(VALUE str1, VALUE str2)
3313 {
3314  long len;
3315  rb_encoding *enc;
3316  char *p1, *p1end, *p2, *p2end;
3317 
3318  enc = rb_enc_compatible(str1, str2);
3319  if (!enc) {
3320  return Qnil;
3321  }
3322 
3323  p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
3324  p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
3325  if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
3326  while (p1 < p1end && p2 < p2end) {
3327  if (*p1 != *p2) {
3328  unsigned int c1 = TOUPPER(*p1 & 0xff);
3329  unsigned int c2 = TOUPPER(*p2 & 0xff);
3330  if (c1 != c2)
3331  return INT2FIX(c1 < c2 ? -1 : 1);
3332  }
3333  p1++;
3334  p2++;
3335  }
3336  }
3337  else {
3338  while (p1 < p1end && p2 < p2end) {
3339  int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
3340  int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
3341 
3342  if (0 <= c1 && 0 <= c2) {
3343  c1 = TOUPPER(c1);
3344  c2 = TOUPPER(c2);
3345  if (c1 != c2)
3346  return INT2FIX(c1 < c2 ? -1 : 1);
3347  }
3348  else {
3349  int r;
3350  l1 = rb_enc_mbclen(p1, p1end, enc);
3351  l2 = rb_enc_mbclen(p2, p2end, enc);
3352  len = l1 < l2 ? l1 : l2;
3353  r = memcmp(p1, p2, len);
3354  if (r != 0)
3355  return INT2FIX(r < 0 ? -1 : 1);
3356  if (l1 != l2)
3357  return INT2FIX(l1 < l2 ? -1 : 1);
3358  }
3359  p1 += l1;
3360  p2 += l2;
3361  }
3362  }
3363  if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
3364  if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
3365  return INT2FIX(-1);
3366 }
3367 
3368 /*
3369  * call-seq:
3370  * str.casecmp?(other_str) -> true, false, or nil
3371  *
3372  * Returns +true+ if +str+ and +other_str+ are equal after
3373  * Unicode case folding, +false+ if they are not equal.
3374  *
3375  * "aBcDeF".casecmp?("abcde") #=> false
3376  * "aBcDeF".casecmp?("abcdef") #=> true
3377  * "aBcDeF".casecmp?("abcdefg") #=> false
3378  * "abcdef".casecmp?("ABCDEF") #=> true
3379  * "\u{e4 f6 fc}".casecmp?("\u{c4 d6 dc}") #=> true
3380  *
3381  * +nil+ is returned if the two strings have incompatible encodings,
3382  * or if +other_str+ is not a string.
3383  *
3384  * "foo".casecmp?(2) #=> nil
3385  * "\u{e4 f6 fc}".encode("ISO-8859-1").casecmp?("\u{c4 d6 dc}") #=> nil
3386  */
3387 
3388 static VALUE
3389 rb_str_casecmp_p(VALUE str1, VALUE str2)
3390 {
3391  VALUE s = rb_check_string_type(str2);
3392  if (NIL_P(s)) {
3393  return Qnil;
3394  }
3395  return str_casecmp_p(str1, s);
3396 }
3397 
3398 static VALUE
3399 str_casecmp_p(VALUE str1, VALUE str2)
3400 {
3401  rb_encoding *enc;
3402  VALUE folded_str1, folded_str2;
3403  VALUE fold_opt = sym_fold;
3404 
3405  enc = rb_enc_compatible(str1, str2);
3406  if (!enc) {
3407  return Qnil;
3408  }
3409 
3410  folded_str1 = rb_str_downcase(1, &fold_opt, str1);
3411  folded_str2 = rb_str_downcase(1, &fold_opt, str2);
3412 
3413  return rb_str_eql(folded_str1, folded_str2);
3414 }
3415 
3416 #define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
3417 
3418 static long
3419 rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
3420 {
3421  const char *str_ptr, *str_ptr_end, *sub_ptr, *search_start;
3422  long pos, str_len, sub_len, search_len;
3423  int single_byte = single_byte_optimizable(str);
3424  rb_encoding *enc;
3425 
3426  enc = rb_enc_check(str, sub);
3427  if (is_broken_string(sub)) return -1;
3428 
3429  str_ptr = RSTRING_PTR(str);
3430  str_ptr_end = RSTRING_END(str);
3431  str_len = RSTRING_LEN(str);
3432  sub_ptr = RSTRING_PTR(sub);
3433  sub_len = RSTRING_LEN(sub);
3434 
3435  if (str_len < sub_len) return -1;
3436 
3437  if (offset != 0) {
3438  long str_len_char, sub_len_char;
3439  str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
3440  sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
3441  if (offset < 0) {
3442  offset += str_len_char;
3443  if (offset < 0) return -1;
3444  }
3445  if (str_len_char - offset < sub_len_char) return -1;
3446  if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
3447  str_ptr += offset;
3448  }
3449  if (sub_len == 0) return offset;
3450 
3451  /* need proceed one character at a time */
3452 
3453  search_start = str_ptr;
3454  search_len = RSTRING_LEN(str) - offset;
3455  for (;;) {
3456  const char *t;
3457  pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
3458  if (pos < 0) return pos;
3459  t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
3460  if (t == search_start + pos) break;
3461  search_len -= t - search_start;
3462  if (search_len <= 0) return -1;
3463  offset += t - search_start;
3464  search_start = t;
3465  }
3466  return pos + offset;
3467 }
3468 
3469 
3470 /*
3471  * call-seq:
3472  * str.index(substring [, offset]) -> integer or nil
3473  * str.index(regexp [, offset]) -> integer or nil
3474  *
3475  * Returns the index of the first occurrence of the given <i>substring</i> or
3476  * pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
3477  * found. If the second parameter is present, it specifies the position in the
3478  * string to begin the search.
3479  *
3480  * "hello".index('e') #=> 1
3481  * "hello".index('lo') #=> 3
3482  * "hello".index('a') #=> nil
3483  * "hello".index(?e) #=> 1
3484  * "hello".index(/[aeiou]/, -3) #=> 4
3485  */
3486 
3487 static VALUE
3488 rb_str_index_m(int argc, VALUE *argv, VALUE str)
3489 {
/*
 * Implements String#index: one required argument (substring or regexp)
 * and an optional start position counted in characters.  Returns the
 * character index of the first match as an Integer, or nil.
 */
3490  VALUE sub;
3491  VALUE initpos;
3492  long pos;
3493 
3494  if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
3495  pos = NUM2LONG(initpos);
3496  }
3497  else {
3498  pos = 0;
3499  }
/* A negative start counts from the end; still negative means no match. */
3500  if (pos < 0) {
3501  pos += str_strlen(str, NULL);
3502  if (pos < 0) {
/* NOTE(review): listing line 3504 (the body of this regexp-only branch) is
 * missing from this extract. */
3503  if (RB_TYPE_P(sub, T_REGEXP)) {
3505  }
3506  return Qnil;
3507  }
3508  }
3509 
3510  if (SPECIAL_CONST_P(sub)) goto generic;
3511  switch (BUILTIN_TYPE(sub)) {
3512  case T_REGEXP:
3513  if (pos > str_strlen(str, NULL))
3514  return Qnil;
/* rb_reg_search() is given a byte offset; its result is converted back to
 * a character index with rb_str_sublen(). */
3515  pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
3516  rb_enc_check(str, sub), single_byte_optimizable(str));
3517 
3518  pos = rb_reg_search(sub, str, pos, 0);
3519  pos = rb_str_sublen(str, pos);
3520  break;
3521 
/* Anything that is neither Regexp nor String must convert to a String,
 * otherwise TypeError is raised. */
3522  generic:
3523  default: {
3524  VALUE tmp;
3525 
3526  tmp = rb_check_string_type(sub);
3527  if (NIL_P(tmp)) {
3528  rb_raise(rb_eTypeError, "type mismatch: %s given",
3529  rb_obj_classname(sub));
3530  }
3531  sub = tmp;
3532  }
3533  /* fall through */
3534  case T_STRING:
3535  pos = rb_str_index(str, sub, pos);
3536  pos = rb_str_sublen(str, pos);
3537  break;
3538  }
3539 
3540  if (pos == -1) return Qnil;
3541  return LONG2NUM(pos);
3542 }
3543 
3544 #ifdef HAVE_MEMRCHR
3545 static long
3546 str_rindex(VALUE str, VALUE sub, const char *s, long pos, rb_encoding *enc)
3547 {
3548  char *hit, *adjusted;
3549  int c;
3550  long slen, searchlen;
3551  char *sbeg, *e, *t;
3552 
3553  slen = RSTRING_LEN(sub);
3554  if (slen == 0) return pos;
3555  sbeg = RSTRING_PTR(str);
3556  e = RSTRING_END(str);
3557  t = RSTRING_PTR(sub);
3558  c = *t & 0xff;
3559  searchlen = s - sbeg + 1;
3560 
3561  do {
3562  hit = memrchr(sbeg, c, searchlen);
3563  if (!hit) break;
3564  adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
3565  if (hit != adjusted) {
3566  searchlen = adjusted - sbeg;
3567  continue;
3568  }
3569  if (memcmp(hit, t, slen) == 0)
3570  return rb_str_sublen(str, hit - sbeg);
3571  searchlen = adjusted - sbeg;
3572  } while (searchlen > 0);
3573 
3574  return -1;
3575 }
3576 #else
3577 static long
3578 str_rindex(VALUE str, VALUE sub, const char *s, long pos, rb_encoding *enc)
3579 {
3580  long slen;
3581  char *sbeg, *e, *t;
3582 
3583  sbeg = RSTRING_PTR(str);
3584  e = RSTRING_END(str);
3585  t = RSTRING_PTR(sub);
3586  slen = RSTRING_LEN(sub);
3587 
3588  while (s) {
3589  if (memcmp(s, t, slen) == 0) {
3590  return pos;
3591  }
3592  if (pos == 0) break;
3593  pos--;
3594  s = rb_enc_prev_char(sbeg, s, e, enc);
3595  }
3596 
3597  return -1;
3598 }
3599 #endif
3600 
3601 static long
3602 rb_str_rindex(VALUE str, VALUE sub, long pos)
3603 {
3604  long len, slen;
3605  char *sbeg, *s;
3606  rb_encoding *enc;
3607  int singlebyte;
3608 
3609  enc = rb_enc_check(str, sub);
3610  if (is_broken_string(sub)) return -1;
3611  singlebyte = single_byte_optimizable(str);
3612  len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
3613  slen = str_strlen(sub, enc); /* rb_enc_check */
3614 
3615  /* substring longer than string */
3616  if (len < slen) return -1;
3617  if (len - pos < slen) pos = len - slen;
3618  if (len == 0) return pos;
3619 
3620  sbeg = RSTRING_PTR(str);
3621 
3622  if (pos == 0) {
3623  if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
3624  return 0;
3625  else
3626  return -1;
3627  }
3628 
3629  s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
3630  return str_rindex(str, sub, s, pos, enc);
3631 }
3632 
3633 
3634 /*
3635  * call-seq:
3636  * str.rindex(substring [, integer]) -> integer or nil
3637  * str.rindex(regexp [, integer]) -> integer or nil
3638  *
3639  * Returns the index of the last occurrence of the given <i>substring</i> or
3640  * pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
3641  * found. If the second parameter is present, it specifies the position in the
3642  * string to end the search---characters beyond this point will not be
3643  * considered.
3644  *
3645  * "hello".rindex('e') #=> 1
3646  * "hello".rindex('l') #=> 3
3647  * "hello".rindex('a') #=> nil
3648  * "hello".rindex(?e) #=> 1
3649  * "hello".rindex(/[aeiou]/, -2) #=> 1
3650  */
3651 
3652 static VALUE
3653 rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
3654 {
/*
 * Implements String#rindex: one required argument (substring or regexp)
 * and an optional end position counted in characters.  Returns the
 * character index of the last match as an Integer, or nil.
 */
3655  VALUE sub;
3656  VALUE vpos;
3657  rb_encoding *enc = STR_ENC_GET(str);
3658  long pos, len = str_strlen(str, enc); /* str's enc */
3659 
3660  if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
3661  pos = NUM2LONG(vpos);
/* A negative position counts from the end; still negative means no match. */
3662  if (pos < 0) {
3663  pos += len;
3664  if (pos < 0) {
/* NOTE(review): listing line 3666 (the body of this regexp-only branch) is
 * missing from this extract. */
3665  if (RB_TYPE_P(sub, T_REGEXP)) {
3667  }
3668  return Qnil;
3669  }
3670  }
/* Positions past the end are clamped to the string length. */
3671  if (pos > len) pos = len;
3672  }
3673  else {
3674  pos = len;
3675  }
3676 
3677  if (SPECIAL_CONST_P(sub)) goto generic;
3678  switch (BUILTIN_TYPE(sub)) {
3679  case T_REGEXP:
3680  /* enc = rb_get_check(str, sub); */
/* rb_reg_search() is given a byte offset and a non-zero last argument; its
 * result is converted back to a character index with rb_str_sublen(). */
3681  pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
3682  enc, single_byte_optimizable(str));
3683 
3684  pos = rb_reg_search(sub, str, pos, 1);
3685  pos = rb_str_sublen(str, pos);
3686  if (pos >= 0) return LONG2NUM(pos);
3687  break;
3688 
/* Anything that is neither Regexp nor String must convert to a String,
 * otherwise TypeError is raised. */
3689  generic:
3690  default: {
3691  VALUE tmp;
3692 
3693  tmp = rb_check_string_type(sub);
3694  if (NIL_P(tmp)) {
3695  rb_raise(rb_eTypeError, "type mismatch: %s given",
3696  rb_obj_classname(sub));
3697  }
3698  sub = tmp;
3699  }
3700  /* fall through */
3701  case T_STRING:
3702  pos = rb_str_rindex(str, sub, pos);
3703  if (pos >= 0) return LONG2NUM(pos);
3704  break;
3705  }
3706  return Qnil;
3707 }
3708 
3709 /*
3710  * call-seq:
3711  * str =~ obj -> integer or nil
3712  *
3713  * Match---If <i>obj</i> is a <code>Regexp</code>, use it as a pattern to match
3714  * against <i>str</i>, and returns the position the match starts, or
3715  * <code>nil</code> if there is no match. Otherwise, invokes
3716  * <i>obj.=~</i>, passing <i>str</i> as an argument. The default
3717  * <code>=~</code> in <code>Object</code> returns <code>nil</code>.
3718  *
3719  * Note: <code>str =~ regexp</code> is not the same as
3720  * <code>regexp =~ str</code>. Strings captured from named capture groups
3721  * are assigned to local variables only in the second case.
3722  *
3723  * "cat o' 9 tails" =~ /\d/ #=> 7
3724  * "cat o' 9 tails" =~ 9 #=> nil
3725  */
3726 
3727 static VALUE
3728 rb_str_match(VALUE x, VALUE y)
3729 {
3730  if (SPECIAL_CONST_P(y)) goto generic;
3731  switch (BUILTIN_TYPE(y)) {
3732  case T_STRING:
3733  rb_raise(rb_eTypeError, "type mismatch: String given");
3734 
3735  case T_REGEXP:
3736  return rb_reg_match(y, x);
3737 
3738  generic:
3739  default:
3740  return rb_funcall(y, idEqTilde, 1, x);
3741  }
3742 }
3743 
3744 
/* forward declaration; get_pat is defined further down and used by the match methods below */
3745 static VALUE get_pat(VALUE);
3746 
3747 
3748 /*
3749  * call-seq:
3750  * str.match(pattern) -> matchdata or nil
3751  * str.match(pattern, pos) -> matchdata or nil
3752  *
3753  * Converts <i>pattern</i> to a <code>Regexp</code> (if it isn't already one),
3754  * then invokes its <code>match</code> method on <i>str</i>. If the second
3755  * parameter is present, it specifies the position in the string to begin the
3756  * search.
3757  *
3758  * 'hello'.match('(.)\1') #=> #<MatchData "ll" 1:"l">
3759  * 'hello'.match('(.)\1')[0] #=> "ll"
3760  * 'hello'.match(/(.)\1/)[0] #=> "ll"
3761  * 'hello'.match(/(.)\1/, 3) #=> nil
3762  * 'hello'.match('xx') #=> nil
3763  *
3764  * If a block is given, invokes the block with the MatchData if the match succeeds, so
3765  * that you can write
3766  *
3767  * str.match(pat) {|m| ...}
3768  *
3769  * instead of
3770  *
3771  * if m = str.match(pat)
3772  * ...
3773  * end
3774  *
3775  * The return value is a value from block execution in this case.
3776  */
3777 
3778 static VALUE
3779 rb_str_match_m(int argc, VALUE *argv, VALUE str)
3780 {
3781  VALUE re, result;
3782  if (argc < 1)
3783  rb_check_arity(argc, 1, 2);
3784  re = argv[0];
3785  argv[0] = str;
3786  result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
3787  if (!NIL_P(result) && rb_block_given_p()) {
3788  return rb_yield(result);
3789  }
3790  return result;
3791 }
3792 
3793 /*
3794  * call-seq:
3795  * str.match?(pattern) -> true or false
3796  * str.match?(pattern, pos) -> true or false
3797  *
3798  * Converts _pattern_ to a +Regexp+ (if it isn't already one), then
3799  * returns +true+ or +false+ to indicate whether the regexp
3800  * matches _str_, without updating <code>$~</code> and other
3801  * related variables. If the second parameter is present, it
3802  * specifies the position in the string to begin the search.
3803  *
3804  * "Ruby".match?(/R.../) #=> true
3805  * "Ruby".match?(/R.../, 1) #=> false
3806  * "Ruby".match?(/P.../) #=> false
3807  * $& #=> nil
3808  */
3809 
3810 static VALUE
3811 rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
3812 {
3813  VALUE re;
3814  rb_check_arity(argc, 1, 2);
3815  re = get_pat(argv[0]);
3816  return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
3817 }
3818 
3823 };
3824 
/*
 * Overwrites the len bytes at p with the next byte sequence that enc
 * accepts as one character of exactly len bytes.
 *
 * Returns NEIGHBOR_FOUND when such a sequence was written, NEIGHBOR_WRAPPED
 * when the increment ran off the top (all bytes were 0xff, or the next code
 * point needs a different length), and NEIGHBOR_NOT_CHAR when, in the
 * fixed-width branch, the bytes are not a valid character.
 */
3825 static enum neighbor_char
3826 enc_succ_char(char *p, long len, rb_encoding *enc)
3827 {
3828  long i;
3829  int l;
3830 
3831  if (rb_enc_mbminlen(enc) > 1) {
3832  /* wchar, trivial case */
/* decode, add one to the code point, re-encode if the length is unchanged */
3833  int r = rb_enc_precise_mbclen(p, p + len, enc), c;
3834  if (!MBCLEN_CHARFOUND_P(r)) {
3835  return NEIGHBOR_NOT_CHAR;
3836  }
3837  c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
3838  l = rb_enc_code_to_mbclen(c, enc);
3839  if (!l) return NEIGHBOR_NOT_CHAR;
3840  if (l != len) return NEIGHBOR_WRAPPED;
3841  rb_enc_mbcput(c, p, enc);
/* re-validate what was just written */
3842  r = rb_enc_precise_mbclen(p, p + len, enc);
3843  if (!MBCLEN_CHARFOUND_P(r)) {
3844  return NEIGHBOR_NOT_CHAR;
3845  }
3846  return NEIGHBOR_FOUND;
3847  }
/* otherwise treat the bytes as a big-endian counter and keep incrementing
 * until the encoding accepts the result as one len-byte character */
3848  while (1) {
/* trailing 0xff bytes roll over to 0 and carry into the byte before them */
3849  for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
3850  p[i] = '\0';
/* every byte was 0xff: nothing left to carry into */
3851  if (i < 0)
3852  return NEIGHBOR_WRAPPED;
3853  ++((unsigned char*)p)[i];
3854  l = rb_enc_precise_mbclen(p, p+len, enc);
3855  if (MBCLEN_CHARFOUND_P(l)) {
3856  l = MBCLEN_CHARFOUND_LEN(l);
3857  if (l == len) {
3858  return NEIGHBOR_FOUND;
3859  }
3860  else {
/* a shorter character was recognised: fill the rest with 0xff so the next
 * round carries straight into the recognised prefix */
3861  memset(p+l, 0xff, len-l);
3862  }
3863  }
3864  if (MBCLEN_INVALID_P(l) && i < len-1) {
3865  long len2;
3866  int l2;
/* search for the longest proper prefix that is not reported invalid, then
 * set everything after index len2 to 0xff so those bytes are skipped */
3867  for (len2 = len-1; 0 < len2; len2--) {
3868  l2 = rb_enc_precise_mbclen(p, p+len2, enc);
3869  if (!MBCLEN_INVALID_P(l2))
3870  break;
3871  }
3872  memset(p+len2+1, 0xff, len-(len2+1));
3873  }
3874  }
3875 }
3876 
/*
 * Counterpart of enc_succ_char going downwards: overwrites the len bytes at
 * p with the previous byte sequence that enc accepts as one character of
 * exactly len bytes.
 *
 * Returns NEIGHBOR_FOUND when such a sequence was written, NEIGHBOR_WRAPPED
 * when the decrement ran off the bottom (all bytes were 0, or the previous
 * code point needs a different length), and NEIGHBOR_NOT_CHAR when, in the
 * fixed-width branch, the bytes are not a valid character or the code point
 * is already 0.
 */
3877 static enum neighbor_char
3878 enc_pred_char(char *p, long len, rb_encoding *enc)
3879 {
3880  long i;
3881  int l;
3882  if (rb_enc_mbminlen(enc) > 1) {
3883  /* wchar, trivial case */
/* decode, subtract one from the code point, re-encode if the length is unchanged */
3884  int r = rb_enc_precise_mbclen(p, p + len, enc), c;
3885  if (!MBCLEN_CHARFOUND_P(r)) {
3886  return NEIGHBOR_NOT_CHAR;
3887  }
3888  c = rb_enc_mbc_to_codepoint(p, p + len, enc);
/* code point 0 has no predecessor */
3889  if (!c) return NEIGHBOR_NOT_CHAR;
3890  --c;
3891  l = rb_enc_code_to_mbclen(c, enc);
3892  if (!l) return NEIGHBOR_NOT_CHAR;
3893  if (l != len) return NEIGHBOR_WRAPPED;
3894  rb_enc_mbcput(c, p, enc);
/* re-validate what was just written */
3895  r = rb_enc_precise_mbclen(p, p + len, enc);
3896  if (!MBCLEN_CHARFOUND_P(r)) {
3897  return NEIGHBOR_NOT_CHAR;
3898  }
3899  return NEIGHBOR_FOUND;
3900  }
/* otherwise treat the bytes as a big-endian counter and keep decrementing
 * until the encoding accepts the result as one len-byte character */
3901  while (1) {
/* trailing 0 bytes roll under to 0xff and borrow from the byte before them */
3902  for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
3903  p[i] = '\xff';
/* every byte was 0: nothing left to borrow from */
3904  if (i < 0)
3905  return NEIGHBOR_WRAPPED;
3906  --((unsigned char*)p)[i];
3907  l = rb_enc_precise_mbclen(p, p+len, enc);
3908  if (MBCLEN_CHARFOUND_P(l)) {
3909  l = MBCLEN_CHARFOUND_LEN(l);
3910  if (l == len) {
3911  return NEIGHBOR_FOUND;
3912  }
3913  else {
/* a shorter character was recognised: zero the rest so the next round
 * borrows straight from the recognised prefix */
3914  memset(p+l, 0, len-l);
3915  }
3916  }
3917  if (MBCLEN_INVALID_P(l) && i < len-1) {
3918  long len2;
3919  int l2;
/* search for the longest proper prefix that is not reported invalid, then
 * zero everything after index len2 so those bytes are skipped */
3920  for (len2 = len-1; 0 < len2; len2--) {
3921  l2 = rb_enc_precise_mbclen(p, p+len2, enc);
3922  if (!MBCLEN_INVALID_P(l2))
3923  break;
3924  }
3925  memset(p+len2+1, 0, len-(len2+1));
3926  }
3927  }
3928 }
3929 
3930 /*
3931  Overwrites +p+ with the succeeding letter in +enc+ and returns
3932  NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
3933  When NEIGHBOR_WRAPPED is returned, the carried-out letter is stored into carry.
3934  This assumes that each range is consecutive and that mbclen
3935  never changes within a range.
3936  NEIGHBOR_NOT_CHAR is returned for an invalid character or when the range has
3937  only one character.
3938  */
/*
 * p/len: the character to bump, overwritten in place.
 * carry: receives the carried-out character when NEIGHBOR_WRAPPED is
 *        returned; it is written with len bytes.
 *
 * On NEIGHBOR_WRAPPED, p itself has been rewound to the first character of
 * its digit/alpha run.
 */
3939 static enum neighbor_char
3940 enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
3941 {
3942  enum neighbor_char ret;
3943  unsigned int c;
3944  int ctype;
3945  int range;
3946  char save[ONIGENC_CODE_TO_MBC_MAXLEN];
3947 
3948  /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
3949  int try;
3950  const int max_gaps = 1;
3951 
/* classify the character; only digits and alphabetic characters are handled */
3952  c = rb_enc_mbc_to_codepoint(p, p+len, enc);
3953  if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
3954  ctype = ONIGENC_CTYPE_DIGIT;
3955  else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
3956  ctype = ONIGENC_CTYPE_ALPHA;
3957  else
3958  return NEIGHBOR_NOT_CHAR;
3959 
/* try the next character, and up to max_gaps further ones, looking for a
 * successor of the same class */
3960  MEMCPY(save, p, char, len);
3961  for (try = 0; try <= max_gaps; ++try) {
3962  ret = enc_succ_char(p, len, enc);
3963  if (ret == NEIGHBOR_FOUND) {
3964  c = rb_enc_mbc_to_codepoint(p, p+len, enc);
3965  if (rb_enc_isctype(c, ctype, enc))
3966  return NEIGHBOR_FOUND;
3967  }
3968  }
/* no successor in the same class: restore p and walk backwards to the
 * first character of the run, counting its length in range */
3969  MEMCPY(p, save, char, len);
3970  range = 1;
3971  while (1) {
3972  MEMCPY(save, p, char, len);
3973  ret = enc_pred_char(p, len, enc);
3974  if (ret == NEIGHBOR_FOUND) {
3975  c = rb_enc_mbc_to_codepoint(p, p+len, enc);
3976  if (!rb_enc_isctype(c, ctype, enc)) {
/* stepped out of the class: the saved character is the start of the run */
3977  MEMCPY(p, save, char, len);
3978  break;
3979  }
3980  }
3981  else {
3982  MEMCPY(p, save, char, len);
3983  break;
3984  }
3985  range++;
3986  }
/* a run of a single character cannot wrap */
3987  if (range == 1) {
3988  return NEIGHBOR_NOT_CHAR;
3989  }
3990 
/* non-digits carry out the first character of the run itself */
3991  if (ctype != ONIGENC_CTYPE_DIGIT) {
3992  MEMCPY(carry, p, char, len);
3993  return NEIGHBOR_WRAPPED;
3994  }
3995 
/* digits carry out the successor of the run's first character */
3996  MEMCPY(carry, p, char, len);
3997  enc_succ_char(carry, len, enc);
3998  return NEIGHBOR_WRAPPED;
3999 }
4000 
4001 
/* forward declaration; str_succ is defined below and modifies its argument in place */
4002 static VALUE str_succ(VALUE str);
4003 
4004 /*
4005  * call-seq:
4006  * str.succ -> new_str
4007  * str.next -> new_str
4008  *
4009  * Returns the successor to <i>str</i>. The successor is calculated by
4010  * incrementing characters starting from the rightmost alphanumeric (or
4011  * the rightmost character if there are no alphanumerics) in the
4012  * string. Incrementing a digit always results in another digit, and
4013  * incrementing a letter results in another letter of the same case.
4014  * Incrementing nonalphanumerics uses the underlying character set's
4015  * collating sequence.
4016  *
4017  * If the increment generates a ``carry,'' the character to the left of
4018  * it is incremented. This process repeats until there is no carry,
4019  * adding an additional character if necessary.
4020  *
4021  * "abcd".succ #=> "abce"
4022  * "THX1138".succ #=> "THX1139"
4023  * "<<koala>>".succ #=> "<<koalb>>"
4024  * "1999zzz".succ #=> "2000aaa"
4025  * "ZZZ9999".succ #=> "AAAA0000"
4026  * "***".succ #=> "**+"
4027  */
4028 
/*
 * Non-destructive successor: builds a new string from the bytes of orig and
 * returns the result of str_succ on that copy, so orig is left untouched.
 *
 * NOTE(review): the listing jumps from 4029 to 4031, so the line carrying
 * this function's name and parameter list is missing here; the body reads a
 * variable called orig.  Restore the line from the upstream file.
 */
4029 VALUE
4031 {
4032  VALUE str;
/* copy with the same class as orig */
4033  str = rb_str_new_with_class(orig, RSTRING_PTR(orig), RSTRING_LEN(orig));
/* presumably carries encoding and code range over to the copy; confirm against the helper */
4034  rb_enc_cr_str_copy_for_substr(str, orig);
4035  OBJ_INFECT(str, orig);
4036  return str_succ(str);
4037 }
4038 
/*
 * Rewrites str in place into its successor and returns str.
 *
 * First pass: walks the characters from right to left and asks
 * enc_succ_alnum_char to bump each one; the walk ends as soon as a bump
 * does not wrap.  If the string holds no digit/alpha character at all, a
 * second pass does the same with enc_succ_char on every character.  When
 * the carry runs off the leftmost handled character, the carry bytes are
 * inserted at carry_pos and the string grows by carry_len bytes.
 *
 * NOTE(review): the listing skips lines 4067, 4089 and 4113; the code at
 * those places is missing here and must be restored from the upstream file.
 */
4039 static VALUE
4040 str_succ(VALUE str)
4041 {
4042  rb_encoding *enc;
4043  char *sbeg, *s, *e, *last_alnum = 0;
4044  int c = -1;
4045  long l, slen;
/* default carry: the single byte 0x01, inserted at position 0 */
4046  char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
4047  long carry_pos = 0, carry_len = 1;
4048  enum neighbor_char neighbor = NEIGHBOR_FOUND;
4049 
/* the empty string is returned unchanged */
4050  slen = RSTRING_LEN(str);
4051  if (slen == 0) return str;
4052 
4053  enc = STR_ENC_GET(str);
4054  sbeg = RSTRING_PTR(str);
4055  s = e = sbeg + slen;
4056 
/* pass 1: bump digit/alpha characters, rightmost first */
4057  while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
/* after skipping a non-alnum character, stop carrying when the class
 * switches between letter and digit relative to the last wrapped character */
4058  if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
4059  if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
4060  ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
4061  s = last_alnum;
4062  break;
4063  }
4064  }
4065  l = rb_enc_precise_mbclen(s, e, enc);
4066  if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
/* NOTE(review): listing line 4067 is missing between these two statements */
4068  neighbor = enc_succ_alnum_char(s, l, enc, carry);
4069  switch (neighbor) {
4070  case NEIGHBOR_NOT_CHAR:
4071  continue;
4072  case NEIGHBOR_FOUND:
/* bumped without wrapping: done */
4073  return str;
4074  case NEIGHBOR_WRAPPED:
4075  last_alnum = s;
4076  break;
4077  }
/* remember where the carry would have to be inserted */
4078  c = 1;
4079  carry_pos = s - sbeg;
4080  carry_len = l;
4081  }
4082  if (c == -1) { /* str contains no alnum */
/* pass 2: bump arbitrary characters, rightmost first */
4083  s = e;
4084  while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
4085  enum neighbor_char neighbor;
4086  char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
4087  l = rb_enc_precise_mbclen(s, e, enc);
4088  if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
/* NOTE(review): listing line 4089 is missing between these two statements */
/* work on a copy so s is only touched when a usable result exists */
4090  MEMCPY(tmp, s, char, l);
4091  neighbor = enc_succ_char(tmp, l, enc);
4092  switch (neighbor) {
4093  case NEIGHBOR_FOUND:
4094  MEMCPY(s, tmp, char, l);
4095  return str;
4096  break;
4097  case NEIGHBOR_WRAPPED:
4098  MEMCPY(s, tmp, char, l);
4099  break;
4100  case NEIGHBOR_NOT_CHAR:
4101  break;
4102  }
4103  if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
4104  /* wrapped to \0...\0. search next valid char. */
4105  enc_succ_char(s, l, enc);
4106  }
/* for non-ASCII-compatible encodings the carry is a copy of the character
 * just written rather than the default byte */
4107  if (!rb_enc_asciicompat(enc)) {
4108  MEMCPY(carry, s, char, l);
4109  carry_len = l;
4110  }
4111  carry_pos = s - sbeg;
4112  }
/* NOTE(review): listing line 4113 is missing before this closing brace */
4114  }
/* grow the buffer, shift the tail right and splice the carry in at carry_pos */
4115  RESIZE_CAPA(str, slen + carry_len);
4116  sbeg = RSTRING_PTR(str);
4117  s = sbeg + carry_pos;
4118  memmove(s + carry_len, s, slen - carry_pos);
4119  memmove(s, carry, carry_len);
4120  slen += carry_len;
4121  STR_SET_LEN(str, slen);
4122  TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
4123  rb_enc_str_coderange(str);
4124  return str;
4125 }
4126 
4127 
4128 /*
4129  * call-seq:
4130  * str.succ! -> str
4131  * str.next! -> str
4132  *
4133  * Equivalent to <code>String#succ</code>, but modifies the receiver in
4134  * place.
4135  */
4136 
4137 static VALUE
4138 rb_str_succ_bang(VALUE str)
4139 {
4140  rb_str_modify(str);
4141  str_succ(str);
4142  return str;
4143 }
4144 
/*
 * Returns 1 when each of the first len bytes at s satisfies ISDIGIT, and 0
 * as soon as one does not.  A len of zero or less yields 1.
 */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) {
            return 0;
        }
    }
    return 1;
}
4154 
/* forward declaration; the callback's non-zero return value stops the iteration (see the definition below) */
4155 static VALUE str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE);
4156 
4157 static int
4158 str_upto_i(VALUE str, VALUE arg)
4159 {
4160  rb_yield(str);
4161  return 0;
4162 }
4163 
4164 /*
4165  * call-seq:
4166  * str.upto(other_str, exclusive=false) {|s| block } -> str
4167  * str.upto(other_str, exclusive=false) -> an_enumerator
4168  *
4169  * Iterates through successive values, starting at <i>str</i> and
4170  * ending at <i>other_str</i> inclusive, passing each value in turn to
4171  * the block. The <code>String#succ</code> method is used to generate
4172  * each value. If optional second argument exclusive is omitted or is false,
4173  * the last value will be included; otherwise it will be excluded.
4174  *
4175  * If no block is given, an enumerator is returned instead.
4176  *
4177  * "a8".upto("b6") {|s| print s, ' ' }
4178  * for s in "a8".."b6"
4179  * print s, ' '
4180  * end
4181  *
4182  * <em>produces:</em>
4183  *
4184  * a8 a9 b0 b1 b2 b3 b4 b5 b6
4185  * a8 a9 b0 b1 b2 b3 b4 b5 b6
4186  *
4187  * If <i>str</i> and <i>other_str</i> contain only ASCII numeric characters,
4188  * both are recognized as decimal numbers. In addition, the width of
4189  * string (e.g. leading zeros) is handled appropriately.
4190  *
4191  * "9".upto("11").to_a #=> ["9", "10", "11"]
4192  * "25".upto("5").to_a #=> []
4193  * "07".upto("11").to_a #=> ["07", "08", "09", "10", "11"]
4194  */
4195 
4196 static VALUE
4197 rb_str_upto(int argc, VALUE *argv, VALUE beg)
4198 {
4199  VALUE end, exclusive;
4200 
4201  rb_scan_args(argc, argv, "11", &end, &exclusive);
4202  RETURN_ENUMERATOR(beg, argc, argv);
4203  return str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
4204 }
4205 
4206 static VALUE
4207 str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
4208 {
4209  VALUE current, after_end;
4210  ID succ;
4211  int n, ascii;
4212  rb_encoding *enc;
4213 
4214  CONST_ID(succ, "succ");
4215  StringValue(end);
4216  enc = rb_enc_check(beg, end);
4217  ascii = (is_ascii_string(beg) && is_ascii_string(end));
4218  /* single character */
4219  if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
4220  char c = RSTRING_PTR(beg)[0];
4221  char e = RSTRING_PTR(end)[0];
4222 
4223  if (c > e || (excl && c == e)) return beg;
4224  for (;;) {
4225  if ((*each)(rb_enc_str_new(&c, 1, enc), arg)) break;
4226  if (!excl && c == e) break;
4227  c++;
4228  if (excl && c == e) break;
4229  }
4230  return beg;
4231  }
4232  /* both edges are all digits */
4233  if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
4234  all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
4235  all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
4236  VALUE b, e;
4237  int width;
4238 
4239  width = RSTRING_LENINT(beg);
4240  b = rb_str_to_inum(beg, 10, FALSE);
4241  e = rb_str_to_inum(end, 10, FALSE);
4242  if (FIXNUM_P(b) && FIXNUM_P(e)) {
4243  long bi = FIX2LONG(b);
4244  long ei = FIX2LONG(e);
4245  rb_encoding *usascii = rb_usascii_encoding();
4246 
4247  while (bi <= ei) {
4248  if (excl && bi == ei) break;
4249  if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
4250  bi++;
4251  }
4252  }
4253  else {
4254  ID op = excl ? '<' : idLE;
4255  VALUE args[2], fmt = rb_fstring_cstr("%.*d");
4256 
4257  args[0] = INT2FIX(width);
4258  while (rb_funcall(b, op, 1, e)) {
4259  args[1] = b;
4260  if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
4261  b = rb_funcallv(b, succ, 0, 0);
4262  }
4263  }
4264  return beg;
4265  }
4266  /* normal case */
4267  n = rb_str_cmp(beg, end);
4268  if (n > 0 || (excl && n == 0)) return beg;
4269 
4270  after_end = rb_funcallv(end, succ, 0, 0);
4271  current = rb_str_dup(beg);
4272  while (!rb_str_equal(current, after_end)) {
4273  VALUE next = Qnil;
4274  if (excl || !rb_str_equal(current, end))
4275  next = rb_funcallv(current, succ, 0, 0);
4276  if ((*each)(current, arg)) break;
4277  if (NIL_P(next)) break;
4278  current = next;
4279  StringValue(current);
4280  if (excl && rb_str_equal(current, end)) break;
4281  if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
4282  break;
4283  }
4284 
4285  return beg;
4286 }
4287 
4288 static int
4289 include_range_i(VALUE str, VALUE arg)
4290 {
4291  VALUE *argp = (VALUE *)arg;
4292  if (!rb_equal(str, *argp)) return 0;
4293  *argp = Qnil;
4294  return 1;
4295 }
4296 
/*
 * Tells whether val lies in the string range from beg to end (end is left
 * out when exclusive is truthy).  Returns Qtrue or Qfalse; a nil val and a
 * val that rb_check_string_type cannot convert both give Qfalse.
 *
 * One-byte ASCII bounds are answered by direct byte comparison; everything
 * else walks the range with str_upto_each and include_range_i.
 */
4297 VALUE
4298 rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
4299 {
/* work on frozen copies of both bounds */
4300  beg = rb_str_new_frozen(beg);
4301  StringValue(end);
4302  end = rb_str_new_frozen(end);
4303  if (NIL_P(val)) return Qfalse;
4304  val = rb_check_string_type(val);
4305  if (NIL_P(val)) return Qfalse;
/* NOTE(review): the listing jumps from 4306 to 4309, so the rest of this
 * condition and the opening brace of its block are missing here; restore
 * them from the upstream file. */
4306  if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
4309  const char *bp = RSTRING_PTR(beg);
4310  const char *ep = RSTRING_PTR(end);
4311  const char *vp = RSTRING_PTR(val);
4312  if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
/* with one-byte bounds only a one-byte val can be a member */
4313  if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
4314  return Qfalse;
4315  else {
4316  char b = *bp;
4317  char e = *ep;
4318  char v = *vp;
4319 
/* all three bytes are ASCII: decide by comparing byte values; otherwise
 * fall through to the generic walk below */
4320  if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
4321  if (b <= v && v < e) return Qtrue;
4322  if (!RTEST(exclusive) && v == e) return Qtrue;
4323  return Qfalse;
4324  }
4325  }
4326  }
4327 #if 0
4328  /* both edges are all digits */
4329  if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
4330  all_digits_p(bp, RSTRING_LEN(beg)) &&
4331  all_digits_p(ep, RSTRING_LEN(end))) {
4332  /* TODO */
4333  }
4334 #endif
4335  }
/* generic case: include_range_i overwrites val with Qnil when it meets an equal string */
4336  str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
4337 
4338  return NIL_P(val) ? Qtrue : Qfalse;
4339 }
4340 
4341 static VALUE
4342 rb_str_subpat(VALUE str, VALUE re, VALUE backref)
4343 {
4344  if (rb_reg_search(re, str, 0, 0) >= 0) {
4345  VALUE match = rb_backref_get();
4346  int nth = rb_reg_backref_number(match, backref);
4347  return rb_reg_nth_match(nth, match);
4348  }
4349  return Qnil;
4350 }
4351 
4352 static VALUE
4353 rb_str_aref(VALUE str, VALUE indx)
4354 {
4355  long idx;
4356 
4357  if (FIXNUM_P(indx)) {
4358  idx = FIX2LONG(indx);
4359  }
4360  else if (RB_TYPE_P(indx, T_REGEXP)) {
4361  return rb_str_subpat(str, indx, INT2FIX(0));
4362  }
4363  else if (RB_TYPE_P(indx, T_STRING)) {
4364  if (rb_str_index(str, indx, 0) != -1)
4365  return rb_str_dup(indx);
4366  return Qnil;
4367  }
4368  else {
4369  /* check if indx is Range */
4370  long beg, len = str_strlen(str, NULL);
4371  switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
4372  case Qfalse:
4373  break;
4374  case Qnil:
4375  return Qnil;
4376  default:
4377  return rb_str_substr(str, beg, len);
4378  }
4379  idx = NUM2LONG(indx);
4380  }
4381 
4382  return str_substr(str, idx, 1, FALSE);
4383 }
4384 
4385 
4386 /*
4387  * call-seq:
4388  * str[index] -> new_str or nil
4389  * str[start, length] -> new_str or nil
4390  * str[range] -> new_str or nil
4391  * str[regexp] -> new_str or nil
4392  * str[regexp, capture] -> new_str or nil
4393  * str[match_str] -> new_str or nil
4394  * str.slice(index) -> new_str or nil
4395  * str.slice(start, length) -> new_str or nil
4396  * str.slice(range) -> new_str or nil
4397  * str.slice(regexp) -> new_str or nil
4398  * str.slice(regexp, capture) -> new_str or nil
4399  * str.slice(match_str) -> new_str or nil
4400  *
4401  * Element Reference --- If passed a single +index+, returns a substring of
4402  * one character at that index. If passed a +start+ index and a +length+,
4403  * returns a substring containing +length+ characters starting at the
4404  * +start+ index. If passed a +range+, its beginning and end are interpreted as
4405  * offsets delimiting the substring to be returned.
4406  *
4407  * In these three cases, if an index is negative, it is counted from the end
4408  * of the string. For the +start+ and +range+ cases the starting index
4409  * is just before a character and an index matching the string's size.
4410  * Additionally, an empty string is returned when the starting index for a
4411  * character range is at the end of the string.
4412  *
4413  * Returns +nil+ if the initial index falls outside the string or the length
4414  * is negative.
4415  *
4416  * If a +Regexp+ is supplied, the matching portion of the string is
4417  * returned. If a +capture+ follows the regular expression, which may be a
4418  * capture group index or name, that component
4419  * of the MatchData is returned instead.
4420  *
4421  * If a +match_str+ is given, that string is returned if it occurs in
4422  * the string.
4423  *
4424  * Returns +nil+ if the regular expression does not match or the match string
4425  * cannot be found.
4426  *
4427  * a = "hello there"
4428  *
4429  * a[1] #=> "e"
4430  * a[2, 3] #=> "llo"
4431  * a[2..3] #=> "ll"
4432  *
4433  * a[-3, 2] #=> "er"
4434  * a[7..-2] #=> "her"
4435  * a[-4..-2] #=> "her"
4436  * a[-2..-4] #=> ""
4437  *
4438  * a[11, 0] #=> ""
4439  * a[11] #=> nil
4440  * a[12, 0] #=> nil
4441  * a[12..-1] #=> nil
4442  *
4443  * a[/[aeiou](.)\1/] #=> "ell"
4444  * a[/[aeiou](.)\1/, 0] #=> "ell"
4445  * a[/[aeiou](.)\1/, 1] #=> "l"
4446  * a[/[aeiou](.)\1/, 2] #=> nil
4447  *
4448  * a[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, "non_vowel"] #=> "l"
4449  * a[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, "vowel"] #=> "e"
4450  *
4451  * a["lo"] #=> "lo"
4452  * a["bye"] #=> nil
4453  */
4454 
4455 static VALUE
4456 rb_str_aref_m(int argc, VALUE *argv, VALUE str)
4457 {
4458  if (argc == 2) {
4459  if (RB_TYPE_P(argv[0], T_REGEXP)) {
4460  return rb_str_subpat(str, argv[0], argv[1]);
4461  }
4462  else {
4463  long beg = NUM2LONG(argv[0]);
4464  long len = NUM2LONG(argv[1]);
4465  return rb_str_substr(str, beg, len);
4466  }
4467  }
4468  rb_check_arity(argc, 1, 2);
4469  return rb_str_aref(str, argv[0]);
4470 }
4471 
/*
 * Removes the first len bytes of str in place and returns str.  A len
 * larger than the string is clamped to its length.  The code range of str
 * is cleared afterwards.
 */
4472 VALUE
4473 rb_str_drop_bytes(VALUE str, long len)
4474 {
4475  char *ptr = RSTRING_PTR(str);
4476  long olen = RSTRING_LEN(str), nlen;
4477 
4478  str_modifiable(str);
4479  if (len > olen) len = olen;
4480  nlen = olen - len;
4481  if (STR_EMBEDDABLE_P(nlen, TERM_LEN(str))) {
/* the remainder fits into the embedded buffer: switch to embedded storage
 * and copy the tail there */
4482  char *oldptr = ptr;
/* remember the storage flags before STR_SET_EMBED changes them */
4483  int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
4484  STR_SET_EMBED(str);
4485  STR_SET_EMBED_LEN(str, nlen);
4486  ptr = RSTRING(str)->as.ary;
4487  memmove(ptr, oldptr + len, nlen);
/* free the old buffer only when, of the three flags, STR_NOEMBED alone was set */
4488  if (fl == STR_NOEMBED) xfree(oldptr);
4489  }
4490  else {
/* remainder stays on the heap: advance the data pointer past the dropped
 * bytes.  NOTE(review): the result of rb_str_new_frozen is discarded, so
 * it is presumably called for its effect on str's storage; confirm
 * against that helper. */
4491  if (!STR_SHARED_P(str)) rb_str_new_frozen(str);
4492  ptr = RSTRING(str)->as.heap.ptr += len;
4493  RSTRING(str)->as.heap.len = nlen;
4494  }
/* terminate with a single 0 byte */
4495  ptr[nlen] = 0;
4496  ENC_CODERANGE_CLEAR(str);
4497  return str;
4498 }
4499 
/*
 * Replaces the len bytes of str starting at offset beg with the bytes of
 * val, growing or shrinking str as needed.  beg and len are used directly
 * as offsets into the byte buffer.  Taint is propagated from val to str.
 */
4500 static void
4501 rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
4502 {
4503  char *sptr;
4504  long slen, vlen = RSTRING_LEN(val);
4505  int cr;
4506 
/* removing a prefix without inserting anything is handled by rb_str_drop_bytes */
4507  if (beg == 0 && vlen == 0) {
4508  rb_str_drop_bytes(str, len);
4509  OBJ_INFECT(str, val);
4510  return;
4511  }
4512 
4513  str_modify_keep_cr(str);
4514  RSTRING_GETMEM(str, sptr, slen);
4515  if (len < vlen) {
4516  /* expand string */
4517  RESIZE_CAPA(str, slen + vlen - len);
/* re-read the data pointer after the capacity change */
4518  sptr = RSTRING_PTR(str);
4519  }
4520 
/* a 7-bit str takes over val's code range; in every other case the code
 * range is set to unknown */
4521  if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
4522  cr = rb_enc_str_coderange(val);
4523  else
4524  cr = ENC_CODERANGE_UNKNOWN;
4525 
/* move the tail behind the replaced span to its new position */
4526  if (vlen != len) {
4527  memmove(sptr + beg + vlen,
4528  sptr + beg + len,
4529  slen - (beg + len));
4530  }
/* only reachable with a negative len */
4531  if (vlen < beg && len < 0) {
4532  MEMZERO(sptr + slen, char, -len);
4533  }
/* copy the replacement into the gap */
4534  if (vlen > 0) {
4535  memmove(sptr + beg, RSTRING_PTR(val), vlen);
4536  }
4537  slen += vlen - len;
4538  STR_SET_LEN(str, slen);
4539  TERM_FILL(&sptr[slen], TERM_LEN(str));
4540  OBJ_INFECT(str, val);
4541  ENC_CODERANGE_SET(str, cr);
4542 }
4543 
/*
 * Replaces len characters of str starting at character index beg with val.
 * A negative beg counts from the end of str; a len reaching past the end is
 * clamped.
 *
 * Raises IndexError for a negative len and for a beg outside the string.
 * val is converted with StringValue, and rb_enc_check is applied to str and
 * val to obtain the encoding of the result.
 */
4544 void
4545 rb_str_update(VALUE str, long beg, long len, VALUE val)
4546 {
4547  long slen;
4548  char *p, *e;
4549  rb_encoding *enc;
4550  int singlebyte = single_byte_optimizable(str);
4551  int cr;
4552 
4553  if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
4554 
4555  StringValue(val);
4556  enc = rb_enc_check(str, val);
4557  slen = str_strlen(str, enc); /* rb_enc_check */
4558 
4559  if (slen < beg) {
4560  out_of_range:
4561  rb_raise(rb_eIndexError, "index %ld out of string", beg);
4562  }
4563  if (beg < 0) {
4564  if (beg + slen < 0) {
4565  goto out_of_range;
4566  }
4567  beg += slen;
4568  }
4569  assert(beg >= 0);
4570  assert(beg <= slen);
/* clamp len to what is left after beg */
4571  if (len > slen - beg) {
4572  len = slen - beg;
4573  }
4574  str_modify_keep_cr(str);
/* locate the start and end of the span; a NULL result from str_nth is
 * replaced by the end of the string */
4575  p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
4576  if (!p) p = RSTRING_END(str);
4577  e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
4578  if (!e) e = RSTRING_END(str);
4579  /* error check */
4580  beg = p - RSTRING_PTR(str); /* physical position */
4581  len = e - p; /* physical length */
4582  rb_str_splice_0(str, beg, len, val);
4583  rb_enc_associate(str, enc);
/* NOTE(review): the listing jumps from 4583 to 4585; as shown, cr is read
 * below without ever being assigned, so the missing line presumably
 * computes it.  Restore it from the upstream file. */
4585  if (cr != ENC_CODERANGE_BROKEN)
4586  ENC_CODERANGE_SET(str, cr);
4587 }
4588 
/* file-local alias: rb_str_splice expands to a call of rb_str_update with the same four arguments */
4589 #define rb_str_splice(str, beg, len, val) rb_str_update(str, beg, len, val)
4590 
4591 static void
4592 rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
4593 {
4594  int nth;
4595  VALUE match;
4596  long start, end, len;
4597  rb_encoding *enc;
4598  struct re_registers *regs;
4599 
4600  if (rb_reg_search(re, str, 0, 0) < 0) {
4601  rb_raise(rb_eIndexError, "regexp not matched");
4602  }
4603  match = rb_backref_get();
4604  nth = rb_reg_backref_number(match, backref);
4605  regs = RMATCH_REGS(match);
4606  if (nth >= regs->num_regs) {
4607  out_of_range:
4608  rb_raise(rb_eIndexError, "index %d out of regexp", nth);
4609  }
4610  if (nth < 0) {
4611  if (-nth >= regs->num_regs) {
4612  goto out_of_range;
4613  }
4614  nth += regs->num_regs;
4615  }
4616 
4617  start = BEG(nth);
4618  if (start == -1) {
4619  rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
4620  }
4621  end = END(nth);
4622  len = end - start;
4623  StringValue(val);
4624  enc = rb_enc_check_str(str, val);
4625  rb_str_splice_0(str, start, len, val);
4626  rb_enc_associate(str, enc);
4627 }
4628 
4629 static VALUE
4630 rb_str_aset(VALUE str, VALUE indx, VALUE val)
4631 {
4632  long idx, beg;
4633 
4634  if (FIXNUM_P(indx)) {
4635  idx = FIX2LONG(indx);
4636  num_index:
4637  rb_str_splice(str, idx, 1, val);
4638  return val;
4639  }
4640 
4641  if (SPECIAL_CONST_P(indx)) goto generic;
4642  switch (TYPE(indx)) {
4643  case T_REGEXP:
4644  rb_str_subpat_set(str, indx, INT2FIX(0), val);
4645  return val;
4646 
4647  case T_STRING:
4648  beg = rb_str_index(str, indx, 0);
4649  if (beg < 0) {
4650  rb_raise(rb_eIndexError, "string not matched");
4651  }
4652  beg = rb_str_sublen(str, beg);
4653  rb_str_splice(str, beg, str_strlen(indx, NULL), val);
4654  return val;
4655 
4656  generic:
4657  default:
4658  /* check if indx is Range */
4659  {
4660  long beg, len;
4661  if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
4662  rb_str_splice(str, beg, len, val);
4663  return val;
4664  }
4665  }
4666  idx = NUM2LONG(indx);
4667  goto num_index;
4668  }
4669 }
4670 
4671 /*
4672  * call-seq:
4673  * str[integer] = new_str
4674  * str[integer, integer] = new_str
4675  * str[range] = aString
4676  * str[regexp] = new_str
4677  * str[regexp, integer] = new_str
4678  * str[regexp, name] = new_str
4679  * str[other_str] = new_str
4680  *
4681  * Element Assignment---Replaces some or all of the content of <i>str</i>. The
4682  * portion of the string affected is determined using the same criteria as
4683  * <code>String#[]</code>. If the replacement string is not the same length as
4684  * the text it is replacing, the string will be adjusted accordingly. If the
4685  * regular expression or string used as the index doesn't match a position
4686  * in the string, <code>IndexError</code> is raised. If the regular expression
4687  * form is used, the optional second <code>Integer</code> allows you to specify
4688  * which portion of the match to replace (effectively using the
4689  * <code>MatchData</code> indexing rules). The forms that take an
4690  * <code>Integer</code> will raise an <code>IndexError</code> if the value is
4691  * out of range; the <code>Range</code> form will raise a
4692  * <code>RangeError</code>, and the <code>Regexp</code> and <code>String</code>
4693  * will raise an <code>IndexError</code> on negative match.
4694  */
4695 
4696 static VALUE
4697 rb_str_aset_m(int argc, VALUE *argv, VALUE str)
4698 {
4699  if (argc == 3) {
4700  if (RB_TYPE_P(argv[0], T_REGEXP)) {
4701  rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
4702  }
4703  else {
4704  rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
4705  }
4706  return argv[2];
4707  }
4708  rb_check_arity(argc, 2, 3);
4709  return rb_str_aset(str, argv[0], argv[1]);
4710 }
4711 
4712 /*
4713  * call-seq:
4714  * str.insert(index, other_str) -> str
4715  *
4716  * Inserts <i>other_str</i> before the character at the given
4717  * <i>index</i>, modifying <i>str</i>. Negative indices count from the
4718  * end of the string, and insert <em>after</em> the given character.
4719  * The intent is to insert <i>other_str</i> so that it starts at the given
4720  * <i>index</i>.
4721  *
4722  * "abcd".insert(0, 'X') #=> "Xabcd"
4723  * "abcd".insert(3, 'X') #=> "abcXd"
4724  * "abcd".insert(4, 'X') #=> "abcdX"
4725  * "abcd".insert(-3, 'X') #=> "abXcd"
4726  * "abcd".insert(-1, 'X') #=> "abcdX"
4727  */
4728 
4729 static VALUE
4730 rb_str_insert(VALUE str, VALUE idx, VALUE str2)
4731 {
4732  long pos = NUM2LONG(idx);
4733 
4734  if (pos == -1) {
4735  return rb_str_append(str, str2);
4736  }
4737  else if (pos < 0) {
4738  pos++;
4739  }
4740  rb_str_splice(str, pos, 0, str2);
4741  return str;
4742 }
4743 
4744 
4745 /*
4746  * call-seq:
4747  * str.slice!(integer) -> new_str or nil
4748  * str.slice!(integer, integer) -> new_str or nil
4749  * str.slice!(range) -> new_str or nil
4750  * str.slice!(regexp) -> new_str or nil
4751  * str.slice!(other_str) -> new_str or nil
4752  *
4753  * Deletes the specified portion from <i>str</i>, and returns the portion
4754  * deleted.
4755  *
4756  * string = "this is a string"
4757  * string.slice!(2) #=> "i"
4758  * string.slice!(3..6) #=> " is "
4759  * string.slice!(/s.*t/) #=> "sa st"
4760  * string.slice!("r") #=> "r"
4761  * string #=> "thing"
4762  */
4763 
4764 static VALUE
4765 rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
4766 {
4767  VALUE result;
4768  VALUE buf[3];
4769  int i;
4770 
4771  rb_check_arity(argc, 1, 2);
4772  for (i=0; i<argc; i++) {
4773  buf[i] = argv[i];
4774  }
4775  str_modify_keep_cr(str);
4776  result = rb_str_aref_m(argc, buf, str);
4777  if (!NIL_P(result)) {
4778  buf[i] = rb_str_new(0,0);
4779  rb_str_aset_m(argc+1, buf, str);
4780  }
4781  return result;
4782 }
4783 
4784 static VALUE
4785 get_pat(VALUE pat)
4786 {
4787  VALUE val;
4788 
4789  if (SPECIAL_CONST_P(pat)) goto to_string;
4790  switch (BUILTIN_TYPE(pat)) {
4791  case T_REGEXP:
4792  return pat;
4793 
4794  case T_STRING:
4795  break;
4796 
4797  default:
4798  to_string:
4799  val = rb_check_string_type(pat);
4800  if (NIL_P(val)) {
4801  Check_Type(pat, T_REGEXP);
4802  }
4803  pat = val;
4804  }
4805 
4806  return rb_reg_regcomp(pat);
4807 }
4808 
/*
 * Like get_pat, but a String pattern is returned as a String instead of
 * being compiled.  A Regexp is returned as is; anything else is converted
 * with rb_check_string_type, and Check_Type(pat, T_REGEXP) is applied to
 * the original object when that conversion yields nil.
 *
 * check: when non-zero, the resulting pattern is tested with
 * is_broken_string.
 */
4809 static VALUE
4810 get_pat_quoted(VALUE pat, int check)
4811 {
4812  VALUE val;
4813 
/* immediates have no BUILTIN_TYPE; send them to the conversion path */
4814  if (SPECIAL_CONST_P(pat)) goto to_string;
4815  switch (BUILTIN_TYPE(pat)) {
4816  case T_REGEXP:
4817  return pat;
4818 
4819  case T_STRING:
4820  break;
4821 
4822  default:
4823  to_string:
4824  val = rb_check_string_type(pat);
4825  if (NIL_P(val)) {
4826  Check_Type(pat, T_REGEXP);
4827  }
4828  pat = val;
4829  }
/* NOTE(review): the listing jumps from 4830 to 4832, so the body of this
 * if is missing here; restore it from the upstream file. */
4830  if (check && is_broken_string(pat)) {
4832  }
4833  return pat;
4834 }
4835 
/*
 * Searches str for pat starting at pos and returns the position found, or
 * a negative value when there is no match.  pat is either a String, which
 * is searched with rb_strseq_index, or something else that is passed to
 * rb_reg_search0.
 *
 * set_backref_str: for a String pattern, non-zero requests that the
 * backref be updated for the match; for the regexp path the flag is simply
 * forwarded to rb_reg_search0.
 */
4836 static long
4837 rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
4838 {
4839  if (BUILTIN_TYPE(pat) == T_STRING) {
4840  pos = rb_strseq_index(str, pat, pos, 1);
4841  if (set_backref_str) {
4842  if (pos >= 0) {
4843  VALUE match;
/* record the match against a frozen copy of str and propagate taint from pat */
4844  str = rb_str_new_frozen(str);
4845  rb_backref_set_string(str, pos, RSTRING_LEN(pat));
4846  match = rb_backref_get();
4847  OBJ_INFECT(match, pat);
4848  }
4849  else {
/* NOTE(review): the listing jumps from 4849 to 4851, so the body of this
 * else is missing here; restore it from the upstream file. */
4851  }
4852  }
4853  return pos;
4854  }
4855  else {
4856  return rb_reg_search0(pat, str, pos, 0, set_backref_str);
4857  }
4858 }
4859 
4860 
4861 /*
4862  * call-seq:
4863  * str.sub!(pattern, replacement) -> str or nil
4864  * str.sub!(pattern) {|match| block } -> str or nil
4865  *
4866  * Performs the same substitution as String#sub in-place.
4867  *
4868  * Returns +str+ if a substitution was performed or +nil+ if no substitution
4869  * was performed.
4870  */
4871 
4872 static VALUE
4873 rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
4874 {
4875  VALUE pat, repl, hash = Qnil;
4876  int iter = 0;
4877  int tainted = 0;
4878  long plen;
4879  int min_arity = rb_block_given_p() ? 1 : 2;
4880  long beg;
4881 
4882  rb_check_arity(argc, min_arity, 2);
4883  if (argc == 1) {
4884  iter = 1;
4885  }
4886  else {
4887  repl = argv[1];
4888  hash = rb_check_hash_type(argv[1]);
4889  if (NIL_P(hash)) {
4890  StringValue(repl);
4891  }
4892  tainted = OBJ_TAINTED_RAW(repl);
4893  }
4894 
4895  pat = get_pat_quoted(argv[0], 1);
4896 
4897  str_modifiable(str);
4898  beg = rb_pat_search(pat, str, 0, 1);
4899  if (beg >= 0) {
4900  rb_encoding *enc;
4901  int cr = ENC_CODERANGE(str);
4902  long beg0, end0;
4903  VALUE match, match0 = Qnil;
4904  struct re_registers *regs;
4905  char *p, *rp;
4906  long len, rlen;
4907 
4908  match = rb_backref_get();
4909  regs = RMATCH_REGS(match);
4910  if (RB_TYPE_P(pat, T_STRING)) {
4911  beg0 = beg;
4912  end0 = beg0 + RSTRING_LEN(pat);
4913  match0 = pat;
4914  }
4915  else {
4916  beg0 = BEG(0);
4917  end0 = END(0);
4918  if (iter) match0 = rb_reg_nth_match(0, match);
4919  }
4920 
4921  if (iter || !NIL_P(hash)) {
4922  p = RSTRING_PTR(str); len = RSTRING_LEN(str);
4923 
4924  if (iter) {
4925  repl = rb_obj_as_string(rb_yield(match0));
4926  }
4927  else {
4928  repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
4929  repl = rb_obj_as_string(repl);
4930  }
4931  str_mod_check(str, p, len);
4932  rb_check_frozen(str);
4933  }
4934  else {
4935  repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
4936  }
4937 
4938  enc = rb_enc_compatible(str, repl);
4939  if (!enc) {
4940  rb_encoding *str_enc = STR_ENC_GET(str);
4941  p = RSTRING_PTR(str); len = RSTRING_LEN(str);
4942  if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
4943  coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
4944  rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
4945  rb_enc_name(str_enc),
4946  rb_enc_name(STR_ENC_GET(repl)));
4947  }
4948  enc = STR_ENC_GET(repl);
4949  }
4950  rb_str_modify(str);
4951  rb_enc_associate(str, enc);
4952  tainted |= OBJ_TAINTED_RAW(repl);
4953  if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) {
4954  int cr2 = ENC_CODERANGE(repl);
4955  if (cr2 == ENC_CODERANGE_BROKEN ||
4956  (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
4957  cr = ENC_CODERANGE_UNKNOWN;
4958  else
4959  cr = cr2;
4960  }
4961  plen = end0 - beg0;
4962  rp = RSTRING_PTR(repl); rlen = RSTRING_LEN(repl);
4963  len = RSTRING_LEN(str);
4964  if (rlen > plen) {
4965  RESIZE_CAPA(str, len + rlen - plen);
4966  }
4967  p = RSTRING_PTR(str);
4968  if (rlen != plen) {
4969  memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
4970  }
4971  memcpy(p + beg0, rp, rlen);
4972  len += rlen - plen;
4973  STR_SET_LEN(str, len);
4974  TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
4975  ENC_CODERANGE_SET(str, cr);
4976  FL_SET_RAW(str, tainted);
4977 
4978  return str;
4979  }
4980  return Qnil;
4981 }
4982 
4983 
4984 /*
4985  * call-seq:
4986  * str.sub(pattern, replacement) -> new_str
4987  * str.sub(pattern, hash) -> new_str
4988  * str.sub(pattern) {|match| block } -> new_str
4989  *
4990  * Returns a copy of +str+ with the _first_ occurrence of +pattern+
4991  * replaced by the second argument. The +pattern+ is typically a Regexp; if
4992  * given as a String, any regular expression metacharacters it contains will
4993  * be interpreted literally, e.g. <code>'\\\d'</code> will match a backslash
4994  * followed by 'd', instead of a digit.
4995  *
4996  * If +replacement+ is a String it will be substituted for the matched text.
4997  * It may contain back-references to the pattern's capture groups of the form
4998  * <code>"\\d"</code>, where <i>d</i> is a group number, or
4999  * <code>"\\k<n>"</code>, where <i>n</i> is a group name. If it is a
5000  * double-quoted string, both back-references must be preceded by an
5001  * additional backslash. However, within +replacement+ the special match
5002  * variables, such as <code>$&</code>, will not refer to the current match.
5003  * If +replacement+ is a String that looks like a pattern's capture group but
5004  * is actually not a pattern capture group e.g. <code>"\\'"</code>, then it
5005  * will have to be preceded by two backslashes like so <code>"\\\\'"</code>.
5006  *
5007  * If the second argument is a Hash, and the matched text is one of its keys,
5008  * the corresponding value is the replacement string.
5009  *
5010  * In the block form, the current match string is passed in as a parameter,
5011  * and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
5012  * <code>$&</code>, and <code>$'</code> will be set appropriately. The value
5013  * returned by the block will be substituted for the match on each call.
5014  *
5015  * The result inherits any tainting in the original string or any supplied
5016  * replacement string.
5017  *
5018  * "hello".sub(/[aeiou]/, '*') #=> "h*llo"
5019  * "hello".sub(/([aeiou])/, '<\1>') #=> "h<e>llo"
5020  * "hello".sub(/./) {|s| s.ord.to_s + ' ' } #=> "104 ello"
5021  * "hello".sub(/(?<foo>[aeiou])/, '*\k<foo>*') #=> "h*e*llo"
5022  * 'Is SHELL your preferred shell?'.sub(/[[:upper:]]{2,}/, ENV)
5023  * #=> "Is /bin/bash your preferred shell?"
5024  */
5025 
5026 static VALUE
5027 rb_str_sub(int argc, VALUE *argv, VALUE str)
5028 {
5029  str = rb_str_dup(str);
5030  rb_str_sub_bang(argc, argv, str);
5031  return str;
5032 }
5033 
5034 static VALUE
5035 str_gsub(int argc, VALUE *argv, VALUE str, int bang)
5036 {
5037  VALUE pat, val = Qnil, repl, match, match0 = Qnil, dest, hash = Qnil;
5038  struct re_registers *regs;
5039  long beg, beg0, end0;
5040  long offset, blen, slen, len, last;
5041  enum {STR, ITER, MAP} mode = STR;
5042  char *sp, *cp;
5043  int tainted = 0;
5044  int need_backref = -1;
5045  rb_encoding *str_enc;
5046 
5047  switch (argc) {
5048  case 1:
5049  RETURN_ENUMERATOR(str, argc, argv);
5050  mode = ITER;
5051  break;
5052  case 2:
5053  repl = argv[1];
5054  hash = rb_check_hash_type(argv[1]);
5055  if (NIL_P(hash)) {
5056  StringValue(repl);
5057  }
5058  else {
5059  mode = MAP;
5060  }
5061  tainted = OBJ_TAINTED_RAW(repl);
5062  break;
5063  default:
5064  rb_check_arity(argc, 1, 2);
5065  }
5066 
5067  pat = get_pat_quoted(argv[0], 1);
5068  beg = rb_pat_search(pat, str, 0, need_backref);
5069  if (beg < 0) {
5070  if (bang) return Qnil; /* no match, no substitution */
5071  return rb_str_dup(str);
5072  }
5073 
5074  offset = 0;
5075  blen = RSTRING_LEN(str) + 30; /* len + margin */
5076  dest = rb_str_buf_new(blen);
5077  sp = RSTRING_PTR(str);
5078  slen = RSTRING_LEN(str);
5079  cp = sp;
5080  str_enc = STR_ENC_GET(str);
5081  rb_enc_associate(dest, str_enc);
5083 
5084  do {
5085  match = rb_backref_get();
5086  regs = RMATCH_REGS(match);
5087  if (RB_TYPE_P(pat, T_STRING)) {
5088  beg0 = beg;
5089  end0 = beg0 + RSTRING_LEN(pat);
5090  match0 = pat;
5091  }
5092  else {
5093  beg0 = BEG(0);
5094  end0 = END(0);
5095  if (mode == ITER) match0 = rb_reg_nth_match(0, match);
5096  }
5097 
5098  if (mode) {
5099  if (mode == ITER) {
5100  val = rb_obj_as_string(rb_yield(match0));
5101  }
5102  else {
5103  val = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
5104  val = rb_obj_as_string(val);
5105  }
5106  str_mod_check(str, sp, slen);
5107  if (val == dest) { /* paranoid check [ruby-dev:24827] */
5108  rb_raise(rb_eRuntimeError, "block should not cheat");
5109  }
5110  }
5111  else if (need_backref) {
5112  val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
5113  if (need_backref < 0) {
5114  need_backref = val != repl;
5115  }
5116  }
5117  else {
5118  val = repl;
5119  }
5120 
5121  tainted |= OBJ_TAINTED_RAW(val);
5122 
5123  len = beg0 - offset; /* copy pre-match substr */
5124  if (len) {
5125  rb_enc_str_buf_cat(dest, cp, len, str_enc);
5126  }
5127 
5128  rb_str_buf_append(dest, val);
5129 
5130  last = offset;
5131  offset = end0;
5132  if (beg0 == end0) {
5133  /*
5134  * Always consume at least one character of the input string
5135  * in order to prevent infinite loops.
5136  */
5137  if (RSTRING_LEN(str) <= end0) break;
5138  len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
5139  rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
5140  offset = end0 + len;
5141  }
5142  cp = RSTRING_PTR(str) + offset;
5143  if (offset > RSTRING_LEN(str)) break;
5144  beg = rb_pat_search(pat, str, offset, need_backref);
5145  } while (beg >= 0);
5146  if (RSTRING_LEN(str) > offset) {
5147  rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
5148  }
5149  rb_pat_search(pat, str, last, 1);
5150  if (bang) {
5151  str_shared_replace(str, dest);
5152  }
5153  else {
5154  RBASIC_SET_CLASS(dest, rb_obj_class(str));
5155  tainted |= OBJ_TAINTED_RAW(str);
5156  str = dest;
5157  }
5158 
5159  FL_SET_RAW(str, tainted);
5160  return str;
5161 }
5162 
5163 
5164 /*
5165  * call-seq:
5166  * str.gsub!(pattern, replacement) -> str or nil
5167  * str.gsub!(pattern, hash) -> str or nil
5168  * str.gsub!(pattern) {|match| block } -> str or nil
5169  * str.gsub!(pattern) -> an_enumerator
5170  *
5171  * Performs the substitutions of <code>String#gsub</code> in place, returning
5172  * <i>str</i>, or <code>nil</code> if no substitutions were performed.
5173  * If no block and no <i>replacement</i> is given, an enumerator is returned instead.
5174  */
5175 
5176 static VALUE
5177 rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
5178 {
5179  str_modify_keep_cr(str);
5180  return str_gsub(argc, argv, str, 1);
5181 }
5182 
5183 
5184 /*
5185  * call-seq:
5186  * str.gsub(pattern, replacement) -> new_str
5187  * str.gsub(pattern, hash) -> new_str
5188  * str.gsub(pattern) {|match| block } -> new_str
5189  * str.gsub(pattern) -> enumerator
5190  *
5191  * Returns a copy of <i>str</i> with <em>all</em> occurrences of
5192  * <i>pattern</i> replaced by the second argument. The <i>pattern</i> is
5193  * typically a <code>Regexp</code>; if given as a <code>String</code>, any
5194  * regular expression metacharacters it contains will be interpreted
5195  * literally, e.g. <code>'\\\d'</code> will match a backslash followed by 'd',
5196  * instead of a digit.
5197  *
5198  * If <i>replacement</i> is a <code>String</code> it will be substituted for
5199  * the matched text. It may contain back-references to the pattern's capture
5200  * groups of the form <code>\\\d</code>, where <i>d</i> is a group number, or
5201  * <code>\\\k<n></code>, where <i>n</i> is a group name. If it is a
5202  * double-quoted string, both back-references must be preceded by an
5203  * additional backslash. However, within <i>replacement</i> the special match
5204  * variables, such as <code>$&</code>, will not refer to the current match.
5205  *
5206  * If the second argument is a <code>Hash</code>, and the matched text is one
5207  * of its keys, the corresponding value is the replacement string.
5208  *
5209  * In the block form, the current match string is passed in as a parameter,
5210  * and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
5211  * <code>$&</code>, and <code>$'</code> will be set appropriately. The value
5212  * returned by the block will be substituted for the match on each call.
5213  *
5214  * The result inherits any tainting in the original string or any supplied
5215  * replacement string.
5216  *
5217  * When neither a block nor a second argument is supplied, an
5218  * <code>Enumerator</code> is returned.
5219  *
5220  * "hello".gsub(/[aeiou]/, '*') #=> "h*ll*"
5221  * "hello".gsub(/([aeiou])/, '<\1>') #=> "h<e>ll<o>"
5222  * "hello".gsub(/./) {|s| s.ord.to_s + ' '} #=> "104 101 108 108 111 "
5223  * "hello".gsub(/(?<foo>[aeiou])/, '{\k<foo>}') #=> "h{e}ll{o}"
5224  * 'hello'.gsub(/[eo]/, 'e' => 3, 'o' => '*') #=> "h3ll*"
5225  */
5226 
5227 static VALUE
5228 rb_str_gsub(int argc, VALUE *argv, VALUE str)
5229 {
5230  return str_gsub(argc, argv, str, 0);
5231 }
5232 
5233 
5234 /*
5235  * call-seq:
5236  * str.replace(other_str) -> str
5237  *
5238  * Replaces the contents and taintedness of <i>str</i> with the corresponding
5239  * values in <i>other_str</i>.
5240  *
5241  * s = "hello" #=> "hello"
5242  * s.replace "world" #=> "world"
5243  */
5244 
5245 VALUE
5247 {
5248  str_modifiable(str);
5249  if (str == str2) return str;
5250 
5251  StringValue(str2);
5252  str_discard(str);
5253  return str_replace(str, str2);
5254 }
5255 
5256 /*
5257  * call-seq:
5258  * string.clear -> string
5259  *
5260  * Makes string empty.
5261  *
5262  * a = "abcde"
5263  * a.clear #=> ""
5264  */
5265 
5266 static VALUE
5267 rb_str_clear(VALUE str)
5268 {
5269  str_discard(str);
5270  STR_SET_EMBED(str);
5271  STR_SET_EMBED_LEN(str, 0);
5272  RSTRING_PTR(str)[0] = 0;
5273  if (rb_enc_asciicompat(STR_ENC_GET(str)))
5275  else
5277  return str;
5278 }
5279 
5280 /*
5281  * call-seq:
5282  * string.chr -> string
5283  *
5284  * Returns a one-character string at the beginning of the string.
5285  *
5286  * a = "abcde"
5287  * a.chr #=> "a"
5288  */
5289 
5290 static VALUE
5291 rb_str_chr(VALUE str)
5292 {
5293  return rb_str_substr(str, 0, 1);
5294 }
5295 
5296 /*
5297  * call-seq:
5298  * str.getbyte(index) -> 0 .. 255
5299  *
5300  * Returns the <i>index</i>th byte as an integer, or +nil+ if <i>index</i>
5300  * is out of range.
5301  */
5302 static VALUE
5303 rb_str_getbyte(VALUE str, VALUE index)
5304 {
5305  long pos = NUM2LONG(index);
5306 
5307  if (pos < 0)
5308  pos += RSTRING_LEN(str);
5309  if (pos < 0 || RSTRING_LEN(str) <= pos)
5310  return Qnil;
5311 
5312  return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
5313 }
5314 
5315 /*
5316  * call-seq:
5317  * str.setbyte(index, integer) -> integer
5318  *
5319  * Sets the <i>index</i>th byte to <i>integer</i>.
5320  */
5321 static VALUE
5322 rb_str_setbyte(VALUE str, VALUE index, VALUE value)
5323 {
5324  long pos = NUM2LONG(index);
5325  int byte = NUM2INT(value);
5326  long len = RSTRING_LEN(str);
5327  char *head, *ptr, *left = 0;
5328  rb_encoding *enc;
5329  int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
5330 
5331  if (pos < -len || len <= pos)
5332  rb_raise(rb_eIndexError, "index %ld out of string", pos);
5333  if (pos < 0)
5334  pos += len;
5335 
5336  if (!str_independent(str))
5337  str_make_independent(str);
5338  enc = STR_ENC_GET(str);
5339  head = RSTRING_PTR(str);
5340  ptr = &head[pos];
5341  if (!STR_EMBED_P(str)) {
5342  cr = ENC_CODERANGE(str);
5343  switch (cr) {
5344  case ENC_CODERANGE_7BIT:
5345  left = ptr;
5346  *ptr = byte;
5347  if (ISASCII(byte)) goto end;
5348  nlen = rb_enc_precise_mbclen(left, head+len, enc);
5349  if (!MBCLEN_CHARFOUND_P(nlen))
5351  else
5353  goto end;
5354  case ENC_CODERANGE_VALID:
5355  left = rb_enc_left_char_head(head, ptr, head+len, enc);
5356  width = rb_enc_precise_mbclen(left, head+len, enc);
5357  *ptr = byte;
5358  nlen = rb_enc_precise_mbclen(left, head+len, enc);
5359  if (!MBCLEN_CHARFOUND_P(nlen))
5361  else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
5362  ENC_CODERANGE_CLEAR(str);
5363  goto end;
5364  }
5365  }
5366  ENC_CODERANGE_CLEAR(str);
5367  *ptr = byte;
5368 
5369  end:
5370  return value;
5371 }
5372 
5373 static VALUE
5374 str_byte_substr(VALUE str, long beg, long len, int empty)
5375 {
5376  char *p, *s = RSTRING_PTR(str);
5377  long n = RSTRING_LEN(str);
5378  VALUE str2;
5379 
5380  if (beg > n || len < 0) return Qnil;
5381  if (beg < 0) {
5382  beg += n;
5383  if (beg < 0) return Qnil;
5384  }
5385  if (len > n - beg)
5386  len = n - beg;
5387  if (len <= 0) {
5388  if (!empty) return Qnil;
5389  len = 0;
5390  p = 0;
5391  }
5392  else
5393  p = s + beg;
5394 
5395  if (!STR_EMBEDDABLE_P(len, TERM_LEN(str)) && SHARABLE_SUBSTRING_P(beg, len, n)) {
5396  str2 = rb_str_new_frozen(str);
5397  str2 = str_new_shared(rb_obj_class(str2), str2);
5398  RSTRING(str2)->as.heap.ptr += beg;
5399  RSTRING(str2)->as.heap.len = len;
5400  }
5401  else {
5402  str2 = rb_str_new_with_class(str, p, len);
5403  }
5404 
5405  str_enc_copy(str2, str);
5406 
5407  if (RSTRING_LEN(str2) == 0) {
5408  if (!rb_enc_asciicompat(STR_ENC_GET(str)))
5410  else
5412  }
5413  else {
5414  switch (ENC_CODERANGE(str)) {
5415  case ENC_CODERANGE_7BIT:
5417  break;
5418  default:
5420  break;
5421  }
5422  }
5423 
5424  OBJ_INFECT_RAW(str2, str);
5425 
5426  return str2;
5427 }
5428 
5429 static VALUE
5430 str_byte_aref(VALUE str, VALUE indx)
5431 {
5432  long idx;
5433  if (FIXNUM_P(indx)) {
5434  idx = FIX2LONG(indx);
5435  }
5436  else {
5437  /* check if indx is Range */
5438  long beg, len = RSTRING_LEN(str);
5439 
5440  switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
5441  case Qfalse:
5442  break;
5443  case Qnil:
5444  return Qnil;
5445  default:
5446  return str_byte_substr(str, beg, len, TRUE);
5447  }
5448 
5449  idx = NUM2LONG(indx);
5450  }
5451  return str_byte_substr(str, idx, 1, FALSE);
5452 }
5453 
5454 /*
5455  * call-seq:
5456  * str.byteslice(integer) -> new_str or nil
5457  * str.byteslice(integer, integer) -> new_str or nil
5458  * str.byteslice(range) -> new_str or nil
5459  *
5460  * Byte Reference---If passed a single <code>Integer</code>, returns a
5461  * substring of one byte at that position. If passed two <code>Integer</code>
5462  * objects, returns a substring starting at the offset given by the first, and
5463  * a length given by the second. If given a <code>Range</code>, a substring containing
5464  * bytes at offsets given by the range is returned. In all three cases, if
5465  * an offset is negative, it is counted from the end of <i>str</i>. Returns
5466  * <code>nil</code> if the initial offset falls outside the string, the length
5467  * is negative, or the beginning of the range is greater than the end.
5468  * The resulting string keeps the encoding of the original string.
5469  *
5470  * "hello".byteslice(1) #=> "e"
5471  * "hello".byteslice(-1) #=> "o"
5472  * "hello".byteslice(1, 2) #=> "el"
5473  * "\x80\u3042".byteslice(1, 3) #=> "\u3042"
5474  * "\x03\u3042\xff".byteslice(1..3) #=> "\u3042"
5475  */
5476 
5477 static VALUE
5478 rb_str_byteslice(int argc, VALUE *argv, VALUE str)
5479 {
5480  if (argc == 2) {
5481  long beg = NUM2LONG(argv[0]);
5482  long end = NUM2LONG(argv[1]);
5483  return str_byte_substr(str, beg, end, TRUE);
5484  }
5485  rb_check_arity(argc, 1, 2);
5486  return str_byte_aref(str, argv[0]);
5487 }
5488 
5489 /*
5490  * call-seq:
5491  * str.reverse -> new_str
5492  *
5493  * Returns a new string with the characters from <i>str</i> in reverse order.
5494  *
5495  * "stressed".reverse #=> "desserts"
5496  */
5497 
5498 static VALUE
5499 rb_str_reverse(VALUE str)
5500 {
5501  rb_encoding *enc;
5502  VALUE rev;
5503  char *s, *e, *p;
5504  int cr;
5505 
5506  if (RSTRING_LEN(str) <= 1) return rb_str_dup(str);
5507  enc = STR_ENC_GET(str);
5508  rev = rb_str_new_with_class(str, 0, RSTRING_LEN(str));
5509  s = RSTRING_PTR(str); e = RSTRING_END(str);
5510  p = RSTRING_END(rev);
5511  cr = ENC_CODERANGE(str);
5512 
5513  if (RSTRING_LEN(str) > 1) {
5514  if (single_byte_optimizable(str)) {
5515  while (s < e) {
5516  *--p = *s++;
5517  }
5518  }
5519  else if (cr == ENC_CODERANGE_VALID) {
5520  while (s < e) {
5521  int clen = rb_enc_fast_mbclen(s, e, enc);
5522 
5523  p -= clen;
5524  memcpy(p, s, clen);
5525  s += clen;
5526  }
5527  }
5528  else {
5529  cr = rb_enc_asciicompat(enc) ?
5531  while (s < e) {
5532  int clen = rb_enc_mbclen(s, e, enc);
5533 
5534  if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
5535  p -= clen;
5536  memcpy(p, s, clen);
5537  s += clen;
5538  }
5539  }
5540  }
5541  STR_SET_LEN(rev, RSTRING_LEN(str));
5542  OBJ_INFECT_RAW(rev, str);
5543  str_enc_copy(rev, str);
5544  ENC_CODERANGE_SET(rev, cr);
5545 
5546  return rev;
5547 }
5548 
5549 
5550 /*
5551  * call-seq:
5552  * str.reverse! -> str
5553  *
5554  * Reverses <i>str</i> in place.
5555  */
5556 
5557 static VALUE
5558 rb_str_reverse_bang(VALUE str)
5559 {
5560  if (RSTRING_LEN(str) > 1) {
5561  if (single_byte_optimizable(str)) {
5562  char *s, *e, c;
5563 
5564  str_modify_keep_cr(str);
5565  s = RSTRING_PTR(str);
5566  e = RSTRING_END(str) - 1;
5567  while (s < e) {
5568  c = *s;
5569  *s++ = *e;
5570  *e-- = c;
5571  }
5572  }
5573  else {
5574  str_shared_replace(str, rb_str_reverse(str));
5575  }
5576  }
5577  else {
5578  str_modify_keep_cr(str);
5579  }
5580  return str;
5581 }
5582 
5583 
5584 /*
5585  * call-seq:
5586  * str.include? other_str -> true or false
5587  *
5588  * Returns <code>true</code> if <i>str</i> contains the given string or
5589  * character.
5590  *
5591  * "hello".include? "lo" #=> true
5592  * "hello".include? "ol" #=> false
5593  * "hello".include? ?h #=> true
5594  */
5595 
5596 static VALUE
5597 rb_str_include(VALUE str, VALUE arg)
5598 {
5599  long i;
5600 
5601  StringValue(arg);
5602  i = rb_str_index(str, arg, 0);
5603 
5604  if (i == -1) return Qfalse;
5605  return Qtrue;
5606 }
5607 
5608 
5609 /*
5610  * call-seq:
5611  * str.to_i(base=10) -> integer
5612  *
5613  * Returns the result of interpreting leading characters in <i>str</i> as an
5614  * integer base <i>base</i> (between 2 and 36). Extraneous characters past the
5615  * end of a valid number are ignored. If there is not a valid number at the
5616  * start of <i>str</i>, <code>0</code> is returned. This method never raises an
5617  * exception when <i>base</i> is valid.
5618  *
5619  * "12345".to_i #=> 12345
5620  * "99 red balloons".to_i #=> 99
5621  * "0a".to_i #=> 0
5622  * "0a".to_i(16) #=> 10
5623  * "hello".to_i #=> 0
5624  * "1100101".to_i(2) #=> 101
5625  * "1100101".to_i(8) #=> 294977
5626  * "1100101".to_i(10) #=> 1100101
5627  * "1100101".to_i(16) #=> 17826049
5628  */
5629 
5630 static VALUE
5631 rb_str_to_i(int argc, VALUE *argv, VALUE str)
5632 {
5633  int base;
5634 
5635  if (argc == 0) base = 10;
5636  else {
5637  VALUE b;
5638 
5639  rb_scan_args(argc, argv, "01", &b);
5640  base = NUM2INT(b);
5641  }
5642  if (base < 0) {
5643  rb_raise(rb_eArgError, "invalid radix %d", base);
5644  }
5645  return rb_str_to_inum(str, base, FALSE);
5646 }
5647 
5648 
5649 /*
5650  * call-seq:
5651  * str.to_f -> float
5652  *
5653  * Returns the result of interpreting leading characters in <i>str</i> as a
5654  * floating point number. Extraneous characters past the end of a valid number
5655  * are ignored. If there is not a valid number at the start of <i>str</i>,
5656  * <code>0.0</code> is returned. This method never raises an exception.
5657  *
5658  * "123.45e1".to_f #=> 1234.5
5659  * "45.67 degrees".to_f #=> 45.67
5660  * "thx1138".to_f #=> 0.0
5661  */
5662 
5663 static VALUE
5664 rb_str_to_f(VALUE str)
5665 {
5666  return DBL2NUM(rb_str_to_dbl(str, FALSE));
5667 }
5668 
5669 
5670 /*
5671  * call-seq:
5672  * str.to_s -> str
5673  * str.to_str -> str
5674  *
5675  * Returns +self+.
5676  *
5677  * If called on a subclass of String, converts the receiver to a String object.
5678  */
5679 
5680 static VALUE
5681 rb_str_to_s(VALUE str)
5682 {
5683  if (rb_obj_class(str) != rb_cString) {
5684  return str_duplicate(rb_cString, str);
5685  }
5686  return str;
5687 }
5688 
#if 0
/* Appends the single code point +c+, encoded in +enc+, to +str+. (compiled out) */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char encoded[RUBY_MAX_CHAR_LEN];
    int nbytes = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, encoded, enc);
    rb_enc_str_buf_cat(str, encoded, nbytes, enc);
}
#endif
5700 
5701 #define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
5702 
5703 int
5704 rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
5705 {
5706  char buf[CHAR_ESC_LEN + 1];
5707  int l;
5708 
5709 #if SIZEOF_INT > 4
5710  c &= 0xffffffff;
5711 #endif
5712  if (unicode_p) {
5713  if (c < 0x7F && ISPRINT(c)) {
5714  snprintf(buf, CHAR_ESC_LEN, "%c", c);
5715  }
5716  else if (c < 0x10000) {
5717  snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
5718  }
5719  else {
5720  snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
5721  }
5722  }
5723  else {
5724  if (c < 0x100) {
5725  snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
5726  }
5727  else {
5728  snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
5729  }
5730  }
5731  l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
5732  rb_str_buf_cat(result, buf, l);
5733  return l;
5734 }
5735 
5736 VALUE
5738 {
5739  int encidx = ENCODING_GET(str);
5740  rb_encoding *enc = rb_enc_from_index(encidx);
5741  const char *p = RSTRING_PTR(str);
5742  const char *pend = RSTRING_END(str);
5743  const char *prev = p;
5744  char buf[CHAR_ESC_LEN + 1];
5745  VALUE result = rb_str_buf_new(0);
5746  int unicode_p = rb_enc_unicode_p(enc);
5747  int asciicompat = rb_enc_asciicompat(enc);
5748 
5749  while (p < pend) {
5750  unsigned int c, cc;
5751  int n = rb_enc_precise_mbclen(p, pend, enc);
5752  if (!MBCLEN_CHARFOUND_P(n)) {
5753  if (p > prev) str_buf_cat(result, prev, p - prev);
5754  n = rb_enc_mbminlen(enc);
5755  if (pend < p + n)
5756  n = (int)(pend - p);
5757  while (n--) {
5758  snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
5759  str_buf_cat(result, buf, strlen(buf));
5760  prev = ++p;
5761  }
5762  continue;
5763  }
5764  n = MBCLEN_CHARFOUND_LEN(n);
5765  c = rb_enc_mbc_to_codepoint(p, pend, enc);
5766  p += n;
5767  switch (c) {
5768  case '\n': cc = 'n'; break;
5769  case '\r': cc = 'r'; break;
5770  case '\t': cc = 't'; break;
5771  case '\f': cc = 'f'; break;
5772  case '\013': cc = 'v'; break;
5773  case '\010': cc = 'b'; break;
5774  case '\007': cc = 'a'; break;
5775  case 033: cc = 'e'; break;
5776  default: cc = 0; break;
5777  }
5778  if (cc) {
5779  if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
5780  buf[0] = '\\';
5781  buf[1] = (char)cc;
5782  str_buf_cat(result, buf, 2);
5783  prev = p;
5784  }
5785  else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
5786  }
5787  else {
5788  if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
5789  rb_str_buf_cat_escaped_char(result, c, unicode_p);
5790  prev = p;
5791  }
5792  }
5793  if (p > prev) str_buf_cat(result, prev, p - prev);
5795 
5796  OBJ_INFECT_RAW(result, str);
5797  return result;
5798 }
5799 
5800 /*
5801  * call-seq:
5802  * str.inspect -> string
5803  *
5804  * Returns a printable version of _str_, surrounded by quote marks,
5805  * with special characters escaped.
5806  *
5807  * str = "hello"
5808  * str[3] = "\b"
5809  * str.inspect #=> "\"hel\\bo\""
5810  */
5811 
5812 VALUE
5814 {
5815  int encidx = ENCODING_GET(str);
5816  rb_encoding *enc = rb_enc_from_index(encidx), *actenc;
5817  const char *p, *pend, *prev;
5818  char buf[CHAR_ESC_LEN + 1];
5819  VALUE result = rb_str_buf_new(0);
5821  int unicode_p = rb_enc_unicode_p(enc);
5822  int asciicompat = rb_enc_asciicompat(enc);
5823 
5824  if (resenc == NULL) resenc = rb_default_external_encoding();
5825  if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
5826  rb_enc_associate(result, resenc);
5827  str_buf_cat2(result, "\"");
5828 
5829  p = RSTRING_PTR(str); pend = RSTRING_END(str);
5830  prev = p;
5831  actenc = get_actual_encoding(encidx, str);
5832  if (actenc != enc) {
5833  enc = actenc;
5834  if (unicode_p) unicode_p = rb_enc_unicode_p(enc);
5835  }
5836  while (p < pend) {
5837  unsigned int c, cc;
5838  int n;
5839 
5840  n = rb_enc_precise_mbclen(p, pend, enc);
5841  if (!MBCLEN_CHARFOUND_P(n)) {
5842  if (p > prev) str_buf_cat(result, prev, p - prev);
5843  n = rb_enc_mbminlen(enc);
5844  if (pend < p + n)
5845  n = (int)(pend - p);
5846  while (n--) {
5847  snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
5848  str_buf_cat(result, buf, strlen(buf));
5849  prev = ++p;
5850  }
5851  continue;
5852  }
5853  n = MBCLEN_CHARFOUND_LEN(n);
5854  c = rb_enc_mbc_to_codepoint(p, pend, enc);
5855  p += n;
5856  if ((asciicompat || unicode_p) &&
5857  (c == '"'|| c == '\\' ||
5858  (c == '#' &&
5859  p < pend &&
5861  (cc = rb_enc_codepoint(p,pend,enc),
5862  (cc == '$' || cc == '@' || cc == '{'))))) {
5863  if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
5864  str_buf_cat2(result, "\\");
5865  if (asciicompat || enc == resenc) {
5866  prev = p - n;
5867  continue;
5868  }
5869  }
5870  switch (c) {
5871  case '\n': cc = 'n'; break;
5872  case '\r': cc = 'r'; break;
5873  case '\t': cc = 't'; break;
5874  case '\f': cc = 'f'; break;
5875  case '\013': cc = 'v'; break;
5876  case '\010': cc = 'b'; break;
5877  case '\007': cc = 'a'; break;
5878  case 033: cc = 'e'; break;
5879  default: cc = 0; break;
5880  }
5881  if (cc) {
5882  if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
5883  buf[0] = '\\';
5884  buf[1] = (char)cc;
5885  str_buf_cat(result, buf, 2);
5886  prev = p;
5887  continue;
5888  }
5889  if ((enc == resenc && rb_enc_isprint(c, enc)) ||
5890  (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
5891  continue;
5892  }
5893  else {
5894  if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
5895  rb_str_buf_cat_escaped_char(result, c, unicode_p);
5896  prev = p;
5897  continue;
5898  }
5899  }
5900  if (p > prev) str_buf_cat(result, prev, p - prev);
5901  str_buf_cat2(result, "\"");
5902 
5903  OBJ_INFECT_RAW(result, str);
5904  return result;
5905 }
5906 
5907 #define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
5908 
5909 /*
5910  * call-seq:
5911  * str.dump -> new_str
5912  *
5913  * Produces a version of +str+ with all non-printing characters replaced by
5914  * <code>\nnn</code> notation and all special characters escaped.
5915  *
5916  * "hello \n ''".dump #=> "\"hello \\n ''\""
5917  */
5918 
5919 VALUE
5921 {
5922  int encidx = rb_enc_get_index(str);
5923  rb_encoding *enc = rb_enc_from_index(encidx);
5924  long len;
5925  const char *p, *pend;
5926  char *q, *qend;
5927  VALUE result;
5928  int u8 = (encidx == rb_utf8_encindex());
5929  static const char nonascii_suffix[] = ".force_encoding(\"%s\")";
5930 
5931  len = 2; /* "" */
5932  if (!rb_enc_asciicompat(enc)) {
5933  len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
5934  len += strlen(enc->name);
5935  }
5936 
5937  p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
5938  while (p < pend) {
5939  int clen;
5940  unsigned char c = *p++;
5941 
5942  switch (c) {
5943  case '"': case '\\':
5944  case '\n': case '\r':
5945  case '\t': case '\f':
5946  case '\013': case '\010': case '\007': case '\033':
5947  clen = 2;
5948  break;
5949 
5950  case '#':
5951  clen = IS_EVSTR(p, pend) ? 2 : 1;
5952  break;
5953 
5954  default:
5955  if (ISPRINT(c)) {
5956  clen = 1;
5957  }
5958  else {
5959  if (u8 && c > 0x7F) { /* \u notation */
5960  int n = rb_enc_precise_mbclen(p-1, pend, enc);
5961  if (MBCLEN_CHARFOUND_P(n)) {
5962  unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
5963  if (cc <= 0xFFFF)
5964  clen = 6; /* \uXXXX */
5965  else if (cc <= 0xFFFFF)
5966  clen = 9; /* \u{XXXXX} */
5967  else
5968  clen = 10; /* \u{XXXXXX} */
5969  p += MBCLEN_CHARFOUND_LEN(n)-1;
5970  break;
5971  }
5972  }
5973  clen = 4; /* \xNN */
5974  }
5975  break;
5976  }
5977 
5978  if (clen > LONG_MAX - len) {
5979  rb_raise(rb_eRuntimeError, "string size too big");
5980  }
5981  len += clen;
5982  }
5983 
5984  result = rb_str_new_with_class(str, 0, len);
5985  p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
5986  q = RSTRING_PTR(result); qend = q + len + 1;
5987 
5988  *q++ = '"';
5989  while (p < pend) {
5990  unsigned char c = *p++;
5991 
5992  if (c == '"' || c == '\\') {
5993  *q++ = '\\';
5994  *q++ = c;
5995  }
5996  else if (c == '#') {
5997  if (IS_EVSTR(p, pend)) *q++ = '\\';
5998  *q++ = '#';
5999  }
6000  else if (c == '\n') {
6001  *q++ = '\\';
6002  *q++ = 'n';
6003  }
6004  else if (c == '\r') {
6005  *q++ = '\\';
6006  *q++ = 'r';
6007  }
6008  else if (c == '\t') {
6009  *q++ = '\\';
6010  *q++ = 't';
6011  }
6012  else if (c == '\f') {
6013  *q++ = '\\';
6014  *q++ = 'f';
6015  }
6016  else if (c == '\013') {
6017  *q++ = '\\';
6018  *q++ = 'v';
6019  }
6020  else if (c == '\010') {
6021  *q++ = '\\';
6022  *q++ = 'b';
6023  }
6024  else if (c == '\007') {
6025  *q++ = '\\';
6026  *q++ = 'a';
6027  }
6028  else if (c == '\033') {
6029  *q++ = '\\';
6030  *q++ = 'e';
6031  }
6032  else if (ISPRINT(c)) {
6033  *q++ = c;
6034  }
6035  else {
6036  *q++ = '\\';
6037  if (u8) {
6038  int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
6039  if (MBCLEN_CHARFOUND_P(n)) {
6040  int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
6041  p += n;
6042  if (cc <= 0xFFFF)
6043  snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
6044  else
6045  snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
6046  q += strlen(q);
6047  continue;
6048  }
6049  }
6050  snprintf(q, qend-q, "x%02X", c);
6051  q += 3;
6052  }
6053  }
6054  *q++ = '"';
6055  *q = '\0';
6056  if (!rb_enc_asciicompat(enc)) {
6057  snprintf(q, qend-q, nonascii_suffix, enc->name);
6058  encidx = rb_ascii8bit_encindex();
6059  }
6060  OBJ_INFECT_RAW(result, str);
6061  /* result from dump is ASCII */
6062  rb_enc_associate_index(result, encidx);
6064  return result;
6065 }
6066 
6067 
6068 static void
6069 rb_str_check_dummy_enc(rb_encoding *enc)
6070 {
6071  if (rb_enc_dummy_p(enc)) {
6072  rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
6073  rb_enc_name(enc));
6074  }
6075 }
6076 
6077 static OnigCaseFoldType
6078 check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
6079 {
6080  if (argc==0)
6081  return flags;
6082  if (argc>2)
6083  rb_raise(rb_eArgError, "too many options");
6084  if (argv[0]==sym_turkic) {
6086  if (argc==2) {
6087  if (argv[1]==sym_lithuanian)
6089  else
6090  rb_raise(rb_eArgError, "invalid second option");
6091  }
6092  }
6093  else if (argv[0]==sym_lithuanian) {
6095  if (argc==2) {
6096  if (argv[1]==sym_turkic)
6098  else
6099  rb_raise(rb_eArgError, "invalid second option");
6100  }
6101  }
6102  else if (argc>1)
6103  rb_raise(rb_eArgError, "too many options");
6104  else if (argv[0]==sym_ascii)
6105  flags |= ONIGENC_CASE_ASCII_ONLY;
6106  else if (argv[0]==sym_fold) {
6109  else
6110  rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
6111  }
6112  else
6113  rb_raise(rb_eArgError, "invalid option");
6114  return flags;
6115 }
6116 
/*
 * Number of bytes added to the capacity of every case-mapping buffer.
 * 16 should be long enough to absorb any kind of single character length
 * increase; the value actually reserved is 20.
 */
#define CASE_MAPPING_ADDITIONAL_LENGTH 20
/* Non-zero enables the tuning/diagnostic output of the case-mapping code. */
#if !defined(CASEMAP_DEBUG)
# define CASEMAP_DEBUG 0
#endif
6122 
6123 struct mapping_buffer;
6124 typedef struct mapping_buffer {
6125  size_t capa;
6126  size_t used;
6128  OnigUChar space[1];
6129 } mapping_buffer;
6130 
6131 static VALUE
6132 rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
6133 {
6134  VALUE target;
6135 
6136  OnigUChar *source_current, *source_end;
6137  int target_length = 0;
6138  mapping_buffer pre_buffer, /* only next pointer used */
6139  *current_buffer = &pre_buffer;
6140  size_t buffer_count = 0;
6141  int buffer_length_or_invalid;
6142 
6143  if (RSTRING_LEN(source) == 0) return rb_str_dup(source);
6144 
6145  source_current = (OnigUChar*)RSTRING_PTR(source);
6146  source_end = (OnigUChar*)RSTRING_END(source);
6147 
6148  while (source_current < source_end) {
6149  /* increase multiplier using buffer count to converge quickly */
6150  size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
6151  if (CASEMAP_DEBUG) {
6152  fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
6153  }
6154  current_buffer->next = xmalloc(offsetof(mapping_buffer, space) + capa);
6155  current_buffer = current_buffer->next;
6156  current_buffer->next = NULL;
6157  current_buffer->capa = capa;
6158  buffer_length_or_invalid = enc->case_map(flags,
6159  (const OnigUChar**)&source_current, source_end,
6160  current_buffer->space,
6161  current_buffer->space+current_buffer->capa,
6162  enc);
6163  if (buffer_length_or_invalid < 0) {
6164  mapping_buffer *previous_buffer;
6165 
6166  current_buffer = pre_buffer.next;
6167  while (current_buffer) {
6168  previous_buffer = current_buffer;
6169  current_buffer = current_buffer->next;
6170  xfree(previous_buffer);
6171  }
6172  rb_raise(rb_eArgError, "input string invalid");
6173  }
6174  target_length += current_buffer->used = buffer_length_or_invalid;
6175  }
6176  if (CASEMAP_DEBUG) {
6177  fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
6178  }
6179 
6180  if (buffer_count==1) {
6181  target = rb_str_new_with_class(source, (const char*)current_buffer->space, target_length);
6182  xfree(current_buffer);
6183  }
6184  else {
6185  char *target_current;
6186  mapping_buffer *previous_buffer;
6187 
6188  target = rb_str_new_with_class(source, 0, target_length);
6189  target_current = RSTRING_PTR(target);
6190  current_buffer=pre_buffer.next;
6191  while (current_buffer) {
6192  memcpy(target_current, current_buffer->space, current_buffer->used);
6193  target_current += current_buffer->used;
6194  previous_buffer = current_buffer;
6195  current_buffer = current_buffer->next;
6196  xfree(previous_buffer);
6197  }
6198  }
6199 
6200  /* TODO: check about string terminator character */
6201  OBJ_INFECT_RAW(target, source);
6202  str_enc_copy(target, source);
6203  /*ENC_CODERANGE_SET(mapped, cr);*/
6204 
6205  return target;
6206 }
6207 
6208 static void
6209 rb_str_ascii_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
6210 {
6211  OnigUChar *source_current, *source_end;
6212  long old_length = RSTRING_LEN(source);
6213  int length_or_invalid;
6214 
6215  if (old_length == 0) return;
6216 
6217  source_current = (OnigUChar*)RSTRING_PTR(source);
6218  source_end = (OnigUChar*)RSTRING_END(source);
6219 
6220  length_or_invalid = onigenc_ascii_only_case_map(flags,
6221  (const OnigUChar**)&source_current, source_end,
6222  source_current, source_end, enc);
6223  if (length_or_invalid < 0)
6224  rb_raise(rb_eArgError, "input string invalid");
6225  if (CASEMAP_DEBUG && length_or_invalid != old_length) {
6226  fprintf(stderr, "problem with rb_str_ascii_casemap"
6227  "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
6228  rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
6229  "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
6230  }
6231 }
6232 
6233 /*
6234  * call-seq:
6235  * str.upcase! -> str or nil
6236  * str.upcase!([options]) -> str or nil
6237  *
6238  * Upcases the contents of <i>str</i>, returning <code>nil</code> if no changes
6239  * were made.
6240  *
6241  * See String#downcase for meaning of +options+ and use with different encodings.
6242  */
6243 
6244 static VALUE
6245 rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
6246 {
6247  rb_encoding *enc;
6249 
6250  flags = check_case_options(argc, argv, flags);
6251  str_modify_keep_cr(str);
6252  enc = STR_ENC_GET(str);
6253  rb_str_check_dummy_enc(enc);
6254  if (((flags&ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc)==1))
6256  char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
6257 
6258  while (s < send) {
6259  unsigned int c = *(unsigned char*)s;
6260 
6261  if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
6262  *s = 'A' + (c - 'a');
6263  flags |= ONIGENC_CASE_MODIFIED;
6264  }
6265  s++;
6266  }
6267  }
6268  else if (flags&ONIGENC_CASE_ASCII_ONLY)
6269  rb_str_ascii_casemap(str, &flags, enc);
6270  else
6271  str_shared_replace(str, rb_str_casemap(str, &flags, enc));
6272 
6273  if (ONIGENC_CASE_MODIFIED&flags) return str;
6274  return Qnil;
6275 }
6276 
6277 
6278 /*
6279  * call-seq:
6280  * str.upcase -> new_str
6281  * str.upcase([options]) -> new_str
6282  *
6283  * Returns a copy of <i>str</i> with all lowercase letters replaced with their
6284  * uppercase counterparts.
6285  *
6286  * See String#downcase for meaning of +options+ and use with different encodings.
6287  *
6288  * "hEllO".upcase #=> "HELLO"
6289  */
6290 
6291 static VALUE
6292 rb_str_upcase(int argc, VALUE *argv, VALUE str)
6293 {
6294  str = rb_str_dup(str);
6295  rb_str_upcase_bang(argc, argv, str);
6296  return str;
6297 }
6298 
6299 /*
6300  * call-seq:
6301  * str.downcase! -> str or nil
6302  * str.downcase!([options]) -> str or nil
6303  *
6304  * Downcases the contents of <i>str</i>, returning <code>nil</code> if no
6305  * changes were made.
6306  *
6307  * See String#downcase for meaning of +options+ and use with different encodings.
6308  */
6309 
6310 static VALUE
6311 rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
6312 {
6313  rb_encoding *enc;
6315 
6316  flags = check_case_options(argc, argv, flags);
6317  str_modify_keep_cr(str);
6318  enc = STR_ENC_GET(str);
6319  rb_str_check_dummy_enc(enc);
6320  if (((flags&ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc)==1))
6322  char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
6323 
6324  while (s < send) {
6325  unsigned int c = *(unsigned char*)s;
6326 
6327  if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
6328  *s = 'a' + (c - 'A');
6329  flags |= ONIGENC_CASE_MODIFIED;
6330  }
6331  s++;
6332  }
6333  }
6334  else if (flags&ONIGENC_CASE_ASCII_ONLY)
6335  rb_str_ascii_casemap(str, &flags, enc);
6336  else
6337  str_shared_replace(str, rb_str_casemap(str, &flags, enc));
6338 
6339  if (ONIGENC_CASE_MODIFIED&flags) return str;
6340  return Qnil;
6341 }
6342 
6343 
6344 /*
6345  * call-seq:
6346  * str.downcase -> new_str
6347  * str.downcase([options]) -> new_str
6348  *
6349  * Returns a copy of <i>str</i> with all uppercase letters replaced with their
6350  * lowercase counterparts. Which letters exactly are replaced, and by which
6351  * other letters, depends on the presence or absence of options, and on the
6352  * +encoding+ of the string.
6353  *
6354  * The meaning of the +options+ is as follows:
6355  *
6356  * No option ::
6357  * Full Unicode case mapping, suitable for most languages
6358  * (see :turkic and :lithuanian options below for exceptions).
6359  * Context-dependent case mapping as described in Table 3-14 of the
6360  * Unicode standard is currently not supported.
6361  * :ascii ::
6362  * Only the ASCII region, i.e. the characters ``A'' to ``Z'' and
6363  * ``a'' to ``z'', are affected.
6364  * This option cannot be combined with any other option.
6365  * :turkic ::
6366  * Full Unicode case mapping, adapted for Turkic languages
6367  * (Turkish, Aserbaijani,...). This means that upper case I is mapped to
6368  * lower case dotless i, and so on.
6369  * :lithuanian ::
6370  * Currently, just full Unicode case mapping. In the future, full Unicode
6371  * case mapping adapted for Lithuanian (keeping the dot on the lower case
6372  * i even if there is an accent on top).
6373  * :fold ::
6374  * Only available on +downcase+ and +downcase!+. Unicode case <b>folding</b>,
6375  * which is more far-reaching than Unicode case mapping.
6376  * This option currently cannot be combined with any other option
6377  * (i.e. there is currenty no variant for turkic languages).
6378  *
6379  * Please note that several assumptions that are valid for ASCII-only case
6380  * conversions do not hold for more general case conversions. For example,
6381  * the length of the result may not be the same as the length of the input
6382  * (neither in characters nor in bytes), some roundtrip assumptions
6383  * (e.g. str.downcase == str.upcase.downcase) may not apply, and Unicode
6384  * normalization (i.e. String#unicode_normalize) is not necessarily maintained
6385  * by case mapping operations.
6386  *
6387  * Non-ASCII case mapping/folding is currently supported for UTF-8,
6388  * UTF-16BE/LE, UTF-32BE/LE, and ISO-8859-1~16 Strings/Symbols.
6389  * This support will be extended to other encodings.
6390  *
6391  * "hEllO".downcase #=> "hello"
6392  */
6393 
6394 static VALUE
6395 rb_str_downcase(int argc, VALUE *argv, VALUE str)
6396 {
6397  str = rb_str_dup(str);
6398  rb_str_downcase_bang(argc, argv, str);
6399  return str;
6400 }
6401 
6402 
6403 /*
6404  * call-seq:
6405  * str.capitalize! -> str or nil
6406  * str.capitalize!([options]) -> str or nil
6407  *
6408  * Modifies <i>str</i> by converting the first character to uppercase and the
6409  * remainder to lowercase. Returns <code>nil</code> if no changes are made.
6410  *
6411  * See String#downcase for meaning of +options+ and use with different encodings.
6412  *
6413  * a = "hello"
6414  * a.capitalize! #=> "Hello"
6415  * a #=> "Hello"
6416  * a.capitalize! #=> nil
6417  */
6418 
6419 static VALUE
6420 rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
6421 {
6422  rb_encoding *enc;
6424 
6425  flags = check_case_options(argc, argv, flags);
6426  str_modify_keep_cr(str);
6427  enc = STR_ENC_GET(str);
6428  rb_str_check_dummy_enc(enc);
6429  if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
6430  if (flags&ONIGENC_CASE_ASCII_ONLY)
6431  rb_str_ascii_casemap(str, &flags, enc);
6432  else
6433  str_shared_replace(str, rb_str_casemap(str, &flags, enc));
6434 
6435  if (ONIGENC_CASE_MODIFIED&flags) return str;
6436  return Qnil;
6437 }
6438 
6439 
6440 /*
6441  * call-seq:
6442  * str.capitalize -> new_str
6443  * str.capitalize([options]) -> new_str
6444  *
6445  * Returns a copy of <i>str</i> with the first character converted to uppercase
6446  * and the remainder to lowercase.
6447  *
6448  * See String#downcase for meaning of +options+ and use with different encodings.
6449  *
6450  * "hello".capitalize #=> "Hello"
6451  * "HELLO".capitalize #=> "Hello"
6452  * "123ABC".capitalize #=> "123abc"
6453  */
6454 
6455 static VALUE
6456 rb_str_capitalize(int argc, VALUE *argv, VALUE str)
6457 {
6458  str = rb_str_dup(str);
6459  rb_str_capitalize_bang(argc, argv, str);
6460  return str;
6461 }
6462 
6463 
6464 /*
6465  * call-seq:
6466  * str.swapcase! -> str or nil
6467  * str.swapcase!([options]) -> str or nil
6468  *
6469  * Equivalent to <code>String#swapcase</code>, but modifies the receiver in
6470  * place, returning <i>str</i>, or <code>nil</code> if no changes were made.
6471  *
6472  * See String#downcase for meaning of +options+ and use with different encodings.
6473  */
6474 
6475 static VALUE
6476 rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
6477 {
6478  rb_encoding *enc;
6480 
6481  flags = check_case_options(argc, argv, flags);
6482  str_modify_keep_cr(str);
6483  enc = STR_ENC_GET(str);
6484  rb_str_check_dummy_enc(enc);
6485  if (flags&ONIGENC_CASE_ASCII_ONLY)
6486  rb_str_ascii_casemap(str, &flags, enc);
6487  else
6488  str_shared_replace(str, rb_str_casemap(str, &flags, enc));
6489 
6490  if (ONIGENC_CASE_MODIFIED&flags) return str;
6491  return Qnil;
6492 }
6493 
6494 
6495 /*
6496  * call-seq:
6497  * str.swapcase -> new_str
6498  * str.swapcase([options]) -> new_str
6499  *
6500  * Returns a copy of <i>str</i> with uppercase alphabetic characters converted
6501  * to lowercase and lowercase characters converted to uppercase.
6502  *
6503  * See String#downcase for meaning of +options+ and use with different encodings.
6504  *
6505  * "Hello".swapcase #=> "hELLO"
6506  * "cYbEr_PuNk11".swapcase #=> "CyBeR_pUnK11"
6507  */
6508 
6509 static VALUE
6510 rb_str_swapcase(int argc, VALUE *argv, VALUE str)
6511 {
6512  str = rb_str_dup(str);
6513  rb_str_swapcase_bang(argc, argv, str);
6514  return str;
6515 }
6516 
typedef unsigned char *USTR;

/*
 * Cursor over a tr-style character specification, advanced by trnext().
 */
struct tr {
    int gen;            /* non-zero while a "c1-c2" range is being expanded */
    unsigned int now;   /* code point produced most recently */
    unsigned int max;   /* last code point of the range being expanded */
    char *p;            /* read position inside the specification */
    char *pend;         /* end of the specification */
};
6524 
6525 static unsigned int
6526 trnext(struct tr *t, rb_encoding *enc)
6527 {
6528  int n;
6529 
6530  for (;;) {
6531  if (!t->gen) {
6532 nextpart:
6533  if (t->p == t->pend) return -1;
6534  if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
6535  t->p += n;
6536  }
6537  t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
6538  t->p += n;
6539  if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
6540  t->p += n;
6541  if (t->p < t->pend) {
6542  unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
6543  t->p += n;
6544  if (t->now > c) {
6545  if (t->now < 0x80 && c < 0x80) {
6547  "invalid range \"%c-%c\" in string transliteration",
6548  t->now, c);
6549  }
6550  else {
6551  rb_raise(rb_eArgError, "invalid range in string transliteration");
6552  }
6553  continue; /* not reached */
6554  }
6555  t->gen = 1;
6556  t->max = c;
6557  }
6558  }
6559  return t->now;
6560  }
6561  else {
6562  while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
6563  if (t->now == t->max) {
6564  t->gen = 0;
6565  goto nextpart;
6566  }
6567  }
6568  if (t->now < t->max) {
6569  return t->now;
6570  }
6571  else {
6572  t->gen = 0;
6573  return t->max;
6574  }
6575  }
6576  }
6577 }
6578 
6579 static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
6580 
6581 static VALUE
6582 tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
6583 {
6584  const unsigned int errc = -1;
6585  unsigned int trans[256];
6586  rb_encoding *enc, *e1, *e2;
6587  struct tr trsrc, trrepl;
6588  int cflag = 0;
6589  unsigned int c, c0, last = 0;
6590  int modify = 0, i, l;
6591  char *s, *send;
6592  VALUE hash = 0;
6593  int singlebyte = single_byte_optimizable(str);
6594  int termlen;
6595  int cr;
6596 
6597 #define CHECK_IF_ASCII(c) \
6598  (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
6599  (cr = ENC_CODERANGE_VALID) : 0)
6600 
6601  StringValue(src);
6602  StringValue(repl);
6603  if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
6604  if (RSTRING_LEN(repl) == 0) {
6605  return rb_str_delete_bang(1, &src, str);
6606  }
6607 
6608  cr = ENC_CODERANGE(str);
6609  e1 = rb_enc_check(str, src);
6610  e2 = rb_enc_check(str, repl);
6611  if (e1 == e2) {
6612  enc = e1;
6613  }
6614  else {
6615  enc = rb_enc_check(src, repl);
6616  }
6617  trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
6618  if (RSTRING_LEN(src) > 1 &&
6619  rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
6620  trsrc.p + l < trsrc.pend) {
6621  cflag = 1;
6622  trsrc.p += l;
6623  }
6624  trrepl.p = RSTRING_PTR(repl);
6625  trrepl.pend = trrepl.p + RSTRING_LEN(repl);
6626  trsrc.gen = trrepl.gen = 0;
6627  trsrc.now = trrepl.now = 0;
6628  trsrc.max = trrepl.max = 0;
6629 
6630  if (cflag) {
6631  for (i=0; i<256; i++) {
6632  trans[i] = 1;
6633  }
6634  while ((c = trnext(&trsrc, enc)) != errc) {
6635  if (c < 256) {
6636  trans[c] = errc;
6637  }
6638  else {
6639  if (!hash) hash = rb_hash_new();
6640  rb_hash_aset(hash, UINT2NUM(c), Qtrue);
6641  }
6642  }
6643  while ((c = trnext(&trrepl, enc)) != errc)
6644  /* retrieve last replacer */;
6645  last = trrepl.now;
6646  for (i=0; i<256; i++) {
6647  if (trans[i] != errc) {
6648  trans[i] = last;
6649  }
6650  }
6651  }
6652  else {
6653  unsigned int r;
6654 
6655  for (i=0; i<256; i++) {
6656  trans[i] = errc;
6657  }
6658  while ((c = trnext(&trsrc, enc)) != errc) {
6659  r = trnext(&trrepl, enc);
6660  if (r == errc) r = trrepl.now;
6661  if (c < 256) {
6662  trans[c] = r;
6663  if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
6664  }
6665  else {
6666  if (!hash) hash = rb_hash_new();
6667  rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
6668  }
6669  }
6670  }
6671 
6672  if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
6673  cr = ENC_CODERANGE_7BIT;
6674  str_modify_keep_cr(str);
6675  s = RSTRING_PTR(str); send = RSTRING_END(str);
6676  termlen = rb_enc_mbminlen(enc);
6677  if (sflag) {
6678  int clen, tlen;
6679  long offset, max = RSTRING_LEN(str);
6680  unsigned int save = -1;
6681  char *buf = ALLOC_N(char, max + termlen), *t = buf;
6682 
6683  while (s < send) {
6684  int may_modify = 0;
6685 
6686  c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
6687  tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
6688 
6689  s += clen;
6690  if (c < 256) {
6691  c = trans[c];
6692  }
6693  else if (hash) {
6694  VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
6695  if (NIL_P(tmp)) {
6696  if (cflag) c = last;
6697  else c = errc;
6698  }
6699  else if (cflag) c = errc;
6700  else c = NUM2INT(tmp);
6701  }
6702  else {
6703  c = errc;
6704  }
6705  if (c != (unsigned int)-1) {
6706  if (save == c) {
6707  CHECK_IF_ASCII(c);
6708  continue;
6709  }
6710  save = c;
6711  tlen = rb_enc_codelen(c, enc);
6712  modify = 1;
6713  }
6714  else {
6715  save = -1;
6716  c = c0;
6717  if (enc != e1) may_modify = 1;
6718  }
6719  if ((offset = t - buf) + tlen > max) {
6720  max = offset + tlen + (send - s);
6721  REALLOC_N(buf, char, max + termlen);
6722  t = buf + offset;
6723  }
6724  rb_enc_mbcput(c, t, enc);
6725  if (may_modify && memcmp(s, t, tlen) != 0) {
6726  modify = 1;
6727  }
6728  CHECK_IF_ASCII(c);
6729  t += tlen;
6730  }
6731  if (!STR_EMBED_P(str)) {
6733  }
6734  TERM_FILL(t, termlen);
6735  RSTRING(str)->as.heap.ptr = buf;
6736  RSTRING(str)->as.heap.len = t - buf;
6737  STR_SET_NOEMBED(str);
6738  RSTRING(str)->as.heap.aux.capa = max;
6739  }
6740  else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
6741  while (s < send) {
6742  c = (unsigned char)*s;
6743  if (trans[c] != errc) {
6744  if (!cflag) {
6745  c = trans[c];
6746  *s = c;
6747  modify = 1;
6748  }
6749  else {
6750  *s = last;
6751  modify = 1;
6752  }
6753  }
6754  CHECK_IF_ASCII(c);
6755  s++;
6756  }
6757  }
6758  else {
6759  int clen, tlen;
6760  long offset, max = (long)((send - s) * 1.2);
6761  char *buf = ALLOC_N(char, max + termlen), *t = buf;
6762 
6763  while (s < send) {
6764  int may_modify = 0;
6765  c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
6766  tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
6767 
6768  if (c < 256) {
6769  c = trans[c];
6770  }
6771  else if (hash) {
6772  VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
6773  if (NIL_P(tmp)) {
6774  if (cflag) c = last;
6775  else c = errc;
6776  }
6777  else if (cflag) c = errc;
6778  else c = NUM2INT(tmp);
6779  }
6780  else {
6781  c = cflag ? last : errc;
6782  }
6783  if (c != errc) {
6784  tlen = rb_enc_codelen(c, enc);
6785  modify = 1;
6786  }
6787  else {
6788  c = c0;
6789  if (enc != e1) may_modify = 1;
6790  }
6791  if ((offset = t - buf) + tlen > max) {
6792  max = offset + tlen + (long)((send - s) * 1.2);
6793  REALLOC_N(buf, char, max + termlen);
6794  t = buf + offset;
6795  }
6796  if (s != t) {
6797  rb_enc_mbcput(c, t, enc);
6798  if (may_modify && memcmp(s, t, tlen) != 0) {
6799  modify = 1;
6800  }
6801  }
6802  CHECK_IF_ASCII(c);
6803  s += clen;
6804  t += tlen;
6805  }
6806  if (!STR_EMBED_P(str)) {
6808  }
6809  TERM_FILL(t, termlen);
6810  RSTRING(str)->as.heap.ptr = buf;
6811  RSTRING(str)->as.heap.len = t - buf;
6812  STR_SET_NOEMBED(str);
6813  RSTRING(str)->as.heap.aux.capa = max;
6814  }
6815 
6816  if (modify) {
6817  if (cr != ENC_CODERANGE_BROKEN)
6818  ENC_CODERANGE_SET(str, cr);
6819  rb_enc_associate(str, enc);
6820  return str;
6821  }
6822  return Qnil;
6823 }
6824 
6825 
6826 /*
6827  * call-seq:
6828  * str.tr!(from_str, to_str) -> str or nil
6829  *
6830  * Translates <i>str</i> in place, using the same rules as
6831  * <code>String#tr</code>. Returns <i>str</i>, or <code>nil</code> if no
6832  * changes were made.
6833  */
6834 
6835 static VALUE
6836 rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
6837 {
6838  return tr_trans(str, src, repl, 0);
6839 }
6840 
6841 
6842 /*
6843  * call-seq:
6844  * str.tr(from_str, to_str) => new_str
6845  *
6846  * Returns a copy of +str+ with the characters in +from_str+ replaced by the
6847  * corresponding characters in +to_str+. If +to_str+ is shorter than
6848  * +from_str+, it is padded with its last character in order to maintain the
6849  * correspondence.
6850  *
6851  * "hello".tr('el', 'ip') #=> "hippo"
6852  * "hello".tr('aeiou', '*') #=> "h*ll*"
6853  * "hello".tr('aeiou', 'AA*') #=> "hAll*"
6854  *
6855  * Both strings may use the <code>c1-c2</code> notation to denote ranges of
6856  * characters, and +from_str+ may start with a <code>^</code>, which denotes
6857  * all characters except those listed.
6858  *
6859  * "hello".tr('a-y', 'b-z') #=> "ifmmp"
6860  * "hello".tr('^aeiou', '*') #=> "*e**o"
6861  *
6862  * The backslash character <code>\</code> can be used to escape
6863  * <code>^</code> or <code>-</code> and is otherwise ignored unless it
6864  * appears at the end of a range or the end of the +from_str+ or +to_str+:
6865  *
6866  * "hello^world".tr("\\^aeiou", "*") #=> "h*ll**w*rld"
6867  * "hello-world".tr("a\\-eo", "*") #=> "h*ll**w*rld"
6868  *
6869  * "hello\r\nworld".tr("\r", "") #=> "hello\nworld"
6870  * "hello\r\nworld".tr("\\r", "") #=> "hello\r\nwold"
6871  * "hello\r\nworld".tr("\\\r", "") #=> "hello\nworld"
6872  *
6873  * "X['\\b']".tr("X\\", "") #=> "['b']"
6874  * "X['\\b']".tr("X-\\]", "") #=> "'b'"
6875  */
6876 
6877 static VALUE
6878 rb_str_tr(VALUE str, VALUE src, VALUE repl)
6879 {
6880  str = rb_str_dup(str);
6881  tr_trans(str, src, repl, 0);
6882  return str;
6883 }
6884 
6885 #define TR_TABLE_SIZE 257
6886 static void
6887 tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
6888  VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
6889 {
6890  const unsigned int errc = -1;
6891  char buf[256];
6892  struct tr tr;
6893  unsigned int c;
6894  VALUE table = 0, ptable = 0;
6895  int i, l, cflag = 0;
6896 
6897  tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
6898  tr.gen = tr.now = tr.max = 0;
6899 
6900  if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
6901  cflag = 1;
6902  tr.p += l;
6903  }
6904  if (first) {
6905  for (i=0; i<256; i++) {
6906  stable[i] = 1;
6907  }
6908  stable[256] = cflag;
6909  }
6910  else if (stable[256] && !cflag) {
6911  stable[256] = 0;
6912  }
6913  for (i=0; i<256; i++) {
6914  buf[i] = cflag;
6915  }
6916 
6917  while ((c = trnext(&tr, enc)) != errc) {
6918  if (c < 256) {
6919  buf[c & 0xff] = !cflag;
6920  }
6921  else {
6922  VALUE key = UINT2NUM(c);
6923 
6924  if (!table && (first || *tablep || stable[256])) {
6925  if (cflag) {
6926  ptable = *ctablep;
6927  table = ptable ? ptable : rb_hash_new();
6928  *ctablep = table;
6929  }
6930  else {
6931  table = rb_hash_new();
6932  ptable = *tablep;
6933  *tablep = table;
6934  }
6935  }
6936  if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
6937  rb_hash_aset(table, key, Qtrue);
6938  }
6939  }
6940  }
6941  for (i=0; i<256; i++) {
6942  stable[i] = stable[i] && buf[i];
6943  }
6944  if (!table && !cflag) {
6945  *tablep = 0;
6946  }
6947 }
6948 
6949 
6950 static int
6951 tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
6952 {
6953  if (c < 256) {
6954  return table[c] != 0;
6955  }
6956  else {
6957  VALUE v = UINT2NUM(c);
6958 
6959  if (del) {
6960  if (!NIL_P(rb_hash_lookup(del, v)) &&
6961  (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
6962  return TRUE;
6963  }
6964  }
6965  else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
6966  return FALSE;
6967  }
6968  return table[256] ? TRUE : FALSE;
6969  }
6970 }
6971 
6972 /*
6973  * call-seq:
6974  * str.delete!([other_str]+) -> str or nil
6975  *
6976  * Performs a <code>delete</code> operation in place, returning <i>str</i>, or
6977  * <code>nil</code> if <i>str</i> was not modified.
6978  */
6979 
6980 static VALUE
6981 rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
6982 {
6983  char squeez[TR_TABLE_SIZE];
6984  rb_encoding *enc = 0;
6985  char *s, *send, *t;
6986  VALUE del = 0, nodel = 0;
6987  int modify = 0;
6988  int i, ascompat, cr;
6989 
6990  if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
6992  for (i=0; i<argc; i++) {
6993  VALUE s = argv[i];
6994 
6995  StringValue(s);
6996  enc = rb_enc_check(str, s);
6997  tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
6998  }
6999 
7000  str_modify_keep_cr(str);
7001  ascompat = rb_enc_asciicompat(enc);
7002  s = t = RSTRING_PTR(str);
7003  send = RSTRING_END(str);
7004  cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
7005  while (s < send) {
7006  unsigned int c;
7007  int clen;
7008 
7009  if (ascompat && (c = *(unsigned char*)s) < 0x80) {
7010  if (squeez[c]) {
7011  modify = 1;
7012  }
7013  else {
7014  if (t != s) *t = c;
7015  t++;
7016  }
7017  s++;
7018  }
7019  else {
7020  c = rb_enc_codepoint_len(s, send, &clen, enc);
7021 
7022  if (tr_find(c, squeez, del, nodel)) {
7023  modify = 1;
7024  }
7025  else {
7026  if (t != s) rb_enc_mbcput(c, t, enc);
7027  t += clen;
7028  if (cr == ENC_CODERANGE_7BIT) cr = ENC_CODERANGE_VALID;
7029  }
7030  s += clen;
7031  }
7032  }
7033  TERM_FILL(t, TERM_LEN(str));
7034  STR_SET_LEN(str, t - RSTRING_PTR(str));
7035  ENC_CODERANGE_SET(str, cr);
7036 
7037  if (modify) return str;
7038  return Qnil;
7039 }
7040 
7041 
7042 /*
7043  * call-seq:
7044  * str.delete([other_str]+) -> new_str
7045  *
7046  * Returns a copy of <i>str</i> with all characters in the intersection of its
7047  * arguments deleted. Uses the same rules for building the set of characters as
7048  * <code>String#count</code>.
7049  *
7050  * "hello".delete "l","lo" #=> "heo"
7051  * "hello".delete "lo" #=> "he"
7052  * "hello".delete "aeiou", "^e" #=> "hell"
7053  * "hello".delete "ej-m" #=> "ho"
7054  */
7055 
7056 static VALUE
7057 rb_str_delete(int argc, VALUE *argv, VALUE str)
7058 {
7059  str = rb_str_dup(str);
7060  rb_str_delete_bang(argc, argv, str);
7061  return str;
7062 }
7063 
7064 
7065 /*
7066  * call-seq:
7067  * str.squeeze!([other_str]*) -> str or nil
7068  *
7069  * Squeezes <i>str</i> in place, returning either <i>str</i>, or
7070  * <code>nil</code> if no changes were made.
7071  */
7072 
7073 static VALUE
7074 rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
7075 {
      /*
       * Collapses runs of identical characters in str in place.  With
       * arguments, only characters selected by the tr-style sets are
       * squeezed; with none, every run is.  Returns str when its length
       * changed and nil otherwise.
       */
7076  char squeez[TR_TABLE_SIZE];
7077  rb_encoding *enc = 0;
7078  VALUE del = 0, nodel = 0;
      /* s: read cursor, send: end of input, t: write cursor (t <= s) */
7079  char *s, *send, *t;
7080  int i, modify = 0;
7081  int ascompat, singlebyte = single_byte_optimizable(str);
      /* save: the character most recently written through t */
7082  unsigned int save;
7083 
7084  if (argc == 0) {
      /* No sets were given, so only the receiver's encoding is needed. */
7085  enc = STR_ENC_GET(str);
7086  }
7087  else {
7088  for (i=0; i<argc; i++) {
7089  VALUE s = argv[i];
7090 
7091  StringValue(s);
7092  enc = rb_enc_check(str, s);
      /* The byte-wise loop below is only used if every set allows it too. */
7093  if (singlebyte && !single_byte_optimizable(s))
7094  singlebyte = 0;
      /* NOTE(review): the i==0 flag presumably starts a fresh table for the
       * first set and intersects later sets into it -- confirm against
       * tr_setup_table. */
7095  tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
7096  }
7097  }
7098 
7099  str_modify_keep_cr(str);
7100  s = t = RSTRING_PTR(str);
7101  if (!s || RSTRING_LEN(str) == 0) return Qnil;
7102  send = RSTRING_END(str);
      /* -1 converts to UINT_MAX: no byte value equals it, so the first
       * character is always kept. */
7103  save = -1;
7104  ascompat = rb_enc_asciicompat(enc);
7105 
7106  if (singlebyte) {
7107  while (s < send) {
7108  unsigned int c = *(unsigned char*)s++;
      /* Keep c if it differs from the last kept character, or if sets
       * were given and c is not selected by them. */
7109  if (c != save || (argc > 0 && !squeez[c])) {
7110  *t++ = save = c;
7111  }
7112  }
7113  }
7114  else {
7115  while (s < send) {
7116  unsigned int c;
7117  int clen;
7118 
      /* ASCII byte in an ASCII-compatible encoding: index squeez[] directly. */
7119  if (ascompat && (c = *(unsigned char*)s) < 0x80) {
7120  if (c != save || (argc > 0 && !squeez[c])) {
7121  *t++ = save = c;
7122  }
7123  s++;
7124  }
7125  else {
      /* Decode one full character; clen receives its byte length. */
7126  c = rb_enc_codepoint_len(s, send, &clen, enc);
7127 
7128  if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
      /* s still points at this character here; when t == s nothing has
       * been dropped yet and its bytes are already in place. */
7129  if (t != s) rb_enc_mbcput(c, t, enc);
7130  save = c;
7131  t += clen;
7132  }
7133  s += clen;
7134  }
7135  }
7136  }
7137 
      /* Terminate at the write cursor, then shrink the recorded length.
       * modify is set only when the string actually became shorter. */
7138  TERM_FILL(t, TERM_LEN(str));
7139  if (t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
7140  STR_SET_LEN(str, t - RSTRING_PTR(str));
7141  modify = 1;
7142  }
7143 
7144  if (modify) return str;
7145  return Qnil;
7146 }
7147 
7148 
7149 /*
7150  * call-seq:
7151  * str.squeeze([other_str]*) -> new_str
7152  *
7153  * Builds a set of characters from the <i>other_str</i> parameter(s) using the
7154  * procedure described for <code>String#count</code>. Returns a new string
7155  * where runs of the same character that occur in this set are replaced by a
7156  * single character. If no arguments are given, all runs of identical
7157  * characters are replaced by a single character.
7158  *
7159  * "yellow moon".squeeze #=> "yelow mon"
7160  * " now is the".squeeze(" ") #=> " now is the"
7161  * "putters shoot balls".squeeze("m-z") #=> "puters shot balls"
7162  */
7163 
7164 static VALUE
7165 rb_str_squeeze(int argc, VALUE *argv, VALUE str)
7166 {
7167  str = rb_str_dup(str);
7168  rb_str_squeeze_bang(argc, argv, str);
7169  return str;
7170 }
7171 
7172 
7173 /*
7174  * call-seq:
7175  * str.tr_s!(from_str, to_str) -> str or nil
7176  *
7177  * Performs <code>String#tr_s</code> processing on <i>str</i> in place,
7178  * returning <i>str</i>, or <code>nil</code> if no changes were made.
7179  */
7180 
7181 static VALUE
7182 rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
7183 {
7184  return tr_trans(str, src, repl, 1);
7185 }
7186 
7187 
7188 /*
7189  * call-seq:
7190  * str.tr_s(from_str, to_str) -> new_str
7191  *
7192  * Processes a copy of <i>str</i> as described under <code>String#tr</code>,
7193  * then removes duplicate characters in regions that were affected by the
7194  * translation.
7195  *
7196  * "hello".tr_s('l', 'r') #=> "hero"
7197  * "hello".tr_s('el', '*') #=> "h*o"
7198  * "hello".tr_s('el', 'hx') #=> "hhxo"
7199  */
7200 
7201 static VALUE
7202 rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
7203 {
7204  str = rb_str_dup(str);
7205  tr_trans(str, src, repl, 1);
7206  return str;
7207 }
7208 
7209 
7210 /*
7211  * call-seq:
7212  * str.count([other_str]+) -> integer
7213  *
7214  * Each +other_str+ parameter defines a set of characters to count. The
7215  * intersection of these sets defines the characters to count in +str+. Any
7216  * +other_str+ that starts with a caret <code>^</code> is negated. The
7217  * sequence <code>c1-c2</code> means all characters between c1 and c2. The
7218  * backslash character <code>\</code> can be used to escape <code>^</code> or
7219  * <code>-</code> and is otherwise ignored unless it appears at the end of a
7220  * sequence or the end of a +other_str+.
7221  *
7222  * a = "hello world"
7223  * a.count "lo" #=> 5
7224  * a.count "lo", "o" #=> 2
7225  * a.count "hello", "^l" #=> 4
7226  * a.count "ej-m" #=> 4
7227  *
7228  * "hello^world".count "\\^aeiou" #=> 4
7229  * "hello-world".count "a\\-eo" #=> 4
7230  *
7231  * c = "hello world\\r\\n"
7232  * c.count "\\" #=> 2
7233  * c.count "\\A" #=> 0
7234  * c.count "X-\\w" #=> 3
7235  */
7236 
7237 static VALUE
7238 rb_str_count(int argc, VALUE *argv, VALUE str)
7239 {
7240  char table[TR_TABLE_SIZE];
7241  rb_encoding *enc = 0;
7242  VALUE del = 0, nodel = 0, tstr;
7243  char *s, *send;
7244  int i;
7245  int ascompat;
7246 
7248 
7249  tstr = argv[0];
7250  StringValue(tstr);
7251  enc = rb_enc_check(str, tstr);
7252  if (argc == 1) {
7253  const char *ptstr;
7254  if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
7255  (ptstr = RSTRING_PTR(tstr),
7256  ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
7257  !is_broken_string(str)) {
7258  int n = 0;
7259  int clen;
7260  unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
7261 
7262  s = RSTRING_PTR(str);
7263  if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
7264  send = RSTRING_END(str);
7265  while (s < send) {
7266  if (*(unsigned char*)s++ == c) n++;
7267  }
7268  return INT2NUM(n);
7269  }
7270  }
7271 
7272  tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
7273  for (i=1; i<argc; i++) {
7274  tstr = argv[i];
7275  StringValue(tstr);
7276  enc = rb_enc_check(str, tstr);
7277  tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
7278  }
7279 
7280  s = RSTRING_PTR(str);
7281  if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
7282  send = RSTRING_END(str);
7283  ascompat = rb_enc_asciicompat(enc);
7284  i = 0;
7285  while (s < send) {
7286  unsigned int c;
7287 
7288  if (ascompat && (c = *(unsigned char*)s) < 0x80) {
7289  if (table[c]) {
7290  i++;
7291  }
7292  s++;
7293  }
7294  else {
7295  int clen;
7296  c = rb_enc_codepoint_len(s, send, &clen, enc);
7297  if (tr_find(c, table, del, nodel)) {
7298  i++;
7299  }
7300  s += clen;
7301  }
7302  }
7303 
7304  return INT2NUM(i);
7305 }
7306 
7307 static VALUE
7308 rb_fs_check(VALUE val)
7309 {
7310  if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
7311  val = rb_check_string_type(val);
7312  if (NIL_P(val)) return 0;
7313  }
7314  return val;
7315 }
7316 
/*
 * Byte-indexed lookup table: 1 for the six ASCII whitespace bytes
 * (0x09 TAB, 0x0a LF, 0x0b VT, 0x0c FF, 0x0d CR, 0x20 SPACE), 0 for every
 * other value.  One row per 16 byte values.
 */
static const char isspacetable[256] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xa0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xb0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xd0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xe0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xf0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};

/* The cast keeps a negative plain char from indexing before the table. */
#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
7337 
7338 /*
7339  * call-seq:
7340  * str.split(pattern=nil, [limit]) -> an_array
7341  *
7342  * Divides <i>str</i> into substrings based on a delimiter, returning an array
7343  * of these substrings.
7344  *
7345  * If <i>pattern</i> is a <code>String</code>, then its contents are used as
7346  * the delimiter when splitting <i>str</i>. If <i>pattern</i> is a single
7347  * space, <i>str</i> is split on whitespace, with leading whitespace and runs
7348  * of contiguous whitespace characters ignored.
7349  *
7350  * If <i>pattern</i> is a <code>Regexp</code>, <i>str</i> is divided where the
7351  * pattern matches. Whenever the pattern matches a zero-length string,
7352  * <i>str</i> is split into individual characters. If <i>pattern</i> contains
7353  * groups, the respective matches will be returned in the array as well.
7354  *
7355  * If <i>pattern</i> is <code>nil</code>, the value of <code>$;</code> is used.
7356  * If <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is
7357  * split on whitespace as if ' ' were specified.
7358  *
7359  * If the <i>limit</i> parameter is omitted, trailing null fields are
7360  * suppressed. If <i>limit</i> is a positive number, at most that number
7361  * of split substrings will be returned (captured groups will be returned
7362  * as well, but are not counted towards the limit).
7363  * If <i>limit</i> is <code>1</code>, the entire
7364  * string is returned as the only entry in an array. If negative, there is no
7365  * limit to the number of fields returned, and trailing null fields are not
7366  * suppressed.
7367  *
7368  * When the input +str+ is empty an empty Array is returned as the string is
7369  * considered to have no fields to split.
7370  *
7371  * " now's the time".split #=> ["now's", "the", "time"]
7372  * " now's the time".split(' ') #=> ["now's", "the", "time"]
7373  * " now's the time".split(/ /) #=> ["", "now's", "", "the", "time"]
7374  * "1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"]
7375  * "hello".split(//) #=> ["h", "e", "l", "l", "o"]
7376  * "hello".split(//, 3) #=> ["h", "e", "llo"]
7377  * "hi mom".split(%r{\s*}) #=> ["h", "i", "m", "o", "m"]
7378  *
7379  * "mellow yellow".split("ello") #=> ["m", "w y", "w"]
7380  * "1,2,,3,4,,".split(',') #=> ["1", "2", "", "3", "4"]
7381  * "1,2,,3,4,,".split(',', 4) #=> ["1", "2", "", "3,4,,"]
7382  * "1,2,,3,4,,".split(',', -4) #=> ["1", "2", "", "3", "4", "", ""]
7383  *
7384  * "1:2:3".split(/(:)()()/, 2) #=> ["1", ":", "", "", "2:3"]
7385  *
7386  * "".split(',', -1) #=> []
7387  */
7388 
7389 static VALUE
7390 rb_str_split_m(int argc, VALUE *argv, VALUE str)
7391 {
      /*
       * Splits str into an array of substrings.  The pattern selects one of
       * three modes -- "awk" (runs of whitespace), a literal string
       * separator, or a regexp -- each handled by its own loop below.  A
       * shared tail then appends the remainder and, when no limit argument
       * was given, drops trailing empty strings.
       */
7392  rb_encoding *enc;
7393  VALUE spat;
7394  VALUE limit;
7395  enum {awk, string, regexp} split_type;
      /* beg/end: byte offsets delimiting the current field;
       * i: number of fields counted against lim. */
7396  long beg, end, i = 0;
7397  int lim = 0;
7398  VALUE result, tmp;
7399 
      /* Both arguments are optional ("02"); the limit is examined only when
       * two arguments were actually passed. */
7400  if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
7401  lim = NUM2INT(limit);
      /* Non-positive limit: clear `limit` so no cap applies in the loops.
       * lim keeps its value and is consulted again in the tail. */
7402  if (lim <= 0) limit = Qnil;
      /* A limit of 1 returns the whole string as the only element
       * (or an empty array for an empty string). */
7403  else if (lim == 1) {
7404  if (RSTRING_LEN(str) == 0)
7405  return rb_ary_new2(0);
7406  return rb_ary_new3(1, rb_str_dup(str));
7407  }
      /* Start counting at 1: the loops stop once lim <= i, leaving one
       * slot for the remainder that the tail pushes. */
7408  i = 1;
7409  }
7410 
7411  enc = STR_ENC_GET(str);
7412  split_type = regexp;
      /* An explicit pattern is passed through get_pat_quoted.  Otherwise the
       * value of rb_fs is used: nil selects awk mode, and any other value
       * must pass rb_fs_check or a TypeError is raised. */
7413  if (!NIL_P(spat)) {
7414  spat = get_pat_quoted(spat, 0);
7415  }
7416  else if (NIL_P(spat = rb_fs)) {
7417  split_type = awk;
7418  }
7419  else if (!(spat = rb_fs_check(spat))) {
7420  rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
7421  }
7422  if (split_type != awk) {
      /* A String pattern is refined further: empty -> regexp splitting into
       * characters, a lone space -> awk mode, anything else -> literal. */
7423  if (BUILTIN_TYPE(spat) == T_STRING) {
7424  rb_encoding *enc2 = STR_ENC_GET(spat);
7425 
7426  mustnot_broken(spat);
7427  split_type = string;
7428  if (RSTRING_LEN(spat) == 0) {
7429  /* Special case - split into chars */
7430  spat = rb_reg_regcomp(spat);
7431  split_type = regexp;
7432  }
7433  else if (rb_enc_asciicompat(enc2) == 1) {
7434  if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' ') {
7435  split_type = awk;
7436  }
7437  }
7438  else {
      /* Non-ASCII-compatible encoding: the pattern is a lone space when
       * its first character decodes to ' ' and spans the whole string. */
7439  int l;
7440  if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' &&
7441  RSTRING_LEN(spat) == l) {
7442  split_type = awk;
7443  }
7444  }
7445  }
7446  }
7447 
7448  result = rb_ary_new();
7449  beg = 0;
7450  if (split_type == awk) {
7451  char *ptr = RSTRING_PTR(str);
7452  char *eptr = RSTRING_END(str);
7453  char *bptr = ptr;
      /* skip is 1 while whitespace before a field is being consumed.  beg is
       * the field's start offset and end is one past its last non-space
       * byte seen so far. */
7454  int skip = 1;
7455  unsigned int c;
7456 
7457  end = beg;
      /* ASCII-only receiver: scan bytes and classify with isspacetable. */
7458  if (is_ascii_string(str)) {
7459  while (ptr < eptr) {
7460  c = (unsigned char)*ptr++;
7461  if (skip) {
7462  if (ascii_isspace(c)) {
7463  beg = ptr - bptr;
7464  }
7465  else {
7466  end = ptr - bptr;
7467  skip = 0;
      /* Limit reached as a new field starts: stop here so the tail emits
       * everything from beg onward as the final element. */
7468  if (!NIL_P(limit) && lim <= i) break;
7469  }
7470  }
7471  else if (ascii_isspace(c)) {
      /* Whitespace after a field: emit the field and resume skipping. */
7472  rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
7473  skip = 1;
7474  beg = ptr - bptr;
7475  if (!NIL_P(limit)) ++i;
7476  }
7477  else {
7478  end = ptr - bptr;
7479  }
7480  }
7481  }
7482  else {
      /* Same state machine, but decoding one character at a time and
       * classifying it with rb_isspace. */
7483  while (ptr < eptr) {
7484  int n;
7485 
7486  c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
7487  ptr += n;
7488  if (skip) {
7489  if (rb_isspace(c)) {
7490  beg = ptr - bptr;
7491  }
7492  else {
7493  end = ptr - bptr;
7494  skip = 0;
7495  if (!NIL_P(limit) && lim <= i) break;
7496  }
7497  }
7498  else if (rb_isspace(c)) {
7499  rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
7500  skip = 1;
7501  beg = ptr - bptr;
7502  if (!NIL_P(limit)) ++i;
7503  }
7504  else {
7505  end = ptr - bptr;
7506  }
7507  }
7508  }
7509  }
7510  else if (split_type == string) {
      /* ptr: where the next search starts; substr_start: start of the field
       * being accumulated (the two differ after a rejected mid-character hit). */
7511  char *ptr = RSTRING_PTR(str);
7512  char *str_start = ptr;
7513  char *substr_start = ptr;
7514  char *eptr = RSTRING_END(str);
7515  char *sptr = RSTRING_PTR(spat);
7516  long slen = RSTRING_LEN(spat);
7517 
7518  mustnot_broken(str);
7519  enc = rb_enc_check(str, spat);
      /* end receives the hit offset relative to ptr; the loop stops when
       * rb_memsearch reports a negative value. */
7520  while (ptr < eptr &&
7521  (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
7522  /* Check we are at the start of a char */
7523  char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
7524  if (t != ptr + end) {
      /* The byte-level hit is not on a character boundary: resume the
       * search from the pointer rb_enc_right_char_head returned. */
7525  ptr = t;
7526  continue;
7527  }
7528  rb_ary_push(result, rb_str_subseq(str, substr_start - str_start,
7529  (ptr+end) - substr_start));
7530  ptr += end + slen;
7531  substr_start = ptr;
7532  if (!NIL_P(limit) && lim <= ++i) break;
7533  }
      /* Everything from here on is the remainder handled by the tail. */
7534  beg = ptr - str_start;
7535  }
7536  else {
7537  char *ptr = RSTRING_PTR(str);
7538  long len = RSTRING_LEN(str);
7539  long start = beg;
7540  long idx;
      /* last_null is 1 right after an empty match caused `start` to be
       * advanced without emitting anything. */
7541  int last_null = 0;
7542  struct re_registers *regs;
7543 
7544  while ((end = rb_reg_search(spat, str, start, 0)) >= 0) {
7545  regs = RMATCH_REGS(rb_backref_get());
      /* Empty match located exactly at the search start. */
7546  if (start == end && BEG(0) == END(0)) {
7547  if (!ptr) {
      /* The string has no buffer: emit one empty string and stop. */
7548  rb_ary_push(result, str_new_empty(str));
7549  break;
7550  }
7551  else if (last_null == 1) {
      /* Second empty match in a row: emit the single character at beg
       * as a field and continue from start. */
7552  rb_ary_push(result, rb_str_subseq(str, beg,
7553  rb_enc_fast_mbclen(ptr+beg,
7554  ptr+len,
7555  enc)));
7556  beg = start;
7557  }
7558  else {
      /* First empty match: step the search start forward by one character
       * (or past the end) and retry before emitting anything. */
7559  if (start == len)
7560  start++;
7561  else
7562  start += rb_enc_fast_mbclen(ptr+start,ptr+len,enc);
7563  last_null = 1;
7564  continue;
7565  }
7566  }
7567  else {
      /* Ordinary match: emit the text before it, continue after it. */
7568  rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
7569  beg = start = END(0);
7570  }
7571  last_null = 0;
7572 
      /* Append capture groups after the field; groups whose BEG is -1 are
       * skipped and zero-length groups become empty strings. */
7573  for (idx=1; idx < regs->num_regs; idx++) {
7574  if (BEG(idx) == -1) continue;
7575  if (BEG(idx) == END(idx))
7576  tmp = str_new_empty(str);
7577  else
7578  tmp = rb_str_subseq(str, BEG(idx), END(idx)-BEG(idx));
7579  rb_ary_push(result, tmp);
7580  }
7581  if (!NIL_P(limit) && lim <= ++i) break;
7582  }
7583  }
      /* Tail: for a non-empty receiver, push the remainder when a positive
       * limit is in effect, when text is left after beg, or when the limit
       * was negative.  The pushed value is an empty string if beg already
       * sits at the end. */
7584  if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
7585  if (RSTRING_LEN(str) == beg)
7586  tmp = str_new_empty(str);
7587  else
7588  tmp = rb_str_subseq(str, beg, RSTRING_LEN(str)-beg);
7589  rb_ary_push(result, tmp);
7590  }
      /* No limit given (or a limit of 0): strip trailing empty strings. */
7591  if (NIL_P(limit) && lim == 0) {
7592  long len;
7593  while ((len = RARRAY_LEN(result)) > 0 &&
7594  (tmp = RARRAY_AREF(result, len-1), RSTRING_LEN(tmp) == 0))
7595  rb_ary_pop(result);
7596  }
7597 
7598  return result;
7599 }
7600 
7601 VALUE
7602 rb_str_split(VALUE str, const char *sep0)
7603 {
7604  VALUE sep;
7605 
7606  StringValue(str);
7607  sep = rb_str_new_cstr(sep0);
7608  return rb_str_split_m(1, &sep, str);
7609 }
7610 
/*
 * Decides whether an array-returning enumerator method (named by `method`)
 * should collect into an array (1) or yield to the given block (0).
 * Without a block the answer is always 1.  With a block the outcome
 * depends on STRING_ENUMERATORS_WANTARRAY: when set, the block is ignored
 * with a warning; otherwise a deprecation warning is issued and the
 * block is used.
 */
static int
enumerator_wantarray(const char *method)
{
    if (!rb_block_given_p()) {
        return 1;
    }
#if STRING_ENUMERATORS_WANTARRAY
    rb_warn("given block not used");
    return 1;
#else
    rb_warning("passing a block to String#%s is deprecated", method);
    return 0;
#endif
}

/* Evaluates to a new array with capacity `size` when an array is wanted,
 * and to 0 when elements should be yielded instead. */
#define WANTARRAY(m, size) \
    (enumerator_wantarray(m) ? rb_ary_new_capa(size) : 0)
7627 
7628 static inline int
7629 enumerator_element(VALUE ary, VALUE e)
7630 {
7631  if (ary) {
7632  rb_ary_push(ary, e);
7633  return 0;
7634  }
7635  else {
7636  rb_yield(e);
7637  return 1;
7638  }
7639 }
7640 
7641 #define ENUM_ELEM(ary, e) enumerator_element(ary, e)
7642 
7643 static const char *
7644 chomp_newline(const char *p, const char *e, rb_encoding *enc)
7645 {
7646  const char *prev = rb_enc_prev_char(p, e, e, enc);
7647  if (rb_enc_is_newline(prev, e, enc)) {
7648  e = prev;
7649  prev = rb_enc_prev_char(p, e, e, enc);
7650  if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
7651  e = prev;
7652  }
7653  return e;
7654 }
7655 
7656 static VALUE
7657 rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
7658 {
      /*
       * Shared worker for each_line and lines.  When ary is non-zero every
       * line is appended to it and ary is returned; when ary is 0 every line
       * is yielded and the original receiver is returned.  Accepts an
       * optional separator and an optional keyword hash (chomp).
       */
7659  rb_encoding *enc;
7660  VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
      /* ptr/pend: bounds of the whole string; subptr/subend: bounds of the
       * line being assembled; hit: position just past a found separator. */
7661  const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
7662  long pos, len, rslen;
      /* rsnewline: the separator is a single minimum-length newline character */
7663  int rsnewline = 0;
7664 
      /* "01:" = one optional positional argument plus optional keywords.
       * With no positional argument the separator defaults to rb_rs. */
7665  if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
7666  rs = rb_rs;
7667  if (!NIL_P(opts)) {
7668  static ID keywords[1];
7669  if (!keywords[0]) {
7670  keywords[0] = rb_intern_const("chomp");
7671  }
7672  rb_get_kwargs(opts, keywords, 0, 1, &chomp);
      /* From here on chomp is a plain 0/1 flag, not a Ruby value. */
7673  chomp = (chomp != Qundef && RTEST(chomp));
7674  }
7675 
      /* nil separator: the whole string is the single "line". */
7676  if (NIL_P(rs)) {
7677  if (!ENUM_ELEM(ary, str)) {
7678  return ary;
7679  }
7680  else {
7681  return orig;
7682  }
7683  }
7684 
      /* An empty string produces no lines at all. */
7685  if (!RSTRING_LEN(str)) goto end;
      /* NOTE(review): presumably a frozen snapshot so that a yielded block
       * cannot disturb ptr/pend -- confirm against rb_str_new_frozen. */
7686  str = rb_str_new_frozen(str);
7687  ptr = subptr = RSTRING_PTR(str);
7688  pend = RSTRING_END(str);
7689  len = RSTRING_LEN(str);
7690  StringValue(rs);
7691  rslen = RSTRING_LEN(rs);
7692 
      /* The default separator is not checked against str's encoding; any
       * other separator goes through rb_enc_check. */
7693  if (rs == rb_default_rs)
7694  enc = rb_enc_get(str);
7695  else
7696  enc = rb_enc_check(str, rs);
7697 
7698  if (rslen == 0) {
7699  /* paragraph mode */
7700  int n;
      /* eol: position just past the most recent newline seen inside the
       * current paragraph.  A newline that starts exactly there is an empty
       * line, which ends the paragraph. */
7701  const char *eol = NULL;
7702  subend = subptr;
7703  while (subend < pend) {
7704  do {
      /* n is the byte length of a leading '\r', or 0 when there is none;
       * rslen then spans that optional '\r' plus the following character. */
7705  if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
7706  n = 0;
7707  rslen = n + rb_enc_mbclen(subend + n, pend, enc);
7708  if (rb_enc_is_newline(subend + n, pend, enc)) {
7709  if (eol == subend) break;
7710  subend += rslen;
      /* Newlines before any paragraph text (subptr is NULL) are consumed
       * without being recorded in eol. */
7711  if (subptr) eol = subend;
7712  }
7713  else {
7714  if (!subptr) subptr = subend;
7715  subend += rslen;
7716  }
7717  rslen = 0;
7718  } while (subend < pend);
7719  if (!subptr) break;
      /* rslen is non-zero only when the inner loop broke on the second
       * newline; that newline is included unless chomp is set. */
7720  line = rb_str_subseq(str, subptr - ptr,
7721  subend - subptr + (chomp ? 0 : rslen));
      /* ENUM_ELEM returns 1 only when it yielded, so the modification
       * check runs just after user code has executed. */
7722  if (ENUM_ELEM(ary, line)) {
7723  str_mod_check(str, ptr, len);
7724  }
7725  subptr = eol = NULL;
7726  }
7727  goto end;
7728  }
7729  else {
7730  rsptr = RSTRING_PTR(rs);
7731  if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
7732  rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
7733  rsnewline = 1;
7734  }
7735  }
7736 
      /* The default separator is re-encoded into str's encoding when that
       * encoding is not ASCII-compatible, so the byte search can match. */
7737  if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
7738  rs = rb_str_new(rsptr, rslen);
7739  rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
7740  rsptr = RSTRING_PTR(rs);
7741  rslen = RSTRING_LEN(rs);
7742  }
7743 
7744  while (subptr < pend) {
7745  pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
7746  if (pos < 0) break;
7747  hit = subptr + pos;
      /* A byte-level hit that is not on a character boundary is rejected;
       * the search resumes from the adjusted pointer. */
7748  adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
7749  if (hit != adjusted) {
7750  subptr = adjusted;
7751  continue;
7752  }
      /* hit now points just past the separator; by default the line
       * includes it. */
7753  subend = hit += rslen;
7754  if (chomp) {
7755  if (rsnewline) {
      /* Newline separator: chomp_newline also removes a preceding '\r'. */
7756  subend = chomp_newline(subptr, subend, enc);
7757  }
7758  else {
7759  subend -= rslen;
7760  }
7761  }
7762  line = rb_str_subseq(str, subptr - ptr, subend - subptr);
7763  if (ENUM_ELEM(ary, line)) {
7764  str_mod_check(str, ptr, len);
7765  }
7766  subptr = hit;
7767  }
7768 
      /* Trailing text without a final separator forms the last line. */
7769  if (subptr != pend) {
7770  if (chomp) {
7771  pend = chomp_newline(subptr, pend, enc);
7772  }
7773  line = rb_str_subseq(str, subptr - ptr, pend - subptr);
7774  ENUM_ELEM(ary, line);
7775  RB_GC_GUARD(str);
7776  }
7777 
7778  end:
7779  if (ary)
7780  return ary;
7781  else
7782  return orig;
7783 }
7784 
7785 /*
7786  * call-seq:
7787  * str.each_line(separator=$/ [, getline_args]) {|substr| block } -> str
7788  * str.each_line(separator=$/ [, getline_args]) -> an_enumerator
7789  *
7790  * Splits <i>str</i> using the supplied parameter as the record
7791  * separator (<code>$/</code> by default), passing each substring in
7792  * turn to the supplied block. If a zero-length record separator is
7793  * supplied, the string is split into paragraphs delimited by
7794  * multiple successive newlines.
7795  *
7796  * See <code>IO.readlines</code> for detail about getline_args.
7797  *
7798  * If no block is given, an enumerator is returned instead.
7799  *
7800  * print "Example one\n"
7801  * "hello\nworld".each_line {|s| p s}
7802  * print "Example two\n"
7803  * "hello\nworld".each_line('l') {|s| p s}
7804  * print "Example three\n"
7805  * "hello\n\n\nworld".each_line('') {|s| p s}
7806  *
7807  * <em>produces:</em>
7808  *
7809  * Example one
7810  * "hello\n"
7811  * "world"
7812  * Example two
7813  * "hel"
7814  * "l"
7815  * "o\nworl"
7816  * "d"
7817  * Example three
7818  * "hello\n\n"
7819  * "world"
7820  */
7821 
7822 static VALUE
7823 rb_str_each_line(int argc, VALUE *argv, VALUE str)
7824 {
7825  RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
7826  return rb_str_enumerate_lines(argc, argv, str, 0);
7827 }
7828 
7829 /*
7830  * call-seq:
7831  * str.lines(separator=$/ [, getline_args]) -> an_array
7832  *
7833  * Returns an array of lines in <i>str</i> split using the supplied
7834  * record separator (<code>$/</code> by default). This is a
7835  * shorthand for <code>str.each_line(separator).to_a</code>.
7836  *
7837  * If a block is given, which is a deprecated form, works the same as
7838  * <code>each_line</code>.
7839  */
7840 
7841 static VALUE
7842 rb_str_lines(int argc, VALUE *argv, VALUE str)
7843 {
7844  VALUE ary = WANTARRAY("lines", 0);
7845  return rb_str_enumerate_lines(argc, argv, str, ary);
7846 }
7847 
7848 static VALUE
7849 rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
7850 {
7851  return LONG2FIX(RSTRING_LEN(str));
7852 }
7853 
7854 static VALUE
7855 rb_str_enumerate_bytes(VALUE str, VALUE ary)
7856 {
7857  long i;
7858 
7859  for (i=0; i<RSTRING_LEN(str); i++) {
7860  ENUM_ELEM(ary, INT2FIX(RSTRING_PTR(str)[i] & 0xff));
7861  }
7862  if (ary)
7863  return ary;
7864  else
7865  return str;
7866 }
7867 
7868 /*
7869  * call-seq:
7870  * str.each_byte {|integer| block } -> str
7871  * str.each_byte -> an_enumerator
7872  *
7873  * Passes each byte in <i>str</i> to the given block, or returns an
7874  * enumerator if no block is given.
7875  *
7876  * "hello".each_byte {|c| print c, ' ' }
7877  *
7878  * <em>produces:</em>
7879  *
7880  * 104 101 108 108 111
7881  */
7882 
7883 static VALUE
7884 rb_str_each_byte(VALUE str)
7885 {
7886  RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
7887  return rb_str_enumerate_bytes(str, 0);
7888 }
7889 
7890 /*
7891  * call-seq:
7892  * str.bytes -> an_array
7893  *
7894  * Returns an array of bytes in <i>str</i>. This is a shorthand for
7895  * <code>str.each_byte.to_a</code>.
7896  *
7897  * If a block is given, which is a deprecated form, works the same as
7898  * <code>each_byte</code>.
7899  */
7900 
7901 static VALUE
7902 rb_str_bytes(VALUE str)
7903 {
7904  VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
7905  return rb_str_enumerate_bytes(str, ary);
7906 }
7907 
7908 static VALUE
7909 rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
7910 {
7911  return rb_str_length(str);
7912 }
7913 
7914 static VALUE
7915 rb_str_enumerate_chars(VALUE str, VALUE ary)
7916 {
7917  VALUE orig = str;
7918  long i, len, n;
7919  const char *ptr;
7920  rb_encoding *enc;
7921 
7922  str = rb_str_new_frozen(str);
7923  ptr = RSTRING_PTR(str);
7924  len = RSTRING_LEN(str);
7925  enc = rb_enc_get(str);
7926 
7928  for (i = 0; i < len; i += n) {
7929  n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
7930  ENUM_ELEM(ary, rb_str_subseq(str, i, n));
7931  }
7932  }
7933  else {
7934  for (i = 0; i < len; i += n) {
7935  n = rb_enc_mbclen(ptr + i, ptr + len, enc);
7936  ENUM_ELEM(ary, rb_str_subseq(str, i, n));
7937  }
7938  }
7939  RB_GC_GUARD(str);
7940  if (ary)
7941  return ary;
7942  else
7943  return orig;
7944 }
7945 
7946 /*
7947  * call-seq:
7948  * str.each_char {|cstr| block } -> str
7949  * str.each_char -> an_enumerator
7950  *
7951  * Passes each character in <i>str</i> to the given block, or returns
7952  * an enumerator if no block is given.
7953  *
7954  * "hello".each_char {|c| print c, ' ' }
7955  *
7956  * <em>produces:</em>
7957  *
7958  * h e l l o
7959  */
7960 
7961 static VALUE
7962 rb_str_each_char(VALUE str)
7963 {
7964  RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
7965  return rb_str_enumerate_chars(str, 0);
7966 }
7967 
7968 /*
7969  * call-seq:
7970  * str.chars -> an_array
7971  *
7972  * Returns an array of characters in <i>str</i>. This is a shorthand
7973  * for <code>str.each_char.to_a</code>.
7974  *
7975  * If a block is given, which is a deprecated form, works the same as
7976  * <code>each_char</code>.
7977  */
7978 
7979 static VALUE
7980 rb_str_chars(VALUE str)
7981 {
7982  VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
7983  return rb_str_enumerate_chars(str, ary);
7984 }
7985 
7986 static VALUE
7987 rb_str_enumerate_codepoints(VALUE str, VALUE ary)
7988 {
7989  VALUE orig = str;
7990  int n;
7991  unsigned int c;
7992  const char *ptr, *end;
7993  rb_encoding *enc;
7994 
7995  if (single_byte_optimizable(str))
7996  return rb_str_enumerate_bytes(str, ary);
7997 
7998  str = rb_str_new_frozen(str);
7999  ptr = RSTRING_PTR(str);
8000  end = RSTRING_END(str);
8001  enc = STR_ENC_GET(str);
8002 
8003  while (ptr < end) {
8004  c = rb_enc_codepoint_len(ptr, end, &n, enc);
8005  ENUM_ELEM(ary, UINT2NUM(c));
8006  ptr += n;
8007  }
8008  RB_GC_GUARD(str);
8009  if (ary)
8010  return ary;
8011  else
8012  return orig;
8013 }
8014 
8015 /*
8016  * call-seq:
8017  * str.each_codepoint {|integer| block } -> str
8018  * str.each_codepoint -> an_enumerator
8019  *
8020  * Passes the <code>Integer</code> ordinal of each character in <i>str</i>
8021  * (also known as a <i>codepoint</i> when applied to Unicode strings) to the
8022  * given block. For encodings other than UTF-8/UTF-16(BE|LE)/UTF-32(BE|LE),
8023  * values are directly derived from the binary representation
8024  * of each character.
8025  *
8026  * If no block is given, an enumerator is returned instead.
8027  *
8028  * "hello\u0639".each_codepoint {|c| print c, ' ' }
8029  *
8030  * <em>produces:</em>
8031  *
8032  * 104 101 108 108 111 1593
8033  */
8034 
8035 static VALUE
8036 rb_str_each_codepoint(VALUE str)
8037 {
8038  RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
8039  return rb_str_enumerate_codepoints(str, 0);
8040 }
8041 
8042 /*
8043  * call-seq:
8044  * str.codepoints -> an_array
8045  *
8046  * Returns an array of the <code>Integer</code> ordinals of the
8047  * characters in <i>str</i>. This is a shorthand for
8048  * <code>str.each_codepoint.to_a</code>.
8049  *
8050  * If a block is given, which is a deprecated form, works the same as
8051  * <code>each_codepoint</code>.
8052  */
8053 
8054 static VALUE
8055 rb_str_codepoints(VALUE str)
8056 {
8057  VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
8058  return rb_str_enumerate_codepoints(str, ary);
8059 }
8060 
8061 static VALUE
8062 rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
8063 {
8064  VALUE orig = str;
8065  regex_t *reg_grapheme_cluster = NULL;
8066  static regex_t *reg_grapheme_cluster_utf8 = NULL;
8067  int encidx = ENCODING_GET(str);
8068  rb_encoding *enc = rb_enc_from_index(encidx);
8069  int unicode_p = rb_enc_unicode_p(enc);
8070  const char *ptr, *end;
8071 
8072  if (!unicode_p || single_byte_optimizable(str)) {
8073  return rb_str_enumerate_chars(str, ary);
8074  }
8075 
8076  /* synchronize */
8077  if (encidx == rb_utf8_encindex() && reg_grapheme_cluster_utf8) {
8078  reg_grapheme_cluster = reg_grapheme_cluster_utf8;
8079  }
8080  if (!reg_grapheme_cluster) {
8081  const OnigUChar source[] = "\\X";
8082  int r = onig_new(&reg_grapheme_cluster, source, source + sizeof(source) - 1,
8084  if (r) {
8085  rb_bug("cannot compile grapheme cluster regexp");
8086  }
8087  if (encidx == rb_utf8_encindex()) {
8088  reg_grapheme_cluster_utf8 = reg_grapheme_cluster;
8089  }
8090  }
8091 
8092  if (!ary) str = rb_str_new_frozen(str);
8093  ptr = RSTRING_PTR(str);
8094  end = RSTRING_END(str);
8095 
8096  while (ptr < end) {
8097  OnigPosition len = onig_match(reg_grapheme_cluster,
8098  (const OnigUChar *)ptr, (const OnigUChar *)end,
8099  (const OnigUChar *)ptr, NULL, 0);
8100  if (len == 0) break;
8101  if (len < 0) {
8102  break;
8103  }
8104  ENUM_ELEM(ary, rb_enc_str_new(ptr, len, enc));
8105  ptr += len;
8106  }
8107  RB_GC_GUARD(str);
8108  if (ary)
8109  return ary;
8110  else
8111  return orig;
8112 }
8113 
8114 /*
8115  * call-seq:
8116  * str.each_grapheme_cluster {|cstr| block } -> str
8117  * str.each_grapheme_cluster -> an_enumerator
8118  *
8119  * Passes each grapheme cluster in <i>str</i> to the given block, or returns
8120  * an enumerator if no block is given.
8121  * Unlike String#each_char, this enumerates by grapheme clusters defined by
8122  * Unicode Standard Annex #29 http://unicode.org/reports/tr29/
8123  *
8124  * "a\u0300".each_char.to_a.size #=> 2
8125  * "a\u0300".each_grapheme_cluster.to_a.size #=> 1
8126  *
8127  */
8128 
8129 static VALUE
8130 rb_str_each_grapheme_cluster(VALUE str)
8131 {
8132  RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
8133  return rb_str_enumerate_grapheme_clusters(str, 0);
8134 }
8135 
8136 /*
8137  * call-seq:
8138  * str.grapheme_clusters -> an_array
8139  *
8140  * Returns an array of grapheme clusters in <i>str</i>. This is a shorthand
8141  * for <code>str.each_grapheme_cluster.to_a</code>.
8142  *
8143  * If a block is given, which is a deprecated form, works the same as
8144  * <code>each_grapheme_cluster</code>.
8145  */
8146 
8147 static VALUE
8148 rb_str_grapheme_clusters(VALUE str)
8149 {
8150  VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
8151  return rb_str_enumerate_grapheme_clusters(str, ary);
8152 }
8153 
8154 static long
8155 chopped_length(VALUE str)
8156 {
8157  rb_encoding *enc = STR_ENC_GET(str);
8158  const char *p, *p2, *beg, *end;
8159 
8160  beg = RSTRING_PTR(str);
8161  end = beg + RSTRING_LEN(str);
8162  if (beg > end) return 0;
8163  p = rb_enc_prev_char(beg, end, end, enc);
8164  if (!p) return 0;
8165  if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
8166  p2 = rb_enc_prev_char(beg, p, end, enc);
8167  if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
8168  }
8169  return p - beg;
8170 }
8171 
8172 /*
8173  * call-seq:
8174  * str.chop! -> str or nil
8175  *
8176  * Processes <i>str</i> as for <code>String#chop</code>, returning <i>str</i>,
8177  * or <code>nil</code> if <i>str</i> is the empty string. See also
8178  * <code>String#chomp!</code>.
8179  */
8180 
8181 static VALUE
8182 rb_str_chop_bang(VALUE str)
8183 {
8184  str_modify_keep_cr(str);
8185  if (RSTRING_LEN(str) > 0) {
8186  long len;
8187  len = chopped_length(str);
8188  STR_SET_LEN(str, len);
8189  TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
8190  if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
8191  ENC_CODERANGE_CLEAR(str);
8192  }
8193  return str;
8194  }
8195  return Qnil;
8196 }
8197 
8198 
8199 /*
8200  * call-seq:
8201  * str.chop -> new_str
8202  *
8203  * Returns a new <code>String</code> with the last character removed. If the
8204  * string ends with <code>\r\n</code>, both characters are removed. Applying
8205  * <code>chop</code> to an empty string returns an empty
8206  * string. <code>String#chomp</code> is often a safer alternative, as it leaves
8207  * the string unchanged if it doesn't end in a record separator.
8208  *
8209  * "string\r\n".chop #=> "string"
8210  * "string\n\r".chop #=> "string\n"
8211  * "string\n".chop #=> "string"
8212  * "string".chop #=> "strin"
8213  * "x".chop.chop #=> ""
8214  */
8215 
8216 static VALUE
8217 rb_str_chop(VALUE str)
8218 {
8219  return rb_str_subseq(str, 0, chopped_length(str));
8220 }
8221 
8222 
8223 static long
8224 chompped_length(VALUE str, VALUE rs)
8225 {
      /*
       * Computes the byte length str would have after chomping the record
       * separator rs.  str itself is not modified here.  Three cases are
       * handled: the default separator ("smart chomp"), an empty separator
       * (strip all trailing newlines), and an arbitrary separator string.
       */
8226  rb_encoding *enc;
8227  int newline;
      /* p: start of str (fixed); e: candidate end, moved backwards;
       * pp: scratch pointer to a character head near the end. */
8228  char *pp, *e, *rsptr;
8229  long rslen;
8230  char *const p = RSTRING_PTR(str);
8231  long len = RSTRING_LEN(str);
8232 
8233  if (len == 0) return 0;
8234  e = p + len;
8235  if (rs == rb_default_rs) {
      /* Also entered by goto from below when rs is a single newline character. */
8236  smart_chomp:
8237  enc = rb_enc_get(str);
8238  if (rb_enc_mbminlen(enc) > 1) {
      /* Encodings whose minimum character length exceeds one byte: drop a
       * trailing newline character if present, then, independently, a
       * trailing '\r' character. */
8239  pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
8240  if (rb_enc_is_newline(pp, e, enc)) {
8241  e = pp;
8242  }
8243  pp = e - rb_enc_mbminlen(enc);
8244  if (pp >= p) {
8245  pp = rb_enc_left_char_head(p, pp, e, enc);
8246  if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
8247  e = pp;
8248  }
8249  }
8250  }
8251  else {
      /* Byte-wise case: removes a trailing "\n", "\r\n" or lone "\r". */
8252  switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
8253  case '\n':
8254  if (--e > p && *(e-1) == '\r') {
8255  --e;
8256  }
8257  break;
8258  case '\r':
8259  --e;
8260  break;
8261  }
8262  }
8263  return e - p;
8264  }
8265 
8266  enc = rb_enc_get(str);
8267  RSTRING_GETMEM(rs, rsptr, rslen);
8268  if (rslen == 0) {
      /* Empty separator: strip every trailing newline, each together with
       * a '\r' directly before it. */
8269  if (rb_enc_mbminlen(enc) > 1) {
8270  while (e > p) {
8271  pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
8272  if (!rb_enc_is_newline(pp, e, enc)) break;
8273  e = pp;
8274  pp -= rb_enc_mbminlen(enc);
8275  if (pp >= p) {
8276  pp = rb_enc_left_char_head(p, pp, e, enc);
8277  if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
8278  e = pp;
8279  }
8280  }
8281  }
8282  }
8283  else {
8284  while (e > p && *(e-1) == '\n') {
8285  --e;
8286  if (e > p && *(e-1) == '\r')
8287  --e;
8288  }
8289  }
8290  return e - p;
8291  }
      /* A separator longer than the string cannot be a suffix of it. */
8292  if (rslen > len) return len;
8293 
      /* From here enc refers to the separator's encoding; newline holds the
       * separator's last byte. */
8294  enc = rb_enc_get(rs);
8295  newline = rsptr[rslen-1];
      /* A separator that is exactly one minimum-length character and is a
       * newline ("\n" in the one-byte case) gets the smart-chomp treatment. */
8296  if (rslen == rb_enc_mbminlen(enc)) {
8297  if (rslen == 1) {
8298  if (newline == '\n')
8299  goto smart_chomp;
8300  }
8301  else {
8302  if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
8303  goto smart_chomp;
8304  }
8305  }
8306 
8307  enc = rb_enc_check(str, rs);
      /* A broken separator never matches: nothing is chomped. */
8308  if (is_broken_string(rs)) {
8309  return len;
8310  }
8311  pp = e - rslen;
      /* Compare the last byte first as a cheap filter; memcmp runs only for
       * separators longer than one byte. */
8312  if (p[len-1] == newline &&
8313  (rslen <= 1 ||
8314  memcmp(rsptr, pp, rslen) == 0)) {
      /* The suffix counts only if it begins on a character boundary. */
8315  if (rb_enc_left_char_head(p, pp, e, enc) == pp)
8316  return len - rslen;
8317  RB_GC_GUARD(rs);
8318  }
8319  return len;
8320 }
8321 
8327 static VALUE
8328 chomp_rs(int argc, const VALUE *argv)
8329 {
8330  rb_check_arity(argc, 0, 1);
8331  if (argc > 0) {
8332  VALUE rs = argv[0];
8333  if (!NIL_P(rs)) StringValue(rs);
8334  return rs;
8335  }
8336  else {
8337  return rb_rs;
8338  }
8339 }
8340 
8341 VALUE
8343 {
8344  long olen = RSTRING_LEN(str);
8345  long len = chompped_length(str, rs);
8346  if (len >= olen) return Qnil;
8347  str_modify_keep_cr(str);
8348  STR_SET_LEN(str, len);
8349  TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
8350  if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
8351  ENC_CODERANGE_CLEAR(str);
8352  }
8353  return str;
8354 }
8355 
8356 /*
8357  * call-seq:
8358  * str.chomp!(separator=$/) -> str or nil
8359  *
8360  * Modifies <i>str</i> in place as described for <code>String#chomp</code>,
8361  * returning <i>str</i>, or <code>nil</code> if no modifications were made.
8362  */
8363 
8364 static VALUE
8365 rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
8366 {
8367  VALUE rs;
8368  str_modifiable(str);
8369  if (RSTRING_LEN(str) == 0) return Qnil;
8370  rs = chomp_rs(argc, argv);
8371  if (NIL_P(rs)) return Qnil;
8372  return rb_str_chomp_string(str, rs);
8373 }
8374 
8375 
8376 /*
8377  * call-seq:
8378  * str.chomp(separator=$/) -> new_str
8379  *
8380  * Returns a new <code>String</code> with the given record separator removed
8381  * from the end of <i>str</i> (if present). If <code>$/</code> has not been
8382  * changed from the default Ruby record separator, then <code>chomp</code> also
8383  * removes carriage return characters (that is it will remove <code>\n</code>,
8384  * <code>\r</code>, and <code>\r\n</code>). If <code>$/</code> is an empty string,
8385  * it will remove all trailing newlines from the string.
8386  *
8387  * "hello".chomp #=> "hello"
8388  * "hello\n".chomp #=> "hello"
8389  * "hello\r\n".chomp #=> "hello"
8390  * "hello\n\r".chomp #=> "hello\n"
8391  * "hello\r".chomp #=> "hello"
8392  * "hello \n there".chomp #=> "hello \n there"
8393  * "hello".chomp("llo") #=> "he"
8394  * "hello\r\n\r\n".chomp('') #=> "hello"
8395  * "hello\r\n\r\r\n".chomp('') #=> "hello\r\n\r"
8396  */
8397 
8398 static VALUE
8399 rb_str_chomp(int argc, VALUE *argv, VALUE str)
8400 {
8401  VALUE rs = chomp_rs(argc, argv);
8402  if (NIL_P(rs)) return rb_str_dup(str);
8403  return rb_str_subseq(str, 0, chompped_length(str, rs));
8404 }
8405 
8406 static long
8407 lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
8408 {
8409  const char *const start = s;
8410 
8411  if (!s || s >= e) return 0;
8412 
8413  /* remove spaces at head */
8414  if (single_byte_optimizable(str)) {
8415  while (s < e && ascii_isspace(*s)) s++;
8416  }
8417  else {
8418  while (s < e) {
8419  int n;
8420  unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
8421 
8422  if (!rb_isspace(cc)) break;
8423  s += n;
8424  }
8425  }
8426  return s - start;
8427 }
8428 
8429 /*
8430  * call-seq:
8431  * str.lstrip! -> self or nil
8432  *
8433  * Removes leading whitespace from <i>str</i>, returning <code>nil</code> if no
8434  * change was made. See also <code>String#rstrip!</code> and
8435  * <code>String#strip!</code>.
8436  *
8437  * Refer to <code>strip</code> for the definition of whitespace.
8438  *
8439  * " hello ".lstrip! #=> "hello "
8440  * "hello ".lstrip! #=> nil
8441  * "hello".lstrip! #=> nil
8442  */
8443 
8444 static VALUE
8445 rb_str_lstrip_bang(VALUE str)
8446 {
8447  rb_encoding *enc;
8448  char *start, *s;
8449  long olen, loffset;
8450 
8451  str_modify_keep_cr(str);
8452  enc = STR_ENC_GET(str);
8453  RSTRING_GETMEM(str, start, olen);
8454  loffset = lstrip_offset(str, start, start+olen, enc);
8455  if (loffset > 0) {
8456  long len = olen-loffset;
8457  s = start + loffset;
8458  memmove(start, s, len);
8459  STR_SET_LEN(str, len);
8460 #if !SHARABLE_MIDDLE_SUBSTRING
8461  TERM_FILL(start+len, rb_enc_mbminlen(enc));
8462 #endif
8463  return str;
8464  }
8465  return Qnil;
8466 }
8467 
8468 
8469 /*
8470  * call-seq:
8471  * str.lstrip -> new_str
8472  *
8473  * Returns a copy of <i>str</i> with leading whitespace removed. See also
8474  * <code>String#rstrip</code> and <code>String#strip</code>.
8475  *
8476  * Refer to <code>strip</code> for the definition of whitespace.
8477  *
8478  * " hello ".lstrip #=> "hello "
8479  * "hello".lstrip #=> "hello"
8480  */
8481 
8482 static VALUE
8483 rb_str_lstrip(VALUE str)
8484 {
8485  char *start;
8486  long len, loffset;
8487  RSTRING_GETMEM(str, start, len);
8488  loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
8489  if (loffset <= 0) return rb_str_dup(str);
8490  return rb_str_subseq(str, loffset, len - loffset);
8491 }
8492 
8493 static long
8494 rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
8495 {
8496  const char *t;
8497 
8498  rb_str_check_dummy_enc(enc);
8499  if (!s || s >= e) return 0;
8500  t = e;
8501 
8502  /* remove trailing spaces or '\0's */
8503  if (single_byte_optimizable(str)) {
8504  unsigned char c;
8505  while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
8506  }
8507  else {
8508  char *tp;
8509 
8510  while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
8511  unsigned int c = rb_enc_codepoint(tp, e, enc);
8512  if (c && !rb_isspace(c)) break;
8513  t = tp;
8514  }
8515  }
8516  return e - t;
8517 }
8518 
8519 /*
8520  * call-seq:
8521  * str.rstrip! -> self or nil
8522  *
8523  * Removes trailing whitespace from <i>str</i>, returning <code>nil</code> if
8524  * no change was made. See also <code>String#lstrip!</code> and
8525  * <code>String#strip!</code>.
8526  *
8527  * Refer to <code>strip</code> for the definition of whitespace.
8528  *
8529  * " hello ".rstrip! #=> " hello"
8530  * " hello".rstrip! #=> nil
8531  * "hello".rstrip! #=> nil
8532  */
8533 
8534 static VALUE
8535 rb_str_rstrip_bang(VALUE str)
8536 {
8537  rb_encoding *enc;
8538  char *start;
8539  long olen, roffset;
8540 
8541  str_modify_keep_cr(str);
8542  enc = STR_ENC_GET(str);
8543  RSTRING_GETMEM(str, start, olen);
8544  roffset = rstrip_offset(str, start, start+olen, enc);
8545  if (roffset > 0) {
8546  long len = olen - roffset;
8547 
8548  STR_SET_LEN(str, len);
8549 #if !SHARABLE_MIDDLE_SUBSTRING
8550  TERM_FILL(start+len, rb_enc_mbminlen(enc));
8551 #endif
8552  return str;
8553  }
8554  return Qnil;
8555 }
8556 
8557 
8558 /*
8559  * call-seq:
8560  * str.rstrip -> new_str
8561  *
8562  * Returns a copy of <i>str</i> with trailing whitespace removed. See also
8563  * <code>String#lstrip</code> and <code>String#strip</code>.
8564  *
8565  * Refer to <code>strip</code> for the definition of whitespace.
8566  *
8567  * " hello ".rstrip #=> " hello"
8568  * "hello".rstrip #=> "hello"
8569  */
8570 
8571 static VALUE
8572 rb_str_rstrip(VALUE str)
8573 {
8574  rb_encoding *enc;
8575  char *start;
8576  long olen, roffset;
8577 
8578  enc = STR_ENC_GET(str);
8579  RSTRING_GETMEM(str, start, olen);
8580  roffset = rstrip_offset(str, start, start+olen, enc);
8581 
8582  if (roffset <= 0) return rb_str_dup(str);
8583  return rb_str_subseq(str, 0, olen-roffset);
8584 }
8585 
8586 
8587 /*
8588  * call-seq:
8589  * str.strip! -> str or nil
8590  *
8591  * Removes leading and trailing whitespace from <i>str</i>. Returns
8592  * <code>nil</code> if <i>str</i> was not altered.
8593  *
8594  * Refer to <code>strip</code> for the definition of whitespace.
8595  */
8596 
8597 static VALUE
8598 rb_str_strip_bang(VALUE str)
8599 {
8600  char *start;
8601  long olen, loffset, roffset;
8602  rb_encoding *enc;
8603 
8604  str_modify_keep_cr(str);
8605  enc = STR_ENC_GET(str);
8606  RSTRING_GETMEM(str, start, olen);
8607  loffset = lstrip_offset(str, start, start+olen, enc);
8608  roffset = rstrip_offset(str, start+loffset, start+olen, enc);
8609 
8610  if (loffset > 0 || roffset > 0) {
8611  long len = olen-roffset;
8612  if (loffset > 0) {
8613  len -= loffset;
8614  memmove(start, start + loffset, len);
8615  }
8616  STR_SET_LEN(str, len);
8617 #if !SHARABLE_MIDDLE_SUBSTRING
8618  TERM_FILL(start+len, rb_enc_mbminlen(enc));
8619 #endif
8620  return str;
8621  }
8622  return Qnil;
8623 }
8624 
8625 
8626 /*
8627  * call-seq:
8628  * str.strip -> new_str
8629  *
8630  * Returns a copy of <i>str</i> with leading and trailing whitespace removed.
8631  *
8632  * Whitespace is defined as any of the following characters:
8633  * null, horizontal tab, line feed, vertical tab, form feed, carriage return, space.
8634  *
8635  * " hello ".strip #=> "hello"
8636  * "\tgoodbye\r\n".strip #=> "goodbye"
8637  * "\x00\t\n\v\f\r ".strip #=> ""
8638  */
8639 
8640 static VALUE
8641 rb_str_strip(VALUE str)
8642 {
8643  char *start;
8644  long olen, loffset, roffset;
8645  rb_encoding *enc = STR_ENC_GET(str);
8646 
8647  RSTRING_GETMEM(str, start, olen);
8648  loffset = lstrip_offset(str, start, start+olen, enc);
8649  roffset = rstrip_offset(str, start+loffset, start+olen, enc);
8650 
8651  if (loffset <= 0 && roffset <= 0) return rb_str_dup(str);
8652  return rb_str_subseq(str, loffset, olen-loffset-roffset);
8653 }
8654 
/*
 * Performs one search for +pat+ in +str+ beginning at byte offset *start.
 *
 * On a match, *start is advanced for the next search and the function
 * returns either the matched substring (string pattern, or a regexp whose
 * registers hold only the whole match) or an array with one entry per
 * group (Qnil for groups whose BEG is negative).  Results are infected
 * from +pat+.  Returns Qnil when rb_pat_search() reports no match.
 *
 * +set_backref_str+ is forwarded unchanged to rb_pat_search().
 * The BEG()/END() macros used below expand to accesses on the local +regs+.
 */
8655 static VALUE
8656 scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
8657 {
8658  VALUE result, match;
8659  struct re_registers *regs;
8660  int i;
8661  long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
8662  if (pos >= 0) {
8663  if (BUILTIN_TYPE(pat) == T_STRING) {
/* String pattern: no registers, the match is exactly as long as pat. */
8664  regs = NULL;
8665  end = pos + RSTRING_LEN(pat);
8666  }
8667  else {
8668  match = rb_backref_get();
8669  regs = RMATCH_REGS(match);
8670  end = END(0);
8671  }
8672  if (pos == end) {
/* Empty match: step forward so the next search starts further on. */
8673  rb_encoding *enc = STR_ENC_GET(str);
8674  /*
8675  * Always consume at least one character of the input string
8676  */
8677  if (RSTRING_LEN(str) > end)
8678  *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
8679  RSTRING_END(str), enc);
8680  else
8681  *start = end + 1;
8682  }
8683  else {
8684  *start = end;
8685  }
8686  if (!regs || regs->num_regs == 1) {
/* No capture groups: the result is the whole match. */
8687  result = rb_str_subseq(str, pos, end - pos);
8688  OBJ_INFECT(result, pat);
8689  return result;
8690  }
/* Capture groups present: collect registers 1..num_regs-1. */
8691  result = rb_ary_new2(regs->num_regs);
8692  for (i=1; i < regs->num_regs; i++) {
8693  VALUE s = Qnil;
8694  if (BEG(i) >= 0) {
8695  s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
8696  OBJ_INFECT(s, pat);
8697  }
8698  rb_ary_push(result, s);
8699  }
8700 
8701  return result;
8702  }
8703  return Qnil;
8704 }
8705 
8706 
8707 /*
8708  * call-seq:
8709  * str.scan(pattern) -> array
8710  * str.scan(pattern) {|match, ...| block } -> str
8711  *
8712  * Both forms iterate through <i>str</i>, matching the pattern (which may be a
8713  * <code>Regexp</code> or a <code>String</code>). For each match, a result is
8714  * generated and either added to the result array or passed to the block. If
8715  * the pattern contains no groups, each individual result consists of the
8716  * matched string, <code>$&</code>. If the pattern contains groups, each
8717  * individual result is itself an array containing one entry per group.
8718  *
8719  * a = "cruel world"
8720  * a.scan(/\w+/) #=> ["cruel", "world"]
8721  * a.scan(/.../) #=> ["cru", "el ", "wor"]
8722  * a.scan(/(...)/) #=> [["cru"], ["el "], ["wor"]]
8723  * a.scan(/(..)(..)/) #=> [["cr", "ue"], ["l ", "wo"]]
8724  *
8725  * And the block form:
8726  *
8727  * a.scan(/\w+/) {|w| print "<<#{w}>> " }
8728  * print "\n"
8729  * a.scan(/(.)(.)/) {|x,y| print y, x }
8730  * print "\n"
8731  *
8732  * <em>produces:</em>
8733  *
8734  * <<cruel>> <<world>>
8735  * rceu lowlr
8736  */
8737 
8738 static VALUE
8739 rb_str_scan(VALUE str, VALUE pat)
8740 {
8741  VALUE result;
8742  long start = 0;
8743  long last = -1, prev = 0;
8744  char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
8745 
8746  pat = get_pat_quoted(pat, 1);
8747  mustnot_broken(str);
8748  if (!rb_block_given_p()) {
8749  VALUE ary = rb_ary_new();
8750 
8751  while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
8752  last = prev;
8753  prev = start;
8754  rb_ary_push(ary, result);
8755  }
8756  if (last >= 0) rb_pat_search(pat, str, last, 1);
8757  else rb_backref_set(Qnil);
8758  return ary;
8759  }
8760 
8761  while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
8762  last = prev;
8763  prev = start;
8764  rb_yield(result);
8765  str_mod_check(str, p, len);
8766  }
8767  if (last >= 0) rb_pat_search(pat, str, last, 1);
8768  return str;
8769 }
8770 
8771 
8772 /*
8773  * call-seq:
8774  * str.hex -> integer
8775  *
8776  * Treats leading characters from <i>str</i> as a string of hexadecimal digits
8777  * (with an optional sign and an optional <code>0x</code>) and returns the
8778  * corresponding number. Zero is returned on error.
8779  *
8780  * "0x0a".hex #=> 10
8781  * "-1234".hex #=> -4660
8782  * "0".hex #=> 0
8783  * "wombat".hex #=> 0
8784  */
8785 
8786 static VALUE
8787 rb_str_hex(VALUE str)
8788 {
8789  return rb_str_to_inum(str, 16, FALSE);
8790 }
8791 
8792 
8793 /*
8794  * call-seq:
8795  * str.oct -> integer
8796  *
8797  * Treats leading characters of <i>str</i> as a string of octal digits (with an
8798  * optional sign) and returns the corresponding number. Returns 0 if the
8799  * conversion fails.
8800  *
8801  * "123".oct #=> 83
8802  * "-377".oct #=> -255
8803  * "bad".oct #=> 0
8804  * "0377bad".oct #=> 255
8805  *
8806  * If +str+ starts with <code>0</code>, radix indicators are honored.
8807  * See Kernel#Integer.
8808  */
8809 
8810 static VALUE
8811 rb_str_oct(VALUE str)
8812 {
8813  return rb_str_to_inum(str, -8, FALSE);
8814 }
8815 
8816 
8817 /*
8818  * call-seq:
8819  * str.crypt(salt_str) -> new_str
8820  *
8821  * Applies a one-way cryptographic hash to <i>str</i> by invoking the
8822  * standard library function <code>crypt(3)</code> with the given
8823  * salt string. While the format and the result are system and
8824  * implementation dependent, using a salt matching the regular
8825  * expression <code>\A[a-zA-Z0-9./]{2}</code> should be valid and
8826  * safe on any platform, in which only the first two characters are
8827  * significant.
8828  *
8829  * This method is for use in system specific scripts, so if you want
8830  * a cross-platform hash function consider using Digest or OpenSSL
8831  * instead.
8832  */
8833 
8834 static VALUE
8835 rb_str_crypt(VALUE str, VALUE salt)
8836 {
8837 #undef LARGE_CRYPT_DATA
8838 #ifdef HAVE_CRYPT_R
8839 # if defined SIZEOF_CRYPT_DATA && SIZEOF_CRYPT_DATA <= 256
8840  struct crypt_data cdata, *const data = &cdata;
8841 # else
8842 # define LARGE_CRYPT_DATA
8843  struct crypt_data *data = ALLOC(struct crypt_data);
8844 # endif
8845 #else
8846  extern char *crypt(const char *, const char *);
8847 #endif
8848  VALUE result;
8849  const char *s, *saltp;
8850  char *res;
8851 #ifdef BROKEN_CRYPT
8852  char salt_8bit_clean[3];
8853 #endif
8854 
8855  StringValue(salt);
8856  mustnot_wchar(str);
8857  mustnot_wchar(salt);
8858  if (RSTRING_LEN(salt) < 2) {
8859  short_salt:
8860  rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
8861  }
8862 
8863  s = StringValueCStr(str);
8864  saltp = RSTRING_PTR(salt);
8865  if (!saltp[0] || !saltp[1]) goto short_salt;
8866 #ifdef BROKEN_CRYPT
8867  if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
8868  salt_8bit_clean[0] = saltp[0] & 0x7f;
8869  salt_8bit_clean[1] = saltp[1] & 0x7f;
8870  salt_8bit_clean[2] = '\0';
8871  saltp = salt_8bit_clean;
8872  }
8873 #endif
8874 #ifdef HAVE_CRYPT_R
8875 # ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
8876  data->initialized = 0;
8877 # endif
8878  res = crypt_r(s, saltp, data);
8879 #else
8880  res = crypt(s, saltp);
8881 #endif
8882  if (!res) {
8883 #ifdef LARGE_CRYPT_DATA
8884  int err = errno;
8885  xfree(data);
8886  errno = err;
8887 #endif
8888  rb_sys_fail("crypt");
8889  }
8890  result = rb_str_new_cstr(res);
8891 #ifdef LARGE_CRYPT_DATA
8892  xfree(data);
8893 #endif
8894  FL_SET_RAW(result, OBJ_TAINTED_RAW(str) | OBJ_TAINTED_RAW(salt));
8895  return result;
8896 }
8897 
8898 
8899 /*
8900  * call-seq:
8901  * str.ord -> integer
8902  *
8903  * Return the <code>Integer</code> ordinal of a one-character string.
8904  *
8905  * "a".ord #=> 97
8906  */
8907 
8908 VALUE
8910 {
8911  unsigned int c;
8912 
8914  return UINT2NUM(c);
8915 }
8916 /*
8917  * call-seq:
8918  * str.sum(n=16) -> integer
8919  *
8920  * Returns a basic <em>n</em>-bit checksum of the characters in <i>str</i>,
8921  * where <em>n</em> is the optional <code>Integer</code> parameter, defaulting
8922  * to 16. The result is simply the sum of the binary value of each byte in
8923  * <i>str</i> modulo <code>2**n</code> (that is, the sum masked with
8924  * <code>2**n - 1</code>). This is not a particularly good checksum.
8925  */
8926 
/*
 * Implements String#sum: adds up the bytes of +str+ and reduces the total
 * to its low +bits+ bits.
 *
 * +bits+ defaults to 16 when no argument is given; a negative argument is
 * treated as 0, and 0 means the total is returned without any masking.
 */
8927 static VALUE
8928 rb_str_sum(int argc, VALUE *argv, VALUE str)
8929 {
8930  VALUE vbits;
8931  int bits;
8932  char *ptr, *p, *pend;
8933  long len;
/* sum holds the part already folded into a Ruby Integer; sum0 is the
 * native accumulator for the bytes seen since the last fold. */
8934  VALUE sum = INT2FIX(0);
8935  unsigned long sum0 = 0;
8936 
8937  if (argc == 0) {
8938  bits = 16;
8939  }
8940  else {
8941  rb_scan_args(argc, argv, "01", &vbits);
8942  bits = NUM2INT(vbits);
8943  if (bits < 0)
8944  bits = 0;
8945  }
8946  ptr = p = RSTRING_PTR(str);
8947  len = RSTRING_LEN(str);
8948  pend = p + len;
8949 
8950  while (p < pend) {
/* Fold sum0 into sum before another byte could push it past FIXNUM_MAX. */
8951  if (FIXNUM_MAX - UCHAR_MAX < sum0) {
8952  sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
/* NOTE(review): presumably guards against str having been changed during
 * the method call above -- confirm against str_mod_check(). */
8953  str_mod_check(str, ptr, len);
8954  sum0 = 0;
8955  }
8956  sum0 += (unsigned char)*p;
8957  p++;
8958  }
8959 
8960  if (bits == 0) {
/* No masking requested: just fold in whatever is left. */
8961  if (sum0) {
8962  sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
8963  }
8964  }
8965  else {
8966  if (sum == INT2FIX(0)) {
/* Nothing was folded into sum, so the whole total is in sum0 and can be
 * masked natively (the mask is skipped when bits covers a full long). */
8967  if (bits < (int)sizeof(long)*CHAR_BIT) {
8968  sum0 &= (((unsigned long)1)<<bits)-1;
8969  }
8970  sum = LONG2FIX(sum0);
8971  }
8972  else {
8973  VALUE mod;
8974 
8975  if (sum0) {
8976  sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
8977  }
8978 
/* Build the mask (1 << bits) - 1 with Integer methods and apply it. */
8979  mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
8980  mod = rb_funcall(mod, '-', 1, INT2FIX(1));
8981  sum = rb_funcall(sum, '&', 1, mod);
8982  }
8983  }
8984  return sum;
8985 }
8986 
/*
 * Shared implementation of String#ljust, #rjust and #center.
 *
 * argv holds the target width and an optional pad string (a single space
 * is used when it is omitted).  +jflag+ selects where the padding goes:
 * 'l' puts all of it on the right, 'r' all of it on the left, and any
 * other value splits it, the left side getting n/2 characters.
 *
 * Returns rb_str_dup(str) when the width is negative or not larger than
 * the character length of +str+; otherwise a new string of str's class.
 * Raises ArgumentError for an empty pad string ("zero width padding") or
 * when the resulting byte size would overflow ("argument too big").
 */
8987 static VALUE
8988 rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
8989 {
8990  rb_encoding *enc;
8991  VALUE w;
/* flen: pad length in bytes; fclen: pad length in characters. */
8992  long width, len, flen = 1, fclen = 1;
8993  VALUE res;
8994  char *p;
8995  const char *f = " ";
/* llen/rlen: left/right padding in characters;
 * llen2/rlen2: byte lengths of the trailing partial pad copies. */
8996  long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
8997  VALUE pad;
8998  int singlebyte = 1, cr;
8999  int termlen;
9000 
9001  rb_scan_args(argc, argv, "11", &w, &pad);
9002  enc = STR_ENC_GET(str);
9003  termlen = rb_enc_mbminlen(enc);
9004  width = NUM2LONG(w);
9005  if (argc == 2) {
9006  StringValue(pad);
9007  enc = rb_enc_check(str, pad);
9008  f = RSTRING_PTR(pad);
9009  flen = RSTRING_LEN(pad);
9010  fclen = str_strlen(pad, enc); /* rb_enc_check */
9011  singlebyte = single_byte_optimizable(pad);
9012  if (flen == 0 || fclen == 0) {
9013  rb_raise(rb_eArgError, "zero width padding");
9014  }
9015  }
9016  len = str_strlen(str, enc); /* rb_enc_check */
9017  if (width < 0 || len >= width) return rb_str_dup(str);
/* n characters of padding are needed in total. */
9018  n = width - len;
9019  llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
9020  rlen = n - llen;
9021  cr = ENC_CODERANGE(str);
9022  if (flen > 1) {
/* NOTE(review): str_offset() presumably yields the byte length of the
 * first (llen % fclen) resp. (rlen % fclen) characters of the pad -- confirm. */
9023  llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
9024  rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
9025  }
9026  size = RSTRING_LEN(str);
/* Compute the padding size in bytes step by step, checking each step
 * against LONG_MAX before it is carried out. */
9027  if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
9028  (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
9029  (len += llen2 + rlen2) >= LONG_MAX - size) {
9030  rb_raise(rb_eArgError, "argument too big");
9031  }
9032  len += size;
9033  res = str_new0(rb_obj_class(str), 0, len, termlen);
9034  p = RSTRING_PTR(res);
/* Left padding. */
9035  if (flen <= 1) {
9036  memset(p, *f, llen);
9037  p += llen;
9038  }
9039  else {
9040  while (llen >= fclen) {
9041  memcpy(p,f,flen);
9042  p += flen;
9043  llen -= fclen;
9044  }
9045  if (llen > 0) {
9046  memcpy(p, f, llen2);
9047  p += llen2;
9048  }
9049  }
/* The original contents. */
9050  memcpy(p, RSTRING_PTR(str), size);
9051  p += size;
/* Right padding. */
9052  if (flen <= 1) {
9053  memset(p, *f, rlen);
9054  p += rlen;
9055  }
9056  else {
9057  while (rlen >= fclen) {
9058  memcpy(p,f,flen);
9059  p += flen;
9060  rlen -= fclen;
9061  }
9062  if (rlen > 0) {
9063  memcpy(p, f, rlen2);
9064  p += rlen2;
9065  }
9066  }
9067  TERM_FILL(p, termlen);
9068  STR_SET_LEN(res, p-RSTRING_PTR(res));
9069  OBJ_INFECT_RAW(res, str);
9070  if (!NIL_P(pad)) OBJ_INFECT_RAW(res, pad);
9071  rb_enc_associate(res, enc);
/* Combine the code ranges of str and pad; a broken result is left unset. */
9072  if (argc == 2)
9073  cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
9074  if (cr != ENC_CODERANGE_BROKEN)
9075  ENC_CODERANGE_SET(res, cr);
9076 
9077  RB_GC_GUARD(pad);
9078  return res;
9079 }
9080 
9081 
9082 /*
9083  * call-seq:
9084  * str.ljust(integer, padstr=' ') -> new_str
9085  *
9086  * If <i>integer</i> is greater than the length of <i>str</i>, returns a new
9087  * <code>String</code> of length <i>integer</i> with <i>str</i> left justified
9088  * and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
9089  *
9090  * "hello".ljust(4) #=> "hello"
9091  * "hello".ljust(20) #=> "hello "
9092  * "hello".ljust(20, '1234') #=> "hello123412341234123"
9093  */
9094 
9095 static VALUE
9096 rb_str_ljust(int argc, VALUE *argv, VALUE str)
9097 {
9098  return rb_str_justify(argc, argv, str, 'l');
9099 }
9100 
9101 
9102 /*
9103  * call-seq:
9104  * str.rjust(integer, padstr=' ') -> new_str
9105  *
9106  * If <i>integer</i> is greater than the length of <i>str</i>, returns a new
9107  * <code>String</code> of length <i>integer</i> with <i>str</i> right justified
9108  * and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
9109  *
9110  * "hello".rjust(4) #=> "hello"
9111  * "hello".rjust(20) #=> " hello"
9112  * "hello".rjust(20, '1234') #=> "123412341234123hello"
9113  */
9114 
9115 static VALUE
9116 rb_str_rjust(int argc, VALUE *argv, VALUE str)
9117 {
9118  return rb_str_justify(argc, argv, str, 'r');
9119 }
9120 
9121 
9122 /*
9123  * call-seq:
9124  * str.center(width, padstr=' ') -> new_str
9125  *
9126  * Centers +str+ in +width+. If +width+ is greater than the length of +str+,
9127  * returns a new String of length +width+ with +str+ centered and padded with
9128  * +padstr+; otherwise, returns +str+.
9129  *
9130  * "hello".center(4) #=> "hello"
9131  * "hello".center(20) #=> " hello "
9132  * "hello".center(20, '123') #=> "1231231hello12312312"
9133  */
9134 
9135 static VALUE
9136 rb_str_center(int argc, VALUE *argv, VALUE str)
9137 {
9138  return rb_str_justify(argc, argv, str, 'c');
9139 }
9140 
9141 /*
9142  * call-seq:
9143  * str.partition(sep) -> [head, sep, tail]
9144  * str.partition(regexp) -> [head, match, tail]
9145  *
9146  * Searches <i>sep</i> or pattern (<i>regexp</i>) in the string
9147  * and returns the part before it, the match, and the part
9148  * after it.
9149  * If it is not found, returns <i>str</i> and two empty strings.
9150  *
9151  * "hello".partition("l") #=> ["he", "l", "lo"]
9152  * "hello".partition("x") #=> ["hello", "", ""]
9153  * "hello".partition(/.l/) #=> ["h", "el", "lo"]
9154  */
9155 
9156 static VALUE
9157 rb_str_partition(VALUE str, VALUE sep)
9158 {
9159  long pos;
9160 
9161  sep = get_pat_quoted(sep, 0);
9162  if (RB_TYPE_P(sep, T_REGEXP)) {
9163  pos = rb_reg_search(sep, str, 0, 0);
9164  if (pos < 0) {
9165  failed:
9166  return rb_ary_new3(3, rb_str_dup(str), str_new_empty(str), str_new_empty(str));
9167  }
9168  sep = rb_str_subpat(str, sep, INT2FIX(0));
9169  if (pos == 0 && RSTRING_LEN(sep) == 0) goto failed;
9170  }
9171  else {
9172  pos = rb_str_index(str, sep, 0);
9173  if (pos < 0) goto failed;
9174  }
9175  return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
9176  sep,
9177  rb_str_subseq(str, pos+RSTRING_LEN(sep),
9178  RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
9179 }
9180 
9181 /*
9182  * call-seq:
9183  * str.rpartition(sep) -> [head, sep, tail]
9184  * str.rpartition(regexp) -> [head, match, tail]
9185  *
9186  * Searches <i>sep</i> or pattern (<i>regexp</i>) in the string from the end
9187  * of the string, and returns the part before it, the match, and the part
9188  * after it.
9189  * If it is not found, returns two empty strings and <i>str</i>.
9190  *
9191  * "hello".rpartition("l") #=> ["hel", "l", "o"]
9192  * "hello".rpartition("x") #=> ["", "", "hello"]
9193  * "hello".rpartition(/.l/) #=> ["he", "ll", "o"]
9194  */
9195 
9196 static VALUE
9197 rb_str_rpartition(VALUE str, VALUE sep)
9198 {
9199  long pos = RSTRING_LEN(str);
9200  int regex = FALSE;
9201 
9202  if (RB_TYPE_P(sep, T_REGEXP)) {
9203  pos = rb_reg_search(sep, str, pos, 1);
9204  regex = TRUE;
9205  }
9206  else {
9207  VALUE tmp;
9208 
9209  tmp = rb_check_string_type(sep);
9210  if (NIL_P(tmp)) {
9211  rb_raise(rb_eTypeError, "type mismatch: %s given",
9212  rb_obj_classname(sep));
9213  }
9214  sep = tmp;
9215  pos = rb_str_sublen(str, pos);
9216  pos = rb_str_rindex(str, sep, pos);
9217  }
9218  if (pos < 0) {
9219  return rb_ary_new3(3, str_new_empty(str), str_new_empty(str), rb_str_dup(str));
9220  }
9221  if (regex) {
9222  sep = rb_reg_nth_match(0, rb_backref_get());
9223  }
9224  else {
9225  pos = rb_str_offset(str, pos);
9226  }
9227  return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
9228  sep,
9229  rb_str_subseq(str, pos+RSTRING_LEN(sep),
9230  RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
9231 }
9232 
9233 /*
9234  * call-seq:
9235  * str.start_with?([prefixes]+) -> true or false
9236  *
9237  * Returns true if +str+ starts with one of the +prefixes+ given.
9238  *
9239  * "hello".start_with?("hell") #=> true
9240  *
9241  * # returns true if one of the prefixes matches.
9242  * "hello".start_with?("heaven", "hell") #=> true
9243  * "hello".start_with?("heaven", "paradise") #=> false
9244  */
9245 
9246 static VALUE
9247 rb_str_start_with(int argc, VALUE *argv, VALUE str)
9248 {
9249  int i;
9250 
9251  for (i=0; i<argc; i++) {
9252  VALUE tmp = argv[i];
9253  switch (TYPE(tmp)) {
9254  case T_REGEXP:
9255  {
9256  bool r = rb_reg_start_with_p(tmp, str);
9257  if (r) return Qtrue;
9258  }
9259  break;
9260  default:
9261  StringValue(tmp);
9262  rb_enc_check(str, tmp);
9263  if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
9264  if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
9265  return Qtrue;
9266  }
9267  }
9268  return Qfalse;
9269 }
9270 
9271 /*
9272  * call-seq:
9273  * str.end_with?([suffixes]+) -> true or false
9274  *
9275  * Returns true if +str+ ends with one of the +suffixes+ given.
9276  *
9277  * "hello".end_with?("ello") #=> true
9278  *
9279  * # returns true if one of the +suffixes+ matches.
9280  * "hello".end_with?("heaven", "ello") #=> true
9281  * "hello".end_with?("heaven", "paradise") #=> false
9282  */
9283 
9284 static VALUE
9285 rb_str_end_with(int argc, VALUE *argv, VALUE str)
9286 {
9287  int i;
9288  char *p, *s, *e;
9289  rb_encoding *enc;
9290 
9291  for (i=0; i<argc; i++) {
9292  VALUE tmp = argv[i];
9293  StringValue(tmp);
9294  enc = rb_enc_check(str, tmp);
9295  if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
9296  p = RSTRING_PTR(str);
9297  e = p + RSTRING_LEN(str);
9298  s = e - RSTRING_LEN(tmp);
9299  if (rb_enc_left_char_head(p, s, e, enc) != s)
9300  continue;
9301  if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
9302  return Qtrue;
9303  }
9304  return Qfalse;
9305 }
9306 
9316 static long
9317 deleted_prefix_length(VALUE str, VALUE prefix)
9318 {
9319  char *strptr, *prefixptr;
9320  long olen, prefixlen;
9321 
9322  StringValue(prefix);
9323  if (is_broken_string(prefix)) return 0;
9324  rb_enc_check(str, prefix);
9325 
9326  /* return 0 if not start with prefix */
9327  prefixlen = RSTRING_LEN(prefix);
9328  if (prefixlen <= 0) return 0;
9329  olen = RSTRING_LEN(str);
9330  if (olen < prefixlen) return 0;
9331  strptr = RSTRING_PTR(str);
9332  prefixptr = RSTRING_PTR(prefix);
9333  if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
9334 
9335  return prefixlen;
9336 }
9337 
9338 /*
9339  * call-seq:
9340  * str.delete_prefix!(prefix) -> self or nil
9341  *
9342  * Deletes leading <code>prefix</code> from <i>str</i>, returning
9343  * <code>nil</code> if no change was made.
9344  *
9345  * "hello".delete_prefix!("hel") #=> "lo"
9346  * "hello".delete_prefix!("llo") #=> nil
9347  */
9348 
9349 static VALUE
9350 rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
9351 {
9352  long prefixlen;
9353  str_modify_keep_cr(str);
9354 
9355  prefixlen = deleted_prefix_length(str, prefix);
9356  if (prefixlen <= 0) return Qnil;
9357 
9358  return rb_str_drop_bytes(str, prefixlen);
9359 }
9360 
9361 /*
9362  * call-seq:
9363  * str.delete_prefix(prefix) -> new_str
9364  *
9365  * Returns a copy of <i>str</i> with leading <code>prefix</code> deleted.
9366  *
9367  * "hello".delete_prefix("hel") #=> "lo"
9368  * "hello".delete_prefix("llo") #=> "hello"
9369  */
9370 
9371 static VALUE
9372 rb_str_delete_prefix(VALUE str, VALUE prefix)
9373 {
9374  long prefixlen;
9375 
9376  prefixlen = deleted_prefix_length(str, prefix);
9377  if (prefixlen <= 0) return rb_str_dup(str);
9378 
9379  return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
9380 }
9381 
9391 static long
9392 deleted_suffix_length(VALUE str, VALUE suffix)
9393 {
9394  char *strptr, *suffixptr, *s;
9395  long olen, suffixlen;
9396  rb_encoding *enc;
9397 
9398  StringValue(suffix);
9399  if (is_broken_string(suffix)) return 0;
9400  enc = rb_enc_check(str, suffix);
9401 
9402  /* return 0 if not start with suffix */
9403  suffixlen = RSTRING_LEN(suffix);
9404  if (suffixlen <= 0) return 0;
9405  olen = RSTRING_LEN(str);
9406  if (olen < suffixlen) return 0;
9407  strptr = RSTRING_PTR(str);
9408  suffixptr = RSTRING_PTR(suffix);
9409  s = strptr + olen - suffixlen;
9410  if (memcmp(s, suffixptr, suffixlen) != 0) return 0;
9411  if (rb_enc_left_char_head(strptr, s, strptr + olen, enc) != s) return 0;
9412 
9413  return suffixlen;
9414 }
9415 
9416 /*
9417  * call-seq:
9418  * str.delete_suffix!(suffix) -> self or nil
9419  *
9420  * Deletes trailing <code>suffix</code> from <i>str</i>, returning
9421  * <code>nil</code> if no change was made.
9422  *
9423  * "hello".delete_suffix!("llo") #=> "he"
9424  * "hello".delete_suffix!("hel") #=> nil
9425  */
9426 
9427 static VALUE
9428 rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
9429 {
9430  long olen, suffixlen, len;
9431  str_modifiable(str);
9432 
9433  suffixlen = deleted_suffix_length(str, suffix);
9434  if (suffixlen <= 0) return Qnil;
9435 
9436  olen = RSTRING_LEN(str);
9437  str_modify_keep_cr(str);
9438  len = olen - suffixlen;
9439  STR_SET_LEN(str, len);
9440  TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
9441  if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
9442  ENC_CODERANGE_CLEAR(str);
9443  }
9444  return str;
9445 }
9446 
9447 /*
9448  * call-seq:
9449  * str.delete_suffix(suffix) -> new_str
9450  *
9451  * Returns a copy of <i>str</i> with trailing <code>suffix</code> deleted.
9452  *
9453  * "hello".delete_suffix("llo") #=> "he"
9454  * "hello".delete_suffix("hel") #=> "hello"
9455  */
9456 
9457 static VALUE
9458 rb_str_delete_suffix(VALUE str, VALUE suffix)
9459 {
9460  long suffixlen;
9461 
9462  suffixlen = deleted_suffix_length(str, suffix);
9463  if (suffixlen <= 0) return rb_str_dup(str);
9464 
9465  return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
9466 }
9467 
9468 void
9469 rb_str_setter(VALUE val, ID id, VALUE *var)
9470 {
9471  if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
9472  rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
9473  }
9474  *var = val;
9475 }
9476 
9477 static void
9478 rb_fs_setter(VALUE val, ID id, VALUE *var)
9479 {
9480  val = rb_fs_check(val);
9481  if (!val) {
9483  "value of %"PRIsVALUE" must be String or Regexp",
9484  rb_id2str(id));
9485  }
9486  *var = val;
9487 }
9488 
9489 
9490 /*
9491  * call-seq:
9492  * str.force_encoding(encoding) -> str
9493  *
9494  * Changes the encoding to +encoding+ and returns self.
9495  */
9496 
9497 static VALUE
9498 rb_str_force_encoding(VALUE str, VALUE enc)
9499 {
9500  str_modifiable(str);
9501  rb_enc_associate(str, rb_to_encoding(enc));
9502  ENC_CODERANGE_CLEAR(str);
9503  return str;
9504 }
9505 
9506 /*
9507  * call-seq:
9508  * str.b -> str
9509  *
9510  * Returns a copied string whose encoding is ASCII-8BIT.
9511  */
9512 
9513 static VALUE
9514 rb_str_b(VALUE str)
9515 {
9516  VALUE str2 = str_alloc(rb_cString);
9517  str_replace_shared_without_enc(str2, str);
9518  OBJ_INFECT_RAW(str2, str);
9519  ENC_CODERANGE_CLEAR(str2);
9520  return str2;
9521 }
9522 
9523 /*
9524  * call-seq:
9525  * str.valid_encoding? -> true or false
9526  *
9527  * Returns true for a string which is encoded correctly.
9528  *
9529  * "\xc2\xa1".force_encoding("UTF-8").valid_encoding? #=> true
9530  * "\xc2".force_encoding("UTF-8").valid_encoding? #=> false
9531  * "\x80".force_encoding("UTF-8").valid_encoding? #=> false
9532  */
9533 
9534 static VALUE
9535 rb_str_valid_encoding_p(VALUE str)
9536 {
9537  int cr = rb_enc_str_coderange(str);
9538 
9539  return cr == ENC_CODERANGE_BROKEN ? Qfalse : Qtrue;
9540 }
9541 
9542 /*
9543  * call-seq:
9544  * str.ascii_only? -> true or false
9545  *
9546  * Returns true for a string which has only ASCII characters.
9547  *
9548  * "abc".force_encoding("UTF-8").ascii_only? #=> true
9549  * "abc\u{6666}".force_encoding("UTF-8").ascii_only? #=> false
9550  */
9551 
9552 static VALUE
9553 rb_str_is_ascii_only_p(VALUE str)
9554 {
9555  int cr = rb_enc_str_coderange(str);
9556 
9557  return cr == ENC_CODERANGE_7BIT ? Qtrue : Qfalse;
9558 }
9559 
9574 VALUE
9575 rb_str_ellipsize(VALUE str, long len)
9576 {
9577  static const char ellipsis[] = "...";
9578  const long ellipsislen = sizeof(ellipsis) - 1;
9579  rb_encoding *const enc = rb_enc_get(str);
9580  const long blen = RSTRING_LEN(str);
9581  const char *const p = RSTRING_PTR(str), *e = p + blen;
9582  VALUE estr, ret = 0;
9583 
9584  if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
9585  if (len * rb_enc_mbminlen(enc) >= blen ||
9586  (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
9587  ret = str;
9588  }
9589  else if (len <= ellipsislen ||
9590  !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
9591  if (rb_enc_asciicompat(enc)) {
9592  ret = rb_str_new_with_class(str, ellipsis, len);
9593  rb_enc_associate(ret, enc);
9594  }
9595  else {
9596  estr = rb_usascii_str_new(ellipsis, len);
9597  ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
9598  }
9599  }
9600  else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
9601  rb_str_cat(ret, ellipsis, ellipsislen);
9602  }
9603  else {
9604  estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
9605  rb_enc_from_encoding(enc), 0, Qnil);
9606  rb_str_append(ret, estr);
9607  }
9608  return ret;
9609 }
9610 
9611 static VALUE
9612 str_compat_and_valid(VALUE str, rb_encoding *enc)
9613 {
9614  int cr;
9615  str = StringValue(str);
9616  cr = rb_enc_str_coderange(str);
9617  if (cr == ENC_CODERANGE_BROKEN) {
9618  rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
9619  }
9620  else {
9621  rb_encoding *e = STR_ENC_GET(str);
9622  if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
9623  rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
9624  rb_enc_name(enc), rb_enc_name(e));
9625  }
9626  }
9627  return str;
9628 }
9629 
9630 static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
9631 
9637 VALUE
9639 {
9640  rb_encoding *enc = STR_ENC_GET(str);
9641  return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
9642 }
9643 
9644 VALUE
9646 {
9647  int cr = ENC_CODERANGE_UNKNOWN;
9648  if (enc == STR_ENC_GET(str)) {
9649  /* cached coderange makes sense only when enc equals the
9650  * actual encoding of str */
9651  cr = ENC_CODERANGE(str);
9652  }
9653  return enc_str_scrub(enc, str, repl, cr);
9654 }
9655 
9656 static VALUE
9657 enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
9658 {
9659  int encidx;
9660  VALUE buf = Qnil;
9661  const char *rep;
9662  long replen = -1;
9663  int tainted = 0;
9664 
9665  if (rb_block_given_p()) {
9666  if (!NIL_P(repl))
9667  rb_raise(rb_eArgError, "both of block and replacement given");
9668  replen = 0;
9669  }
9670 
9671  if (ENC_CODERANGE_CLEAN_P(cr))
9672  return Qnil;
9673 
9674  if (!NIL_P(repl)) {
9675  repl = str_compat_and_valid(repl, enc);
9676  tainted = OBJ_TAINTED_RAW(repl);
9677  }
9678 
9679  if (rb_enc_dummy_p(enc)) {
9680  return Qnil;
9681  }
9682  encidx = rb_enc_to_index(enc);
9683 
9684 #define DEFAULT_REPLACE_CHAR(str) do { \
9685  static const char replace[sizeof(str)-1] = str; \
9686  rep = replace; replen = (int)sizeof(replace); \
9687  } while (0)
9688 
9689  if (rb_enc_asciicompat(enc)) {
9690  const char *p = RSTRING_PTR(str);
9691  const char *e = RSTRING_END(str);
9692  const char *p1 = p;
9693  int rep7bit_p;
9694  if (!replen) {
9695  rep = NULL;
9696  rep7bit_p = FALSE;
9697  }
9698  else if (!NIL_P(repl)) {
9699  rep = RSTRING_PTR(repl);
9700  replen = RSTRING_LEN(repl);
9701  rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
9702  }
9703  else if (encidx == rb_utf8_encindex()) {
9704  DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
9705  rep7bit_p = FALSE;
9706  }
9707  else {
9708  DEFAULT_REPLACE_CHAR("?");
9709  rep7bit_p = TRUE;
9710  }
9711  cr = ENC_CODERANGE_7BIT;
9712 
9713  p = search_nonascii(p, e);
9714  if (!p) {
9715  p = e;
9716  }
9717  while (p < e) {
9718  int ret = rb_enc_precise_mbclen(p, e, enc);
9719  if (MBCLEN_NEEDMORE_P(ret)) {
9720  break;
9721  }
9722  else if (MBCLEN_CHARFOUND_P(ret)) {
9723  cr = ENC_CODERANGE_VALID;
9724  p += MBCLEN_CHARFOUND_LEN(ret);
9725  }
9726  else if (MBCLEN_INVALID_P(ret)) {
9727  /*
9728  * p1~p: valid ascii/multibyte chars
9729  * p ~e: invalid bytes + unknown bytes
9730  */
9731  long clen = rb_enc_mbmaxlen(enc);
9732  if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
9733  if (p > p1) {
9734  rb_str_buf_cat(buf, p1, p - p1);
9735  }
9736 
9737  if (e - p < clen) clen = e - p;
9738  if (clen <= 2) {
9739  clen = 1;
9740  }
9741  else {
9742  const char *q = p;
9743  clen--;
9744  for (; clen > 1; clen--) {
9745  ret = rb_enc_precise_mbclen(q, q + clen, enc);
9746  if (MBCLEN_NEEDMORE_P(ret)) break;
9747  if (MBCLEN_INVALID_P(ret)) continue;
9748  UNREACHABLE;
9749  }
9750  }
9751  if (rep) {
9752  rb_str_buf_cat(buf, rep, replen);
9753  if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
9754  }
9755  else {
9756  repl = rb_yield(rb_enc_str_new(p, clen, enc));
9757  repl = str_compat_and_valid(repl, enc);
9758  tainted |= OBJ_TAINTED_RAW(repl);
9759  rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
9760  if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID)
9761  cr = ENC_CODERANGE_VALID;
9762  }
9763  p += clen;
9764  p1 = p;
9765  p = search_nonascii(p, e);
9766  if (!p) {
9767  p = e;
9768  break;
9769  }
9770  }
9771  else {
9772  UNREACHABLE;
9773  }
9774  }
9775  if (NIL_P(buf)) {
9776  if (p == e) {
9777  ENC_CODERANGE_SET(str, cr);
9778  return Qnil;
9779  }
9780  buf = rb_str_buf_new(RSTRING_LEN(str));
9781  }
9782  if (p1 < p) {
9783  rb_str_buf_cat(buf, p1, p - p1);
9784  }
9785  if (p < e) {
9786  if (rep) {
9787  rb_str_buf_cat(buf, rep, replen);
9788  if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
9789  }
9790  else {
9791  repl = rb_yield(rb_enc_str_new(p, e-p, enc));
9792  repl = str_compat_and_valid(repl, enc);
9793  tainted |= OBJ_TAINTED_RAW(repl);
9794  rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
9795  if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID)
9796  cr = ENC_CODERANGE_VALID;
9797  }
9798  }
9799  }
9800  else {
9801  /* ASCII incompatible */
9802  const char *p = RSTRING_PTR(str);
9803  const char *e = RSTRING_END(str);
9804  const char *p1 = p;
9805  long mbminlen = rb_enc_mbminlen(enc);
9806  if (!replen) {
9807  rep = NULL;
9808  }
9809  else if (!NIL_P(repl)) {
9810  rep = RSTRING_PTR(repl);
9811  replen = RSTRING_LEN(repl);
9812  }
9813  else if (encidx == ENCINDEX_UTF_16BE) {
9814  DEFAULT_REPLACE_CHAR("\xFF\xFD");
9815  }
9816  else if (encidx == ENCINDEX_UTF_16LE) {
9817  DEFAULT_REPLACE_CHAR("\xFD\xFF");
9818  }
9819  else if (encidx == ENCINDEX_UTF_32BE) {
9820  DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
9821  }
9822  else if (encidx == ENCINDEX_UTF_32LE) {
9823  DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
9824  }
9825  else {
9826  DEFAULT_REPLACE_CHAR("?");
9827  }
9828 
9829  while (p < e) {
9830  int ret = rb_enc_precise_mbclen(p, e, enc);
9831  if (MBCLEN_NEEDMORE_P(ret)) {
9832  break;
9833  }
9834  else if (MBCLEN_CHARFOUND_P(ret)) {
9835  p += MBCLEN_CHARFOUND_LEN(ret);
9836  }
9837  else if (MBCLEN_INVALID_P(ret)) {
9838  const char *q = p;
9839  long clen = rb_enc_mbmaxlen(enc);
9840  if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
9841  if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
9842 
9843  if (e - p < clen) clen = e - p;
9844  if (clen <= mbminlen * 2) {
9845  clen = mbminlen;
9846  }
9847  else {
9848  clen -= mbminlen;
9849  for (; clen > mbminlen; clen-=mbminlen) {
9850  ret = rb_enc_precise_mbclen(q, q + clen, enc);
9851  if (MBCLEN_NEEDMORE_P(ret)) break;
9852  if (MBCLEN_INVALID_P(ret)) continue;
9853  UNREACHABLE;
9854  }
9855  }
9856  if (rep) {
9857  rb_str_buf_cat(buf, rep, replen);
9858  }
9859  else {
9860  repl = rb_yield(rb_enc_str_new(p, clen, enc));
9861  repl = str_compat_and_valid(repl, enc);
9862  tainted |= OBJ_TAINTED_RAW(repl);
9863  rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
9864  }
9865  p += clen;
9866  p1 = p;
9867  }
9868  else {
9869  UNREACHABLE;
9870  }
9871  }
9872  if (NIL_P(buf)) {
9873  if (p == e) {
9875  return Qnil;
9876  }
9877  buf = rb_str_buf_new(RSTRING_LEN(str));
9878  }
9879  if (p1 < p) {
9880  rb_str_buf_cat(buf, p1, p - p1);
9881  }
9882  if (p < e) {
9883  if (rep) {
9884  rb_str_buf_cat(buf, rep, replen);
9885  }
9886  else {
9887  repl = rb_yield(rb_enc_str_new(p, e-p, enc));
9888  repl = str_compat_and_valid(repl, enc);
9889  tainted |= OBJ_TAINTED_RAW(repl);
9890  rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
9891  }
9892  }
9893  cr = ENC_CODERANGE_VALID;
9894  }
9895  FL_SET_RAW(buf, tainted|OBJ_TAINTED_RAW(str));
9896  ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
9897  return buf;
9898 }
9899 
/*
 *  call-seq:
 *    str.scrub -> new_str
 *    str.scrub(repl) -> new_str
 *    str.scrub{|bytes|} -> new_str
 *
 *  If the string contains invalid byte sequences, returns a new string in
 *  which they are replaced with the given replacement string (or a default
 *  replacement character); otherwise returns a copy of the string.
 *  If a block is given, each invalid byte sequence is replaced with the
 *  value returned by the block.
 *
 *     "abc\u3042\x81".scrub #=> "abc\u3042\uFFFD"
 *     "abc\u3042\x81".scrub("*") #=> "abc\u3042*"
 *     "abc\u3042\xE3\x80".scrub{|bytes| '<'+bytes.unpack('H*')[0]+'>' } #=> "abc\u3042<e380>"
 */
9914 static VALUE
9915 str_scrub(int argc, VALUE *argv, VALUE str)
9916 {
9917  VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
9918  VALUE new = rb_str_scrub(str, repl);
9919  return NIL_P(new) ? rb_str_dup(str): new;
9920 }
9921 
9922 /*
9923  * call-seq:
9924  * str.scrub! -> str
9925  * str.scrub!(repl) -> str
9926  * str.scrub!{|bytes|} -> str
9927  *
9928  * If the string is invalid byte sequence then replace invalid bytes with given replacement
9929  * character, else returns self.
9930  * If block is given, replace invalid bytes with returned value of the block.
9931  *
9932  * "abc\u3042\x81".scrub! #=> "abc\u3042\uFFFD"
9933  * "abc\u3042\x81".scrub!("*") #=> "abc\u3042*"
9934  * "abc\u3042\xE3\x80".scrub!{|bytes| '<'+bytes.unpack('H*')[0]+'>' } #=> "abc\u3042<e380>"
9935  */
9936 static VALUE
9937 str_scrub_bang(int argc, VALUE *argv, VALUE str)
9938 {
9939  VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
9940  VALUE new = rb_str_scrub(str, repl);
9941  if (!NIL_P(new)) rb_str_replace(str, new);
9942  return str;
9943 }
9944 
9945 static ID id_normalize;
9946 static ID id_normalized_p;
9947 static VALUE mUnicodeNormalize;
9948 
9949 static VALUE
9950 unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
9951 {
9952  static int UnicodeNormalizeRequired = 0;
9953  VALUE argv2[2];
9954 
9955  if (!UnicodeNormalizeRequired) {
9956  rb_require("unicode_normalize/normalize.rb");
9957  UnicodeNormalizeRequired = 1;
9958  }
9959  argv2[0] = str;
9960  rb_scan_args(argc, argv, "01", &argv2[1]);
9961  return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
9962 }
9963 
9964 /*
9965  * call-seq:
9966  * str.unicode_normalize(form=:nfc)
9967  *
9968  * Unicode Normalization---Returns a normalized form of +str+,
9969  * using Unicode normalizations NFC, NFD, NFKC, or NFKD.
9970  * The normalization form used is determined by +form+, which can
9971  * be any of the four values +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
9972  * The default is +:nfc+.
9973  *
9974  * If the string is not in a Unicode Encoding, then an Exception is raised.
9975  * In this context, 'Unicode Encoding' means any of UTF-8, UTF-16BE/LE,
9976  * and UTF-32BE/LE, as well as GB18030, UCS_2BE, and UCS_4BE.
9977  * Anything other than UTF-8 is implemented by converting to UTF-8,
9978  * which makes it slower than UTF-8.
9979  *
9980  * "a\u0300".unicode_normalize #=> "\u00E0"
9981  * "a\u0300".unicode_normalize(:nfc) #=> "\u00E0"
9982  * "\u00E0".unicode_normalize(:nfd) #=> "a\u0300"
9983  * "\xE0".force_encoding('ISO-8859-1').unicode_normalize(:nfd)
9984  * #=> Encoding::CompatibilityError raised
9985  */
9986 static VALUE
9987 rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
9988 {
9989  return unicode_normalize_common(argc, argv, str, id_normalize);
9990 }
9991 
9992 /*
9993  * call-seq:
9994  * str.unicode_normalize!(form=:nfc)
9995  *
9996  * Destructive version of String#unicode_normalize, doing Unicode
9997  * normalization in place.
9998  */
9999 static VALUE
10000 rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
10001 {
10002  return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
10003 }
10004 
10005 /* call-seq:
10006  * str.unicode_normalized?(form=:nfc)
10007  *
10008  * Checks whether +str+ is in Unicode normalization form +form+,
10009  * which can be any of the four values +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
10010  * The default is +:nfc+.
10011  *
10012  * If the string is not in a Unicode Encoding, then an Exception is raised.
10013  * For details, see String#unicode_normalize.
10014  *
10015  * "a\u0300".unicode_normalized? #=> false
10016  * "a\u0300".unicode_normalized?(:nfd) #=> true
10017  * "\u00E0".unicode_normalized? #=> true
10018  * "\u00E0".unicode_normalized?(:nfd) #=> false
10019  * "\xE0".force_encoding('ISO-8859-1').unicode_normalized?
10020  * #=> Encoding::CompatibilityError raised
10021  */
10022 static VALUE
10023 rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
10024 {
10025  return unicode_normalize_common(argc, argv, str, id_normalized_p);
10026 }
10027 
10028 /**********************************************************************
10029  * Document-class: Symbol
10030  *
10031  * <code>Symbol</code> objects represent names and some strings
10032  * inside the Ruby
10033  * interpreter. They are generated using the <code>:name</code> and
10034  * <code>:"string"</code> literals
10035  * syntax, and by the various <code>to_sym</code> methods. The same
10036  * <code>Symbol</code> object will be created for a given name or string
10037  * for the duration of a program's execution, regardless of the context
10038  * or meaning of that name. Thus if <code>Fred</code> is a constant in
10039  * one context, a method in another, and a class in a third, the
10040  * <code>Symbol</code> <code>:Fred</code> will be the same object in
10041  * all three contexts.
10042  *
10043  * module One
10044  * class Fred
10045  * end
10046  * $f1 = :Fred
10047  * end
10048  * module Two
10049  * Fred = 1
10050  * $f2 = :Fred
10051  * end
10052  * def Fred()
10053  * end
10054  * $f3 = :Fred
10055  * $f1.object_id #=> 2514190
10056  * $f2.object_id #=> 2514190
10057  * $f3.object_id #=> 2514190
10058  *
10059  */
10060 
10061 
10062 /*
10063  * call-seq:
10064  * sym == obj -> true or false
10065  *
10066  * Equality---If <i>sym</i> and <i>obj</i> are exactly the same
10067  * symbol, returns <code>true</code>.
10068  */
10069 
10070 #define sym_equal rb_obj_equal
10071 
10072 static int
10073 sym_printable(const char *s, const char *send, rb_encoding *enc)
10074 {
10075  while (s < send) {
10076  int n;
10077  int c = rb_enc_precise_mbclen(s, send, enc);
10078 
10079  if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
10080  n = MBCLEN_CHARFOUND_LEN(c);
10081  c = rb_enc_mbc_to_codepoint(s, send, enc);
10082  if (!rb_enc_isprint(c, enc)) return FALSE;
10083  s += n;
10084  }
10085  return TRUE;
10086 }
10087 
10088 int
10090 {
10091  rb_encoding *enc;
10092  const char *ptr;
10093  long len;
10095 
10096  if (resenc == NULL) resenc = rb_default_external_encoding();
10097  enc = STR_ENC_GET(sym);
10098  ptr = RSTRING_PTR(sym);
10099  len = RSTRING_LEN(sym);
10100  if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
10101  !rb_enc_symname_p(ptr, enc) || !sym_printable(ptr, ptr + len, enc)) {
10102  return FALSE;
10103  }
10104  return TRUE;
10105 }
10106 
10107 VALUE
10109 {
10110  rb_encoding *enc;
10111  const char *ptr;
10112  long len;
10113  rb_encoding *resenc;
10114 
10115  Check_Type(str, T_STRING);
10116  resenc = rb_default_internal_encoding();
10117  if (resenc == NULL) resenc = rb_default_external_encoding();
10118  enc = STR_ENC_GET(str);
10119  ptr = RSTRING_PTR(str);
10120  len = RSTRING_LEN(str);
10121  if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
10122  !sym_printable(ptr, ptr + len, enc)) {
10123  return rb_str_inspect(str);
10124  }
10125  return str;
10126 }
10127 
10128 VALUE
10130 {
10131  return rb_str_quote_unprintable(rb_id2str(id));
10132 }
10133 
10134 /*
10135  * call-seq:
10136  * sym.inspect -> string
10137  *
10138  * Returns the representation of <i>sym</i> as a symbol literal.
10139  *
10140  * :fred.inspect #=> ":fred"
10141  */
10142 
10143 static VALUE
10144 sym_inspect(VALUE sym)
10145 {
10146  VALUE str = rb_sym2str(sym);
10147  const char *ptr;
10148  long len;
10149  char *dest;
10150 
10151  if (!rb_str_symname_p(str)) {
10152  str = rb_str_inspect(str);
10153  len = RSTRING_LEN(str);
10154  rb_str_resize(str, len + 1);
10155  dest = RSTRING_PTR(str);
10156  memmove(dest + 1, dest, len);
10157  }
10158  else {
10159  rb_encoding *enc = STR_ENC_GET(str);
10160  RSTRING_GETMEM(str, ptr, len);
10161  str = rb_enc_str_new(0, len + 1, enc);
10162  dest = RSTRING_PTR(str);
10163  memcpy(dest + 1, ptr, len);
10164  }
10165  dest[0] = ':';
10166  return str;
10167 }
10168 
10169 
10170 /*
10171  * call-seq:
10172  * sym.id2name -> string
10173  * sym.to_s -> string
10174  *
10175  * Returns the name or string corresponding to <i>sym</i>.
10176  *
10177  * :fred.id2name #=> "fred"
10178  * :ginger.to_s #=> "ginger"
10179  */
10180 
10181 
10182 VALUE
10184 {
10185  return str_new_shared(rb_cString, rb_sym2str(sym));
10186 }
10187 
10188 
10189 /*
10190  * call-seq:
10191  * sym.to_sym -> sym
10192  * sym.intern -> sym
10193  *
10194  * In general, <code>to_sym</code> returns the <code>Symbol</code> corresponding
10195  * to an object. As <i>sym</i> is already a symbol, <code>self</code> is returned
10196  * in this case.
10197  */
10198 
10199 static VALUE
10200 sym_to_sym(VALUE sym)
10201 {
10202  return sym;
10203 }
10204 
10205 VALUE
10206 rb_sym_proc_call(ID mid, int argc, const VALUE *argv, VALUE passed_proc)
10207 {
10208  VALUE obj;
10209 
10210  if (argc < 1) {
10211  rb_raise(rb_eArgError, "no receiver given");
10212  }
10213  obj = argv[0];
10214  return rb_funcall_with_block(obj, mid, argc - 1, argv + 1, passed_proc);
10215 }
10216 
10217 #if 0
10218 /*
10219  * call-seq:
10220  * sym.to_proc
10221  *
10222  * Returns a _Proc_ object which respond to the given method by _sym_.
10223  *
10224  * (1..3).collect(&:to_s) #=> ["1", "2", "3"]
10225  */
10226 
10227 VALUE
10228 rb_sym_to_proc(VALUE sym)
10229 {
10230 }
10231 #endif
10232 
10233 /*
10234  * call-seq:
10235  *
10236  * sym.succ
10237  *
10238  * Same as <code>sym.to_s.succ.intern</code>.
10239  */
10240 
10241 static VALUE
10242 sym_succ(VALUE sym)
10243 {
10244  return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
10245 }
10246 
10247 /*
10248  * call-seq:
10249  *
10250  * symbol <=> other_symbol -> -1, 0, +1, or nil
10251  *
10252  * Compares +symbol+ with +other_symbol+ after calling #to_s on each of the
10253  * symbols. Returns -1, 0, +1, or +nil+ depending on whether +symbol+ is
10254  * less than, equal to, or greater than +other_symbol+.
10255  *
10256  * +nil+ is returned if the two values are incomparable.
10257  *
10258  * See String#<=> for more information.
10259  */
10260 
10261 static VALUE
10262 sym_cmp(VALUE sym, VALUE other)
10263 {
10264  if (!SYMBOL_P(other)) {
10265  return Qnil;
10266  }
10267  return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
10268 }
10269 
10270 /*
10271  * call-seq:
10272  * sym.casecmp(other_symbol) -> -1, 0, +1, or nil
10273  *
10274  * Case-insensitive version of <code>Symbol#<=></code>.
10275  * Currently, case-insensitivity only works on characters A-Z/a-z,
10276  * not all of Unicode. This is different from Symbol#casecmp?.
10277  *
10278  * :aBcDeF.casecmp(:abcde) #=> 1
10279  * :aBcDeF.casecmp(:abcdef) #=> 0
10280  * :aBcDeF.casecmp(:abcdefg) #=> -1
10281  * :abcdef.casecmp(:ABCDEF) #=> 0
10282  *
10283  * +nil+ is returned if the two symbols have incompatible encodings,
10284  * or if +other_symbol+ is not a symbol.
10285  *
10286  * :foo.casecmp(2) #=> nil
10287  * "\u{e4 f6 fc}".encode("ISO-8859-1").to_sym.casecmp(:"\u{c4 d6 dc}") #=> nil
10288  */
10289 
10290 static VALUE
10291 sym_casecmp(VALUE sym, VALUE other)
10292 {
10293  if (!SYMBOL_P(other)) {
10294  return Qnil;
10295  }
10296  return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
10297 }
10298 
10299 /*
10300  * call-seq:
10301  * sym.casecmp?(other_symbol) -> true, false, or nil
10302  *
10303  * Returns +true+ if +sym+ and +other_symbol+ are equal after
10304  * Unicode case folding, +false+ if they are not equal.
10305  *
10306  * :aBcDeF.casecmp?(:abcde) #=> false
10307  * :aBcDeF.casecmp?(:abcdef) #=> true
10308  * :aBcDeF.casecmp?(:abcdefg) #=> false
10309  * :abcdef.casecmp?(:ABCDEF) #=> true
10310  * :"\u{e4 f6 fc}".casecmp?(:"\u{c4 d6 dc}") #=> true
10311  *
10312  * +nil+ is returned if the two symbols have incompatible encodings,
10313  * or if +other_symbol+ is not a symbol.
10314  *
10315  * :foo.casecmp?(2) #=> nil
10316  * "\u{e4 f6 fc}".encode("ISO-8859-1").to_sym.casecmp?(:"\u{c4 d6 dc}") #=> nil
10317  */
10318 
10319 static VALUE
10320 sym_casecmp_p(VALUE sym, VALUE other)
10321 {
10322  if (!SYMBOL_P(other)) {
10323  return Qnil;
10324  }
10325  return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
10326 }
10327 
10328 /*
10329  * call-seq:
10330  * sym =~ obj -> integer or nil
10331  *
10332  * Returns <code>sym.to_s =~ obj</code>.
10333  */
10334 
10335 static VALUE
10336 sym_match(VALUE sym, VALUE other)
10337 {
10338  return rb_str_match(rb_sym2str(sym), other);
10339 }
10340 
10341 /*
10342  * call-seq:
10343  * sym.match(pattern) -> matchdata or nil
10344  * sym.match(pattern, pos) -> matchdata or nil
10345  *
10346  * Returns <code>sym.to_s.match</code>.
10347  */
10348 
10349 static VALUE
10350 sym_match_m(int argc, VALUE *argv, VALUE sym)
10351 {
10352  return rb_str_match_m(argc, argv, rb_sym2str(sym));
10353 }
10354 
10355 /*
10356  * call-seq:
10357  * sym.match?(pattern) -> true or false
10358  * sym.match?(pattern, pos) -> true or false
10359  *
10360  * Returns <code>sym.to_s.match?</code>.
10361  */
10362 
10363 static VALUE
10364 sym_match_m_p(int argc, VALUE *argv, VALUE sym)
10365 {
10366  return rb_str_match_m_p(argc, argv, sym);
10367 }
10368 
10369 /*
10370  * call-seq:
10371  * sym[idx] -> char
10372  * sym[b, n] -> string
10373  * sym.slice(idx) -> char
10374  * sym.slice(b, n) -> string
10375  *
10376  * Returns <code>sym.to_s[]</code>.
10377  */
10378 
10379 static VALUE
10380 sym_aref(int argc, VALUE *argv, VALUE sym)
10381 {
10382  return rb_str_aref_m(argc, argv, rb_sym2str(sym));
10383 }
10384 
10385 /*
10386  * call-seq:
10387  * sym.length -> integer
10388  * sym.size -> integer
10389  *
10390  * Same as <code>sym.to_s.length</code>.
10391  */
10392 
10393 static VALUE
10394 sym_length(VALUE sym)
10395 {
10396  return rb_str_length(rb_sym2str(sym));
10397 }
10398 
10399 /*
10400  * call-seq:
10401  * sym.empty? -> true or false
10402  *
10403  * Returns whether _sym_ is :"" or not.
10404  */
10405 
10406 static VALUE
10407 sym_empty(VALUE sym)
10408 {
10409  return rb_str_empty(rb_sym2str(sym));
10410 }
10411 
10412 /*
10413  * call-seq:
10414  * sym.upcase -> symbol
10415  * sym.upcase([options]) -> symbol
10416  *
10417  * Same as <code>sym.to_s.upcase.intern</code>.
10418  */
10419 
10420 static VALUE
10421 sym_upcase(int argc, VALUE *argv, VALUE sym)
10422 {
10423  return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
10424 }
10425 
10426 /*
10427  * call-seq:
10428  * sym.downcase -> symbol
10429  * sym.downcase([options]) -> symbol
10430  *
10431  * Same as <code>sym.to_s.downcase.intern</code>.
10432  */
10433 
10434 static VALUE
10435 sym_downcase(int argc, VALUE *argv, VALUE sym)
10436 {
10437  return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
10438 }
10439 
10440 /*
10441  * call-seq:
10442  * sym.capitalize -> symbol
10443  * sym.capitalize([options]) -> symbol
10444  *
10445  * Same as <code>sym.to_s.capitalize.intern</code>.
10446  */
10447 
10448 static VALUE
10449 sym_capitalize(int argc, VALUE *argv, VALUE sym)
10450 {
10451  return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
10452 }
10453 
10454 /*
10455  * call-seq:
10456  * sym.swapcase -> symbol
10457  * sym.swapcase([options]) -> symbol
10458  *
10459  * Same as <code>sym.to_s.swapcase.intern</code>.
10460  */
10461 
10462 static VALUE
10463 sym_swapcase(int argc, VALUE *argv, VALUE sym)
10464 {
10465  return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
10466 }
10467 
10468 /*
10469  * call-seq:
10470  * sym.encoding -> encoding
10471  *
10472  * Returns the Encoding object that represents the encoding of _sym_.
10473  */
10474 
10475 static VALUE
10476 sym_encoding(VALUE sym)
10477 {
10478  return rb_obj_encoding(rb_sym2str(sym));
10479 }
10480 
10481 static VALUE
10482 string_for_symbol(VALUE name)
10483 {
10484  if (!RB_TYPE_P(name, T_STRING)) {
10485  VALUE tmp = rb_check_string_type(name);
10486  if (NIL_P(tmp)) {
10487  rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol",
10488  name);
10489  }
10490  name = tmp;
10491  }
10492  return name;
10493 }
10494 
10495 ID
10497 {
10498  if (SYMBOL_P(name)) {
10499  return SYM2ID(name);
10500  }
10501  name = string_for_symbol(name);
10502  return rb_intern_str(name);
10503 }
10504 
10505 VALUE
10507 {
10508  if (SYMBOL_P(name)) {
10509  return name;
10510  }
10511  name = string_for_symbol(name);
10512  return rb_str_intern(name);
10513 }
10514 
10515 /*
10516  * A <code>String</code> object holds and manipulates an arbitrary sequence of
10517  * bytes, typically representing characters. String objects may be created
10518  * using <code>String::new</code> or as literals.
10519  *
10520  * Because of aliasing issues, users of strings should be aware of the methods
10521  * that modify the contents of a <code>String</code> object. Typically,
10522  * methods with names ending in ``!'' modify their receiver, while those
10523  * without a ``!'' return a new <code>String</code>. However, there are
10524  * exceptions, such as <code>String#[]=</code>.
10525  *
10526  */
10527 
10528 void
10530 {
10531 #undef rb_intern
10532 #define rb_intern(str) rb_intern_const(str)
10533 
10534  rb_cString = rb_define_class("String", rb_cObject);
10536  st_foreach(rb_vm_fstring_table(), fstring_set_class_i, rb_cString);
10538  rb_define_alloc_func(rb_cString, empty_str_alloc);
10539  rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
10540  rb_define_method(rb_cString, "initialize", rb_str_init, -1);
10541  rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
10542  rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
10545  rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
10546  rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
10547  rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
10548  rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
10551  rb_define_method(rb_cString, "%", rb_str_format_m, 1);
10552  rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
10553  rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
10554  rb_define_method(rb_cString, "insert", rb_str_insert, 2);
10555  rb_define_method(rb_cString, "length", rb_str_length, 0);
10557  rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
10558  rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
10559  rb_define_method(rb_cString, "=~", rb_str_match, 1);
10560  rb_define_method(rb_cString, "match", rb_str_match_m, -1);
10561  rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
10563  rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
10565  rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
10566  rb_define_method(rb_cString, "upto", rb_str_upto, -1);
10567  rb_define_method(rb_cString, "index", rb_str_index_m, -1);
10568  rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
10569  rb_define_method(rb_cString, "replace", rb_str_replace, 1);
10570  rb_define_method(rb_cString, "clear", rb_str_clear, 0);
10571  rb_define_method(rb_cString, "chr", rb_str_chr, 0);
10572  rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
10573  rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
10574  rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
10575  rb_define_method(rb_cString, "scrub", str_scrub, -1);
10576  rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
10577  rb_define_method(rb_cString, "freeze", rb_str_freeze, 0);
10578  rb_define_method(rb_cString, "+@", str_uplus, 0);
10579  rb_define_method(rb_cString, "-@", str_uminus, 0);
10580 
10581  rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
10582  rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
10583  rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
10584  rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
10585  rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
10587 
10588  sym_ascii = ID2SYM(rb_intern("ascii"));
10589  sym_turkic = ID2SYM(rb_intern("turkic"));
10590  sym_lithuanian = ID2SYM(rb_intern("lithuanian"));
10591  sym_fold = ID2SYM(rb_intern("fold"));
10592 
10593  rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
10594  rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
10595  rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
10596  rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
10597 
10598  rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
10599  rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
10600  rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
10601  rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
10602 
10603  rb_define_method(rb_cString, "hex", rb_str_hex, 0);
10604  rb_define_method(rb_cString, "oct", rb_str_oct, 0);
10605  rb_define_method(rb_cString, "split", rb_str_split_m, -1);
10606  rb_define_method(rb_cString, "lines", rb_str_lines, -1);
10607  rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
10608  rb_define_method(rb_cString, "chars", rb_str_chars, 0);
10609  rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
10610  rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
10611  rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
10612  rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
10613  rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
10615  rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
10616  rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
10617  rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
10618  rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
10620 
10621  rb_define_method(rb_cString, "include?", rb_str_include, 1);
10622  rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
10623  rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
10624 
10625  rb_define_method(rb_cString, "scan", rb_str_scan, 1);
10626 
10627  rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
10628  rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
10629  rb_define_method(rb_cString, "center", rb_str_center, -1);
10630 
10631  rb_define_method(rb_cString, "sub", rb_str_sub, -1);
10632  rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
10633  rb_define_method(rb_cString, "chop", rb_str_chop, 0);
10634  rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
10635  rb_define_method(rb_cString, "strip", rb_str_strip, 0);
10636  rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
10637  rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
10638  rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
10639  rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
10640 
10641  rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
10642  rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
10643  rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
10644  rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
10645  rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
10646  rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
10647  rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
10648  rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
10649  rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
10650 
10651  rb_define_method(rb_cString, "tr", rb_str_tr, 2);
10652  rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
10653  rb_define_method(rb_cString, "delete", rb_str_delete, -1);
10654  rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
10655  rb_define_method(rb_cString, "count", rb_str_count, -1);
10656 
10657  rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
10658  rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
10659  rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
10660  rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
10661 
10662  rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
10663  rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
10664  rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
10665  rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
10666  rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
10667 
10668  rb_define_method(rb_cString, "sum", rb_str_sum, -1);
10669 
10670  rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
10671  rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
10672 
10673  rb_define_method(rb_cString, "partition", rb_str_partition, 1);
10674  rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
10675 
10676  rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
10677  rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
10678  rb_define_method(rb_cString, "b", rb_str_b, 0);
10679  rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
10680  rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
10681 
10682  /* define UnicodeNormalize module here so that we don't have to look it up */
10683  mUnicodeNormalize = rb_define_module("UnicodeNormalize");
10684  id_normalize = rb_intern("normalize");
10685  id_normalized_p = rb_intern("normalized?");
10686 
10687  rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
10688  rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
10689  rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
10690 
10691  rb_fs = Qnil;
10692  rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
10693  rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
10694  rb_gc_register_address(&rb_fs);
10695 
10696  rb_cSymbol = rb_define_class("Symbol", rb_cObject);
10700  rb_define_singleton_method(rb_cSymbol, "all_symbols", rb_sym_all_symbols, 0); /* in symbol.c */
10701 
10704  rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
10706  rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
10707  rb_define_method(rb_cSymbol, "intern", sym_to_sym, 0);
10708  rb_define_method(rb_cSymbol, "to_sym", sym_to_sym, 0);
10709  rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0);
10710  rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
10711  rb_define_method(rb_cSymbol, "next", sym_succ, 0);
10712 
10713  rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
10714  rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
10715  rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
10716  rb_define_method(rb_cSymbol, "=~", sym_match, 1);
10717 
10718  rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
10719  rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
10720  rb_define_method(rb_cSymbol, "length", sym_length, 0);
10721  rb_define_method(rb_cSymbol, "size", sym_length, 0);
10722  rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
10723  rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
10724  rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
10725 
10726  rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
10727  rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
10728  rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
10729  rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
10730 
10731  rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
10732 }
#define RBASIC_CLEAR_CLASS(obj)
Definition: internal.h:1469
int rb_enc_str_asciionly_p(VALUE str)
Definition: string.c:641
VALUE rb_utf8_str_new(const char *ptr, long len)
Definition: string.c:751
VALUE rb_str_resize(VALUE str, long len)
Definition: string.c:2644
#define CASEMAP_DEBUG
Definition: string.c:6120
#define ENCINDEX_US_ASCII
Definition: encindex.h:44
#define ISDIGIT(c)
Definition: ruby.h:2150
VALUE rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
Definition: string.c:1042
Definition: string.c:6519
int rb_enc_codelen(int c, rb_encoding *enc)
Definition: encoding.c:1077
int rb_enc_get_index(VALUE obj)
Definition: encoding.c:773
Definition: st.h:99
int rb_reg_backref_number(VALUE match, VALUE backref)
Definition: re.c:1138
#define BARE_STRING_P(str)
Definition: string.c:259
#define ONIGENC_CODE_TO_MBCLEN(enc, code)
Definition: onigmo.h:367
#define is_broken_string(str)
Definition: internal.h:1655
#define MBCLEN_CHARFOUND_P(ret)
Definition: encoding.h:185
void rb_warn(const char *fmt,...)
Definition: error.c:246
rb_encoding * rb_enc_check(VALUE str1, VALUE str2)
Definition: encoding.c:879
#define FL_EXIVAR
Definition: ruby.h:1215
VALUE rb_ary_pop(VALUE ary)
Definition: array.c:968
void rb_bug(const char *fmt,...)
Definition: error.c:521
rb_econv_result_t
Definition: encoding.h:291
#define MBCLEN_CHARFOUND_LEN(ret)
Definition: encoding.h:186
#define RESIZE_CAPA(str, capacity)
Definition: string.c:129
#define RARRAY_LEN(a)
Definition: ruby.h:1019
#define rb_enc_mbc_to_codepoint(p, e, enc)
Definition: encoding.h:202
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Definition: string.c:836
void rb_enc_copy(VALUE obj1, VALUE obj2)
Definition: encoding.c:978
#define FALSE
Definition: nkf.h:174
#define RSTRING(obj)
Definition: ruby.h:1201
#define rb_intern(str)
size_t strlen(const char *)
#define INT2NUM(x)
Definition: ruby.h:1538
VALUE rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
Definition: string.c:9645
#define CHECK_IF_ASCII(c)
void rb_backref_set(VALUE)
Definition: vm.c:1235
Definition: st.h:79
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Definition: encoding.h:109
RUBY_FUNC_EXPORTED VALUE rb_str_locktmp_ensure(VALUE str, VALUE(*func)(VALUE), VALUE arg)
Definition: string.c:2620
Definition: st.h:99
VALUE rb_str_equal(VALUE str1, VALUE str2)
Definition: string.c:3214
char * rb_str_to_cstr(VALUE str)
Definition: string.c:2216
VALUE rb_str_new_static(const char *ptr, long len)
Definition: string.c:830
#define NUM2INT(x)
Definition: ruby.h:684
VALUE rb_str_eql(VALUE str1, VALUE str2)
Definition: string.c:3234
VALUE rb_locale_str_new_cstr(const char *ptr)
Definition: string.c:1073
#define ENCINDEX_UTF_16LE
Definition: encindex.h:46
VALUE rb_sym_to_s(VALUE sym)
Definition: string.c:10183
#define ascii_isspace(c)
Definition: string.c:7336
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc)
Definition: string.c:1008
void rb_undef_alloc_func(VALUE)
Definition: vm_method.c:675
void rb_define_singleton_method(VALUE obj, const char *name, VALUE(*func)(ANYARGS), int argc)
Defines a singleton method for obj.
Definition: class.c:1716
VALUE rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
Definition: string.c:368
#define sym_equal
Definition: string.c:10070
int rb_block_given_p(void)
Determines if the current method is given a block.
Definition: eval.c:835
VALUE rb_str_new_frozen(VALUE orig)
Definition: string.c:1158
#define FL_SET_RAW(x, f)
Definition: ruby.h:1287
st_index_t rb_str_hash(VALUE str)
Definition: string.c:3094
#define FL_TAINT
Definition: ruby.h:1213
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
Definition: string.c:2853
#define CLASS_OF(v)
Definition: ruby.h:453
void rb_raise(VALUE exc, const char *fmt,...)
Definition: error.c:2284
VALUE rb_locale_str_new(const char *ptr, long len)
Definition: string.c:1067
#define st_foreach
Definition: regint.h:186
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Definition: class.c:1847
Definition: id.h:88
#define FIXNUM_MAX
Definition: ruby.h:228
#define Qtrue
Definition: ruby.h:437
VALUE rb_reg_check_preprocess(VALUE)
Definition: re.c:2672
void rb_str_set_len(VALUE str, long len)
Definition: string.c:2627
#define is_ascii_string(str)
Definition: internal.h:1654
unsigned char * USTR
Definition: string.c:6517
ONIG_EXTERN int onig_new(OnigRegex *, const OnigUChar *pattern, const OnigUChar *pattern_end, OnigOptionType option, OnigEncoding enc, const OnigSyntaxType *syntax, OnigErrorInfo *einfo)
#define ONIGERR_INVALID_CODE_POINT_VALUE
Definition: onigmo.h:689
#define ENC_CODERANGE_SET(obj, cr)
Definition: encoding.h:106
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Definition: string.c:848
#define rb_id2str(id)
Definition: vm_backtrace.c:29
Definition: id.h:91
Definition: st.h:99
char * pend
Definition: string.c:6522
#define OBJ_FREEZE(x)
Definition: ruby.h:1306
VALUE rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
Definition: string.c:381
void Init_String(void)
Definition: string.c:10529
#define ZALLOC_N(type, n)
Definition: ruby.h:1589
void rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
Definition: string.c:2162
rb_encoding * rb_to_encoding(VALUE enc)
Definition: encoding.c:246
int rb_enc_dummy_p(rb_encoding *enc)
Definition: encoding.c:132
#define ENC_CODERANGE_CLEAR(obj)
Definition: encoding.h:107
void rb_econv_close(rb_econv_t *ec)
Definition: transcode.c:1698
VALUE rb_eEncCompatError
Definition: error.c:808
#define ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, s, end)
Definition: onigmo.h:334
VALUE rb_str_escape(VALUE str)
Definition: string.c:5737
VALUE rb_enc_from_encoding(rb_encoding *encoding)
Definition: encoding.c:117
bool rb_reg_start_with_p(VALUE re, VALUE str)
Definition: re.c:1584
#define rb_check_arity
Definition: intern.h:298
#define UNREACHABLE
Definition: ruby.h:46
VALUE rb_reg_match(VALUE, VALUE)
Definition: re.c:3143
long rb_memsearch(const void *, long, const void *, long, rb_encoding *)
Definition: re.c:244
VALUE rb_check_convert_type_with_id(VALUE, int, const char *, ID)
Definition: object.c:3022
rb_encoding * rb_default_internal_encoding(void)
Definition: encoding.c:1510
VALUE rb_ary_push(VALUE ary, VALUE item)
Definition: array.c:924
#define ENCINDEX_ASCII
Definition: encindex.h:42
VALUE rb_reg_regsub(VALUE, VALUE, struct re_registers *, VALUE)
Definition: re.c:3739
#define SYM2ID(x)
Definition: ruby.h:384
RUBY_EXTERN char * crypt(const char *, const char *)
st_index_t rb_memhash(const void *ptr, long len)
Definition: random.c:1512
int rb_usascii_encindex(void)
Definition: encoding.c:1344
VALUE rb_str_split(VALUE str, const char *sep0)
Definition: string.c:7602
long capa
Definition: ruby.h:961
struct RBasic basic
Definition: ruby.h:955
rb_encoding * rb_enc_compatible(VALUE str1, VALUE str2)
Definition: encoding.c:962
VALUE rb_str_export_to_enc(VALUE str, rb_encoding *enc)
Definition: string.c:1103
#define ONIGENC_CTYPE_ALPHA
Definition: onigmo.h:295
void ruby_sized_xfree(void *x, size_t size)
Definition: gc.c:8077
#define ENCINDEX_UTF_32
Definition: encindex.h:50
VALUE rb_funcall(VALUE, ID, int,...)
Calls a method.
Definition: vm_eval.c:774
#define str_buf_cat2(str, ptr)
Definition: string.c:2743
#define ENCINDEX_UTF_16BE
Definition: encindex.h:45
VALUE rb_filesystem_str_new(const char *ptr, long len)
Definition: string.c:1079
VALUE rb_str_export(VALUE str)
Definition: string.c:1091
#define RGENGC_WB_PROTECTED_STRING
Definition: ruby.h:780
unsigned int OnigCaseFoldType
Definition: onigmo.h:95
#define RBASIC_SET_CLASS(obj, cls)
Definition: internal.h:1471
VALUE rb_backref_get(void)
Definition: vm.c:1229
ptrdiff_t OnigPosition
Definition: onigmo.h:83
VALUE rb_str_freeze(VALUE str)
Definition: string.c:2549
#define ENCODING_GET_INLINED(obj)
Definition: encoding.h:57
long rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
Definition: string.c:1700
#define STR_IS_SHARED_M
Definition: string.c:84
unsigned int rb_enc_codepoint_len(const char *p, const char *e, int *len_p, rb_encoding *enc)
Definition: encoding.c:1056
#define Check_Type(v, t)
Definition: ruby.h:562
long rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
Definition: string.c:1709
char * p
Definition: string.c:6522
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Definition: ruby.h:984
VALUE rb_enc_associate(VALUE obj, rb_encoding *enc)
Definition: encoding.c:854
VALUE rb_funcall_with_block(VALUE, ID, int, const VALUE *, VALUE)
Definition: vm_eval.c:833
VALUE rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
Definition: string.c:936
int rb_objspace_garbage_object_p(VALUE obj)
Definition: gc.c:3072
char * rb_string_value_ptr(volatile VALUE *ptr)
Definition: string.c:2118
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Definition: string.c:842
#define RB_GC_GUARD(v)
Definition: ruby.h:552
VALUE rb_str_concat_literals(size_t num, const VALUE *strary)
Definition: string.c:2909
void rb_define_alloc_func(VALUE, rb_alloc_func_t)
VALUE rb_obj_alloc(VALUE)
Allocates an instance of klass.
Definition: object.c:2121
int rb_enc_mbclen(const char *p, const char *e, rb_encoding *enc)
Definition: encoding.c:1008
#define DATA_PTR(dta)
Definition: ruby.h:1106
#define ENC_CODERANGE_MASK
Definition: encoding.h:98
void rb_include_module(VALUE klass, VALUE module)
Definition: class.c:864
#define FL_TEST_RAW(x, f)
Definition: ruby.h:1281
VALUE rb_hash_lookup(VALUE hash, VALUE key)
Definition: hash.c:853
VALUE rb_sym_proc_call(ID mid, int argc, const VALUE *argv, VALUE passed_proc)
Definition: string.c:10206
#define FL_UNSET(x, f)
Definition: ruby.h:1290
st_data_t st_index_t
Definition: st.h:50
VALUE rb_range_beg_len(VALUE, long *, long *, long, int)
Definition: range.c:1003
#define st_delete
Definition: regint.h:182
#define DEFAULT_REPLACE_CHAR(str)
void rb_gc_register_address(VALUE *addr)
Definition: gc.c:6241
RUBY_FUNC_EXPORTED size_t rb_str_memsize(VALUE str)
Definition: string.c:1338
int st_update(st_table *table, st_data_t key, st_update_callback_func *func, st_data_t arg)
Definition: st.c:1393
VALUE rb_str_new(const char *ptr, long len)
Definition: string.c:737
#define rb_enc_mbmaxlen(enc)
Definition: encoding.h:175
ID rb_id_encoding(void)
Definition: encoding.c:753
unsigned int last
Definition: nkf.c:4311
#define STR_SET_NOEMBED(str)
Definition: string.c:89
VALUE rb_ensure(VALUE(*b_proc)(ANYARGS), VALUE data1, VALUE(*e_proc)(ANYARGS), VALUE data2)
An equivalent to ensure clause.
Definition: eval.c:1035
void rb_gc_force_recycle(VALUE obj)
Definition: gc.c:6175
#define FIXNUM_P(f)
Definition: ruby.h:365
rb_encoding * rb_utf8_encoding(void)
Definition: encoding.c:1320
VALUE rb_str_export_locale(VALUE str)
Definition: string.c:1097
#define BEG(no)
Definition: string.c:23
#define FL_TEST(x, f)
Definition: ruby.h:1282
VALUE rb_str_new_shared(VALUE str)
Definition: string.c:1149
void rb_undef_method(VALUE klass, const char *name)
Definition: class.c:1533
#define CHAR_ESC_LEN
Definition: string.c:5701
#define ENCINDEX_UTF_8
Definition: encindex.h:43
VALUE rb_cString
Definition: string.c:66
#define ONIGENC_CASE_MODIFIED
Definition: onigmo.h:119
#define ENC_CODERANGE_7BIT
Definition: encoding.h:100
const char * rb_obj_classname(VALUE)
Definition: variable.c:459
int rb_enc_symname_p(const char *, rb_encoding *)
Definition: symbol.c:196
VALUE rb_enc_sprintf(rb_encoding *enc, const char *format,...)
Definition: sprintf.c:1433
#define rb_ary_new2
Definition: intern.h:90
RUBY_EXTERN void * memmove(void *, const void *, size_t)
Definition: memmove.c:7
#define ONIGENC_CASE_FOLD_LITHUANIAN
Definition: onigmo.h:124
#define ONIGENC_CODE_TO_MBC_MAXLEN
Definition: onigmo.h:289
VALUE rb_eArgError
Definition: error.c:802
#define sym(x)
Definition: date_core.c:3721
VALUE rb_str_append(VALUE str, VALUE str2)
Definition: string.c:2900
char * crypt_r(const char *key, const char *setting, struct crypt_data *data)
Definition: crypt.c:396
VALUE rb_str_buf_cat(VALUE, const char *, long)
RUBY_SYMBOL_EXPORT_BEGIN typedef unsigned long st_data_t
Definition: st.h:22
VALUE rb_fstring_enc_cstr(const char *ptr, rb_encoding *enc)
Definition: string.c:394
#define NEWOBJ_OF(obj, type, klass, flags)
Definition: ruby.h:754
union RString::@67 as
#define ISALPHA(c)
Definition: ruby.h:2149
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Definition: string.c:3104
#define OBJ_TAINTED_RAW(x)
Definition: ruby.h:1295
#define STR_EMBEDDABLE_P(len, termlen)
Definition: string.c:175
#define rb_enc_isctype(c, t, enc)
Definition: encoding.h:223
#define RBASIC_SET_CLASS_RAW(obj, cls)
Definition: internal.h:1470
VALUE rb_obj_class(VALUE)
call-seq: obj.class -> class
Definition: object.c:277
#define RB_TYPE_P(obj, type)
Definition: ruby.h:527
#define RB_DEBUG_COUNTER_INC_IF(type, cond)
int rb_enc_str_coderange(VALUE str)
Definition: string.c:621
#define MEMZERO(p, type, n)
Definition: ruby.h:1660
Definition: ruby.h:954
VALUE rb_str_plus(VALUE str1, VALUE str2)
Definition: string.c:1854
VALUE rb_require(const char *)
Definition: load.c:1061
rb_encoding * rb_default_external_encoding(void)
Definition: encoding.c:1425
#define UNLIKELY(x)
Definition: internal.h:43
int rb_enc_to_index(rb_encoding *enc)
Definition: encoding.c:126
VALUE rb_eRangeError
Definition: error.c:805
#define STR_NOFREE
Definition: string.c:86
VALUE rb_mComparable
Definition: compar.c:15
neighbor_char
Definition: string.c:3819
#define WANTARRAY(m, size)
Definition: string.c:7625
#define rb_intern_str(string)
Definition: generator.h:16
VALUE rb_equal(VALUE, VALUE)
call-seq: obj === other -> true or false
Definition: object.c:126
unsigned int now
Definition: string.c:6521
#define ALLOC_N(type, n)
Definition: ruby.h:1587
VALUE rb_hash_aset(VALUE hash, VALUE key, VALUE val)
Definition: hash.c:1616
VALUE rb_convert_type_with_id(VALUE, int, const char *, ID)
Definition: object.c:2979
#define val
int rb_enc_fast_mbclen(const char *p, const char *e, rb_encoding *enc)
Definition: encoding.c:1002
RUBY_EXTERN VALUE rb_cObject
Definition: ruby.h:1893
#define TERM_LEN(str)
Definition: string.c:120
#define rb_enc_isascii(c, enc)
Definition: encoding.h:224
VALUE rb_str_to_inum(VALUE str, int base, int badcheck)
Definition: bignum.c:4226
#define MBCLEN_NEEDMORE_P(ret)
Definition: encoding.h:188
VALUE rb_str_length(VALUE str)
Definition: string.c:1803
#define RSTRING_END(str)
Definition: ruby.h:979
VALUE rb_str_cat_cstr(VALUE str, const char *ptr)
Definition: string.c:2756
VALUE rb_str_cat2(VALUE, const char *)
#define FL_SET(x, f)
Definition: ruby.h:1288
int rb_str_symname_p(VALUE sym)
Definition: string.c:10089
VALUE rb_ary_new(void)
Definition: array.c:499
VALUE rb_str_new_cstr(const char *ptr)
Definition: string.c:771
VALUE rb_str_buf_cat2(VALUE, const char *)
#define dp(v)
Definition: vm_debug.h:21
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Definition: transcode.c:2575
int rb_ascii8bit_encindex(void)
Definition: encoding.c:1314
#define UINT2NUM(x)
Definition: ruby.h:1539
#define STR_BUF_MIN_SIZE
Definition: string.c:1279
#define STR_SET_EMBED(str)
Definition: string.c:93
VALUE rb_any_to_s(VALUE)
call-seq: obj.to_s -> string
Definition: object.c:631
VALUE rb_eIndexError
Definition: error.c:803
#define snprintf
Definition: subst.h:6
#define NIL_P(v)
Definition: ruby.h:451
#define ISASCII(c)
Definition: ruby.h:2142
OnigUChar space[1]
Definition: string.c:6128
#define OBJ_FROZEN_RAW(x)
Definition: ruby.h:1303
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition: class.c:646
#define rb_enc_step_back(s, p, e, n, enc)
Definition: encoding.h:218
#define ENC_CODERANGE_CLEAN_P(cr)
Definition: encoding.h:103
void rb_enc_set_index(VALUE obj, int idx)
Definition: encoding.c:818
VALUE rb_str_concat(VALUE str1, VALUE str2)
Definition: string.c:2999
register int hval
Definition: zonetab.h:82
#define TOUPPER(c)
Definition: ruby.h:2153
#define offsetof(p_type, field)
Definition: addrinfo.h:186
#define ENUM_ELEM(ary, e)
Definition: string.c:7641
#define END(no)
Definition: string.c:24
st_table * rb_vm_fstring_table(void)
Definition: vm.c:3211
RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen,(str))
Definition: string.c:2596
#define STR_ENC_GET(str)
Definition: string.c:164
#define TYPE(x)
Definition: ruby.h:521
int argc
Definition: ruby.c:187
VALUE rb_str_scrub(VALUE str, VALUE repl)
Definition: string.c:9638
unsigned char OnigUChar
Definition: onigmo.h:79
char ary[RSTRING_EMBED_LEN_MAX+1]
Definition: ruby.h:965
#define Qfalse
Definition: ruby.h:436
long rb_str_offset(VALUE str, long pos)
Definition: string.c:2348
#define ENCINDEX_UTF_16
Definition: encindex.h:49
#define STR_SET_EMBED_LEN(str, n)
Definition: string.c:94
#define ALLOCA_N(type, n)
Definition: ruby.h:1593
#define range(low, item, hi)
Definition: date_strftime.c:21
#define ENC_CODERANGE_UNKNOWN
Definition: encoding.h:99
#define LONG_MAX
Definition: ruby.h:189
#define rb_enc_isprint(c, enc)
Definition: encoding.h:230
#define RUBY_FUNC_EXPORTED
Definition: defines.h:263
#define MEMCPY(p1, p2, type, n)
Definition: ruby.h:1661
#define ENC_CODERANGE_BROKEN
Definition: encoding.h:102
VALUE rb_enc_associate_index(VALUE obj, int idx)
Definition: encoding.c:826
#define rb_str_index(str, sub, offset)
Definition: string.c:3416
#define OBJ_FREEZE_RAW(x)
Definition: ruby.h:1305
int err
Definition: win32.c:135
#define rb_enc_codepoint(p, e, enc)
Definition: encoding.h:201
void rb_str_update(VALUE str, long beg, long len, VALUE val)
Definition: string.c:4545
#define rb_enc_mbminlen(enc)
Definition: encoding.h:174
unsigned int max
Definition: string.c:6521
#define STR_SHARED_P(s)
Definition: internal.h:1653
VALUE rb_utf8_str_new_cstr(const char *ptr)
Definition: string.c:786
#define ENC_CODERANGE_VALID
Definition: encoding.h:101
#define numberof(array)
Definition: etc.c:618
#define ALLOC(type)
Definition: ruby.h:1588
#define RUBY_DTRACE_CREATE_HOOK(name, arg)
Definition: internal.h:1932
long rb_str_sublen(VALUE str, long pos)
Definition: string.c:2395
RUBY_FUNC_EXPORTED VALUE rb_fstring(VALUE str)
Definition: string.c:306
VALUE rb_str_times(VALUE str, VALUE times)
Definition: string.c:1896
VALUE rb_str_tmp_frozen_acquire(VALUE orig)
Definition: string.c:1170
void rb_sys_fail(const char *mesg)
Definition: error.c:2403
struct mapping_buffer mapping_buffer
int rb_enc_ascget(const char *p, const char *e, int *len, rb_encoding *enc)
Definition: encoding.c:1032
void rb_str_modify_expand(VALUE str, long expand)
Definition: string.c:2054
#define sub(x, y)
Definition: date_strftime.c:24
#define RSTRING_LEN(str)
Definition: ruby.h:971
VALUE rb_yield(VALUE)
Definition: vm_eval.c:973
VALUE rb_obj_as_string(VALUE obj)
Definition: string.c:1410
#define RARRAY_CONST_PTR(a)
Definition: ruby.h:1021
VALUE rb_str_subseq(VALUE str, long beg, long len)
Definition: string.c:2406
#define REALLOC_N(var, type, n)
Definition: ruby.h:1591
char * rb_string_value_cstr(volatile VALUE *ptr)
Definition: string.c:2223
#define RUBY_MAX_CHAR_LEN
Definition: string.c:83
int errno
long rb_reg_search0(VALUE, VALUE, long, int, int)
Definition: re.c:1489
#define TRUE
Definition: nkf.h:175
VALUE rb_obj_freeze(VALUE)
call-seq: obj.freeze -> obj
Definition: object.c:1331
int(* case_map)(OnigCaseFoldType *flagP, const OnigUChar **pp, const OnigUChar *end, OnigUChar *to, OnigUChar *to_end, const struct OnigEncodingTypeST *enc)
Definition: onigmo.h:177
VALUE rb_str_format(int, const VALUE *, VALUE)
Definition: sprintf.c:464
int rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc)
Definition: encoding.c:1020
int rb_enc_unicode_p(rb_encoding *enc)
Definition: encoding.c:525
#define STR_TMPLOCK
Definition: string.c:85
#define rb_enc_name(enc)
Definition: encoding.h:171
#define rb_strlen_lit(str)
Definition: intern.h:845
#define ONIGENC_CASE_UPCASE
Definition: onigmo.h:113
VALUE rb_hash_new(void)
Definition: hash.c:424
void ruby_xfree(void *x)
Definition: gc.c:8085
#define ENCODING_MASK
Definition: encoding.h:38
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Definition: class.c:1908
long rb_reg_search(VALUE, VALUE, long, int)
Definition: re.c:1578
VALUE rb_check_hash_type(VALUE hash)
Definition: hash.c:722
#define ONIGENC_CASE_FOLD
Definition: onigmo.h:120
int rb_str_cmp(VALUE str1, VALUE str2)
Definition: string.c:3159
unsigned char buf[MIME_BUF_SIZE]
Definition: nkf.c:4309
#define PRIsVALUE
Definition: ruby.h:135
size_t capa
Definition: string.c:6125
unsigned long ID
Definition: ruby.h:86
VALUE rb_str_buf_new_cstr(const char *ptr)
Definition: string.c:1298
rb_encoding * rb_usascii_encoding(void)
Definition: encoding.c:1335
#define FL_ABLE(x)
Definition: ruby.h:1280
#define Qnil
Definition: ruby.h:438
unsigned int uintptr_t
Definition: win32.h:106
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition: eval.c:615
VALUE rb_sym_to_proc(VALUE sym)
Definition: proc.c:1198
const char * name
Definition: onigmo.h:162
#define BUILTIN_TYPE(x)
Definition: ruby.h:518
size_t used
Definition: string.c:6126
unsigned long VALUE
Definition: ruby.h:85
VALUE shared
Definition: ruby.h:962
VALUE rb_cSymbol
Definition: string.c:67
rb_encoding * rb_locale_encoding(void)
Definition: encoding.c:1370
VALUE rb_str_replace(VALUE str, VALUE str2)
Definition: string.c:5246
#define rb_enc_is_newline(p, end, enc)
Definition: encoding.h:221
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
Definition: intern.h:234
#define ONIGENC_CASE_DOWNCASE
Definition: onigmo.h:114
#define RBASIC(obj)
Definition: ruby.h:1197
#define STR_FAKESTR
Definition: string.c:87
#define ENC_CODERANGE_AND(a, b)
Definition: encoding.h:108
VALUE rb_eTypeError
Definition: error.c:801
int rb_utf8_encindex(void)
Definition: encoding.c:1329
void rb_str_shared_replace(VALUE str, VALUE str2)
Definition: string.c:1358
#define ENCODING_SET_INLINED(obj, i)
Definition: encoding.h:55
VALUE rb_obj_encoding(VALUE obj)
Definition: encoding.c:992
#define rb_ary_new3
Definition: intern.h:91
#define TERM_FILL(ptr, termlen)
Definition: string.c:121
#define ONIGENC_CASE_ASCII_ONLY
Definition: onigmo.h:125
#define ONIGENC_CASE_TITLECASE
Definition: onigmo.h:115
#define rb_enc_asciicompat(enc)
Definition: encoding.h:239
VALUE flags
Definition: ruby.h:855
VALUE rb_str_buf_cat_ascii(VALUE str, const char *ptr)
Definition: string.c:2860
int memcmp(const void *s1, const void *s2, size_t len)
Definition: memcmp.c:7
VALUE rb_str_quote_unprintable(VALUE str)
Definition: string.c:10108
long rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
Definition: string.c:531
#define RARRAY_LENINT(ary)
Definition: ruby.h:1020
RUBY_EXTERN VALUE rb_rs
Definition: intern.h:516
void rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
Definition: string.c:1183
#define ENCODING_IS_ASCII8BIT(obj)
Definition: encoding.h:59
#define ONIG_OPTION_DEFAULT
Definition: onigmo.h:447
VALUE rb_cEncodingConverter
Definition: transcode.c:23
#define CHAR_BIT
Definition: ruby.h:196
VALUE rb_str_to_str(VALUE str)
Definition: string.c:1349
VALUE rb_str_chomp_string(VALUE str, VALUE rs)
Definition: string.c:8342
void rb_define_hooked_variable(const char *, VALUE *, VALUE(*)(ANYARGS), void(*)(ANYARGS))
Definition: variable.c:617
VALUE rb_string_value(volatile VALUE *ptr)
Definition: string.c:2107
VALUE rb_tainted_str_new_cstr(const char *ptr)
Definition: string.c:872
#define rb_funcallv
Definition: console.c:21
#define LONG2NUM(x)
Definition: ruby.h:1573
int rb_respond_to(VALUE, ID)
Definition: vm_method.c:1994
#define RUBY_ASSERT(expr)
Definition: ruby_assert.h:33
register unsigned int len
Definition: zonetab.h:51
#define StringValueCStr(v)
Definition: ruby.h:571
VALUE rb_usascii_str_new(const char *ptr, long len)
Definition: string.c:743
VALUE rb_str_buf_append(VALUE str, VALUE str2)
Definition: string.c:2884
#define RMATCH_REGS(obj)
Definition: re.h:52
RUBY_EXTERN VALUE rb_default_rs
Definition: intern.h:517
void rb_str_free(VALUE str)
Definition: string.c:1316
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Definition: string.c:1085
struct mapping_buffer * next
Definition: string.c:6127
#define RSTRING_PTR(str)
Definition: ruby.h:975
#define rb_enc_right_char_head(s, p, e, enc)
Definition: encoding.h:217
#define RB_OBJ_WRITE(a, slot, b)
Definition: eval_intern.h:175
#define ENCODING_GET(obj)
Definition: encoding.h:58
rb_encoding * rb_enc_get(VALUE obj)
Definition: encoding.c:860
#define STR_SET_SHARED(str, shared_str)
Definition: string.c:152
#define STR_HEAP_PTR(str)
Definition: string.c:161
int size
Definition: encoding.c:57
char * rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
Definition: string.c:2320
#define f
#define INT2FIX(i)
Definition: ruby.h:232
#define UNLIMITED_ARGUMENTS
Definition: intern.h:44
char * rb_str_subpos(VALUE str, long beg, long *lenp)
Definition: string.c:2430
VALUE rb_str_unlocktmp(VALUE str)
Definition: string.c:2610
VALUE rb_tainted_str_new(const char *ptr, long len)
Definition: string.c:854
#define CASE_MAPPING_ADDITIONAL_LENGTH
Definition: string.c:6118
#define MBCLEN_INVALID_P(ret)
Definition: encoding.h:187
#define RARRAY_AREF(a, i)
Definition: ruby.h:1033
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Definition: transcode.c:1442
double rb_str_to_dbl(VALUE, int)
Parses a string representation of a floating point number.
Definition: object.c:3298
#define STR_SET_LEN(str, n)
Definition: string.c:100
#define xmalloc
Definition: defines.h:183
#define RBASIC_CLASS(obj)
Definition: ruby.h:878
#define ONIGENC_MBCLEN_CHARFOUND_LEN(r)
Definition: onigmo.h:347
#define RESIZE_CAPA_TERM(str, capacity, termlen)
Definition: string.c:133
VALUE rb_eRuntimeError
Definition: error.c:800
int num_regs
Definition: onigmo.h:716
#define lesser(a, b)
Definition: string.c:3131
VALUE rb_check_array_type(VALUE ary)
Definition: array.c:651
VALUE rb_hash_aref(VALUE hash, VALUE key)
Definition: hash.c:831
#define UNALIGNED_WORD_ACCESS
Definition: defines.h:354
#define ONIGERR_TOO_BIG_WIDE_CHAR_VALUE
Definition: onigmo.h:691
#define FL_WB_PROTECTED
Definition: ruby.h:1209
#define ENC_CODERANGE(obj)
Definition: encoding.h:104
VALUE rb_to_symbol(VALUE name)
Definition: string.c:10506
VALUE rb_str_cat(VALUE str, const char *ptr, long len)
Definition: string.c:2746
long rb_str_strlen(VALUE str)
Definition: string.c:1789
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Definition: string.c:885
#define LONG2FIX(i)
Definition: ruby.h:234
#define RTEST(v)
Definition: ruby.h:450
int rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
Definition: string.c:5704
void rb_warning(const char *fmt,...)
Definition: error.c:267
#define T_STRING
Definition: ruby.h:496
#define ONIGENC_CASE_FOLD_TURKISH_AZERI
Definition: onigmo.h:122
VALUE rb_str_locktmp(VALUE)
#define PRIuSIZE
Definition: ruby.h:177
const struct st_hash_type rb_fstring_hash_type
Definition: string.c:254
VALUE rb_str_drop_bytes(VALUE str, long len)
Definition: string.c:4473
size_t rb_str_capacity(VALUE str)
Definition: string.c:675
VALUE rb_fstring_new(const char *ptr, long len)
Definition: string.c:374
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Definition: string.c:794
#define OBJ_INFECT(x, s)
Definition: ruby.h:1302
rb_encoding * rb_filesystem_encoding(void)
Definition: encoding.c:1385
rb_encoding * rb_enc_get_from_index(int index)
Definition: encoding.c:628
void rb_str_setter(VALUE val, ID id, VALUE *var)
Definition: string.c:9469
#define OBJ_FROZEN(x)
Definition: ruby.h:1304
#define STR_SHARED
Definition: internal.h:1651
VALUE rb_str_tmp_new(long len)
Definition: string.c:1310
VALUE rb_obj_as_string_result(VALUE str, VALUE obj)
Definition: string.c:1422
VALUE rb_fs
Definition: string.c:436
#define ISPRINT(c)
Definition: ruby.h:2143
#define rb_enc_left_char_head(s, p, e, enc)
Definition: encoding.h:216
#define assert
Definition: ruby_assert.h:37
VALUE rb_reg_match_p(VALUE re, VALUE str, long pos)
Definition: re.c:3303
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Definition: string.c:1002
void rb_backref_set_string(VALUE string, long pos, long len)
Definition: re.c:1300
#define rb_str_splice(str, beg, len, val)
Definition: string.c:4589
#define SHARABLE_SUBSTRING_P(beg, len, end)
Definition: string.c:170
#define FL_UNSET_RAW(x, f)
Definition: ruby.h:1289
#define RETURN_ENUMERATOR(obj, argc, argv)
Definition: intern.h:238
VALUE rb_fstring_cstr(const char *ptr)
Definition: string.c:388
#define RB_DEBUG_COUNTER_INC(type)
VALUE rb_str_substr(VALUE str, long beg, long len)
Definition: string.c:2517
ONIG_EXTERN const OnigSyntaxType * OnigDefaultSyntax
Definition: onigmo.h:515
void rb_must_asciicompat(VALUE str)
Definition: string.c:2098
const char * name
Definition: nkf.c:208
#define ID2SYM(x)
Definition: ruby.h:383
VALUE rb_sym_all_symbols(void)
Definition: symbol.c:806
int gen
Definition: string.c:6520
VALUE rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len, rb_encoding *from, int ecflags, VALUE ecopts)
Definition: string.c:915
#define FL_FREEZE
Definition: ruby.h:1216
VALUE rb_external_str_new(const char *ptr, long len)
Definition: string.c:1055
ONIG_EXTERN OnigPosition onig_match(OnigRegex, const OnigUChar *str, const OnigUChar *end, const OnigUChar *at, OnigRegion *region, OnigOptionType option)
Definition: id.h:99
VALUE rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
Definition: string.c:4298
VALUE rb_str_succ(VALUE orig)
Definition: string.c:4030
rb_encoding * rb_ascii8bit_encoding(void)
Definition: encoding.c:1305
#define MIN_PRE_ALLOC_SIZE
Definition: string.c:2906
#define RSTRING_LENINT(str)
Definition: ruby.h:983
ONIG_EXTERN int onigenc_ascii_only_case_map(OnigCaseFoldType *flagP, const OnigUChar **pp, const OnigUChar *end, OnigUChar *to, OnigUChar *to_end, const struct OnigEncodingTypeST *enc)
Definition: regenc.c:955
rb_encoding * rb_enc_check_str(VALUE str1, VALUE str2)
Definition: encoding.c:868
#define rb_check_frozen(obj)
Definition: intern.h:271
#define CONST_ID(var, str)
Definition: ruby.h:1743
VALUE rb_str_intern(VALUE)
Definition: symbol.c:661
VALUE rb_str_inspect(VALUE str)
Definition: string.c:5813
#define rb_intern_const(str)
Definition: ruby.h:1777
#define RSTRING_EMBED_LEN(str)
Definition: ruby.h:968
#define SPECIAL_CONST_P(x)
Definition: ruby.h:1242
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Definition: transcode.c:2884
void void xfree(void *)
VALUE rb_str_buf_new(long capa)
Definition: string.c:1282
int rb_num_to_uint(VALUE val, unsigned int *ret)
Definition: numeric.c:242
VALUE rb_define_module(const char *name)
Definition: class.c:768
int rb_str_comparable(VALUE str1, VALUE str2)
Definition: string.c:3134
#define rb_enc_mbcput(c, buf, enc)
Definition: encoding.h:211
#define SYMBOL_P(x)
Definition: ruby.h:382
#define mod(x, y)
Definition: date_strftime.c:28
VALUE rb_str_ord(VALUE s)
Definition: string.c:8909
#define RB_INTEGER_TYPE_P(obj)
Definition: ruby_missing.h:15
#define rb_str_dup_frozen
#define ONIGENC_CTYPE_DIGIT
Definition: onigmo.h:298
#define OBJ_INFECT_RAW(x, s)
Definition: ruby.h:1301
#define NULL
Definition: _sdbm.c:102
#define FIX2LONG(x)
Definition: ruby.h:363
#define Qundef
Definition: ruby.h:439
VALUE rb_invcmp(VALUE x, VALUE y)
Definition: compar.c:46
VALUE rb_str_resurrect(VALUE str)
Definition: string.c:1494
VALUE rb_check_string_type(VALUE str)
Definition: string.c:2246
VALUE rb_usascii_str_new_cstr(const char *ptr)
Definition: string.c:778
#define OBJ_TAINT(x)
Definition: ruby.h:1298
VALUE rb_id_quote_unprintable(ID id)
Definition: string.c:10129
VALUE rb_reg_regcomp(VALUE)
Definition: re.c:2936
#define ST2FIX(h)
Definition: ruby_missing.h:21
void rb_define_method(VALUE klass, const char *name, VALUE(*func)(ANYARGS), int argc)
Definition: class.c:1515
#define ENCINDEX_UTF_32LE
Definition: encindex.h:48
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
Definition: string.c:9575
#define bp()
Definition: vm_debug.h:25
#define ENCINDEX_UTF_32BE
Definition: encindex.h:47
#define rb_enc_prev_char(s, p, e, enc)
Definition: encoding.h:214
#define T_REGEXP
Definition: ruby.h:497
#define STR_HEAP_SIZE(str)
Definition: string.c:162
#define IS_EVSTR(p, e)
Definition: string.c:5907
VALUE rb_str_dump(VALUE str)
Definition: string.c:5920
#define NUM2LONG(x)
Definition: ruby.h:648
#define STR_NOEMBED
Definition: internal.h:1650
#define TR_TABLE_SIZE
Definition: string.c:6885
VALUE rb_reg_nth_match(int, VALUE)
Definition: re.c:1679
#define rb_enc_code_to_mbclen(c, enc)
Definition: encoding.h:208
void rb_str_modify(VALUE str)
Definition: string.c:2046
#define STR_EMBED_P(str)
Definition: internal.h:1652
#define ONIGENC_MBCLEN_CHARFOUND_P(r)
Definition: onigmo.h:346
char ** argv
Definition: ruby.c:188
ID rb_to_id(VALUE name)
Definition: string.c:10496
char * ptr
Definition: ruby.h:959
#define DBL2NUM(dbl)
Definition: ruby.h:934
#define StringValue(v)
Definition: ruby.h:569
VALUE rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
Definition: string.c:759
VALUE rb_external_str_new_cstr(const char *ptr)
Definition: string.c:1061
rb_encoding * rb_enc_from_index(int index)
Definition: encoding.c:616
#define rb_sym2str(sym)
Definition: console.c:107
#define LIKELY(x)
Definition: internal.h:42
VALUE rb_str_dup(VALUE str)
Definition: string.c:1488
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Definition: string.c:1265
char * rb_str_fill_terminator(VALUE str, const int newminlen)
Definition: string.c:2238