Ruby 2.5.0dev (2017-10-22 revision 60238)
escape.c
Go to the documentation of this file.
1 #include "ruby.h"
2 #include "ruby/encoding.h"
3 
4 RUBY_EXTERN unsigned long ruby_scan_digits(const char *str, ssize_t len, int base, size_t *retlen, int *overflow);
7 #define lower_hexdigits (ruby_hexdigits+0)
8 #define upper_hexdigits (ruby_hexdigits+16)
9 #define char_to_number(c) ruby_digit36_to_number_table[(unsigned char)(c)]
10 
11 static VALUE rb_cCGI, rb_mUtil, rb_mEscape;
12 static ID id_accept_charset;
13 
14 static void
15 html_escaped_cat(VALUE str, char c)
16 {
17  switch (c) {
18  case '\'':
19  rb_str_cat_cstr(str, "'");
20  break;
21  case '&':
22  rb_str_cat_cstr(str, "&");
23  break;
24  case '"':
25  rb_str_cat_cstr(str, """);
26  break;
27  case '<':
28  rb_str_cat_cstr(str, "&lt;");
29  break;
30  case '>':
31  rb_str_cat_cstr(str, "&gt;");
32  break;
33  }
34 }
35 
36 static inline void
37 preserve_original_state(VALUE orig, VALUE dest)
38 {
39  rb_enc_associate(dest, rb_enc_get(orig));
40 
41  RB_OBJ_INFECT_RAW(dest, orig);
42 }
43 
44 static VALUE
45 optimized_escape_html(VALUE str)
46 {
47  long i, len, beg = 0;
48  VALUE dest = 0;
49  const char *cstr;
50 
51  len = RSTRING_LEN(str);
52  cstr = RSTRING_PTR(str);
53 
54  for (i = 0; i < len; i++) {
55  switch (cstr[i]) {
56  case '\'':
57  case '&':
58  case '"':
59  case '<':
60  case '>':
61  if (!dest) {
62  dest = rb_str_buf_new(len);
63  }
64 
65  rb_str_cat(dest, cstr + beg, i - beg);
66  beg = i + 1;
67 
68  html_escaped_cat(dest, cstr[i]);
69  break;
70  }
71  }
72 
73  if (dest) {
74  rb_str_cat(dest, cstr + beg, len - beg);
75  preserve_original_state(str, dest);
76  return dest;
77  }
78  else {
79  return rb_str_dup(str);
80  }
81 }
82 
83 static VALUE
84 optimized_unescape_html(VALUE str)
85 {
86  enum {UNICODE_MAX = 0x10ffff};
87  rb_encoding *enc = rb_enc_get(str);
88  unsigned long charlimit = (strcasecmp(rb_enc_name(enc), "UTF-8") == 0 ? UNICODE_MAX :
89  strcasecmp(rb_enc_name(enc), "ISO-8859-1") == 0 ? 256 :
90  128);
91  long i, len, beg = 0;
92  size_t clen, plen;
93  int overflow;
94  const char *cstr;
95  char buf[6];
96  VALUE dest = 0;
97 
98  len = RSTRING_LEN(str);
99  cstr = RSTRING_PTR(str);
100 
101  for (i = 0; i < len; i++) {
102  unsigned long cc;
103  char c = cstr[i];
104  if (c != '&') continue;
105  plen = i - beg;
106  if (++i >= len) break;
107  c = (unsigned char)cstr[i];
108 #define MATCH(s) (len - i >= (int)rb_strlen_lit(s) && \
109  memcmp(&cstr[i], s, rb_strlen_lit(s)) == 0 && \
110  (i += rb_strlen_lit(s) - 1, 1))
111  switch (c) {
112  case 'a':
113  ++i;
114  if (MATCH("pos;")) {
115  c = '\'';
116  }
117  else if (MATCH("mp;")) {
118  c = '&';
119  }
120  else continue;
121  break;
122  case 'q':
123  ++i;
124  if (MATCH("uot;")) {
125  c = '"';
126  }
127  else continue;
128  break;
129  case 'g':
130  ++i;
131  if (MATCH("t;")) {
132  c = '>';
133  }
134  else continue;
135  break;
136  case 'l':
137  ++i;
138  if (MATCH("t;")) {
139  c = '<';
140  }
141  else continue;
142  break;
143  case '#':
144  if (len - ++i >= 2 && ISDIGIT(cstr[i])) {
145  cc = ruby_scan_digits(&cstr[i], len-i, 10, &clen, &overflow);
146  }
147  else if ((cstr[i] == 'x' || cstr[i] == 'X') && len - ++i >= 2 && ISXDIGIT(cstr[i])) {
148  cc = ruby_scan_digits(&cstr[i], len-i, 16, &clen, &overflow);
149  }
150  else continue;
151  i += clen;
152  if (overflow || cc >= charlimit || cstr[i] != ';') continue;
153  if (!dest) {
154  dest = rb_str_buf_new(len);
155  }
156  rb_str_cat(dest, cstr + beg, plen);
157  if (charlimit > 256) {
158  rb_str_cat(dest, buf, rb_enc_mbcput((OnigCodePoint)cc, buf, enc));
159  }
160  else {
161  c = (unsigned char)cc;
162  rb_str_cat(dest, &c, 1);
163  }
164  beg = i + 1;
165  continue;
166  default:
167  --i;
168  continue;
169  }
170  if (!dest) {
171  dest = rb_str_buf_new(len);
172  }
173  rb_str_cat(dest, cstr + beg, plen);
174  rb_str_cat(dest, &c, 1);
175  beg = i + 1;
176  }
177 
178  if (dest) {
179  rb_str_cat(dest, cstr + beg, len - beg);
180  preserve_original_state(str, dest);
181  return dest;
182  }
183  else {
184  return rb_str_dup(str);
185  }
186 }
187 
/*
 * Returns 1 when +c+ may appear unescaped in URL-escaped output:
 * an ASCII digit, an ASCII letter, or one of '-', '.', '_', '~'.
 * Returns 0 for every other byte value.
 */
static unsigned char
url_unreserved_char(unsigned char c)
{
    if (c >= '0' && c <= '9') return 1;
    if (c >= 'a' && c <= 'z') return 1;
    if (c >= 'A' && c <= 'Z') return 1;

    return (c == '-' || c == '.' || c == '_' || c == '~') ? 1 : 0;
}
206 
207 static VALUE
208 optimized_escape(VALUE str)
209 {
210  long i, len, beg = 0;
211  VALUE dest = 0;
212  const char *cstr;
213  char buf[4] = {'%'};
214 
215  len = RSTRING_LEN(str);
216  cstr = RSTRING_PTR(str);
217 
218  for (i = 0; i < len; ++i) {
219  const unsigned char c = (unsigned char)cstr[i];
220  if (!url_unreserved_char(c)) {
221  if (!dest) {
222  dest = rb_str_buf_new(len);
223  }
224 
225  rb_str_cat(dest, cstr + beg, i - beg);
226  beg = i + 1;
227 
228  if (c == ' ') {
229  rb_str_cat_cstr(dest, "+");
230  }
231  else {
232  buf[1] = upper_hexdigits[(c >> 4) & 0xf];
233  buf[2] = upper_hexdigits[c & 0xf];
234  rb_str_cat(dest, buf, 3);
235  }
236  }
237  }
238 
239  if (dest) {
240  rb_str_cat(dest, cstr + beg, len - beg);
241  preserve_original_state(str, dest);
242  return dest;
243  }
244  else {
245  return rb_str_dup(str);
246  }
247 }
248 
249 static VALUE
250 optimized_unescape(VALUE str, VALUE encoding)
251 {
252  long i, len, beg = 0;
253  VALUE dest = 0;
254  const char *cstr;
255  rb_encoding *enc = rb_to_encoding(encoding);
256  int cr, origenc, encidx = rb_enc_to_index(enc);
257 
258  len = RSTRING_LEN(str);
259  cstr = RSTRING_PTR(str);
260 
261  for (i = 0; i < len; ++i) {
262  char buf[1];
263  const char c = cstr[i];
264  int clen = 0;
265  if (c == '%') {
266  if (i + 3 > len) break;
267  if (!ISXDIGIT(cstr[i+1])) continue;
268  if (!ISXDIGIT(cstr[i+2])) continue;
269  buf[0] = ((char_to_number(cstr[i+1]) << 4)
270  | char_to_number(cstr[i+2]));
271  clen = 2;
272  }
273  else if (c == '+') {
274  buf[0] = ' ';
275  }
276  else {
277  continue;
278  }
279 
280  if (!dest) {
281  dest = rb_str_buf_new(len);
282  }
283 
284  rb_str_cat(dest, cstr + beg, i - beg);
285  i += clen;
286  beg = i + 1;
287 
288  rb_str_cat(dest, buf, 1);
289  }
290 
291  if (dest) {
292  rb_str_cat(dest, cstr + beg, len - beg);
293  preserve_original_state(str, dest);
295  }
296  else {
297  dest = rb_str_dup(str);
298  cr = ENC_CODERANGE(str);
299  }
300  origenc = rb_enc_get_index(str);
301  if (origenc != encidx) {
302  rb_enc_associate_index(dest, encidx);
304  rb_enc_associate_index(dest, origenc);
305  if (cr != ENC_CODERANGE_UNKNOWN)
306  ENC_CODERANGE_SET(dest, cr);
307  }
308  }
309  return dest;
310 }
311 
312 /*
313  * call-seq:
314  * CGI.escapeHTML(string) -> string
315  *
316  * Returns HTML-escaped string.
317  *
318  */
319 static VALUE
320 cgiesc_escape_html(VALUE self, VALUE str)
321 {
322  StringValue(str);
323 
324  if (rb_enc_str_asciicompat_p(str)) {
325  return optimized_escape_html(str);
326  }
327  else {
328  return rb_call_super(1, &str);
329  }
330 }
331 
332 /*
333  * call-seq:
334  * CGI.unescapeHTML(string) -> string
335  *
336  * Returns HTML-unescaped string.
337  *
338  */
339 static VALUE
340 cgiesc_unescape_html(VALUE self, VALUE str)
341 {
342  StringValue(str);
343 
344  if (rb_enc_str_asciicompat_p(str)) {
345  return optimized_unescape_html(str);
346  }
347  else {
348  return rb_call_super(1, &str);
349  }
350 }
351 
352 /*
353  * call-seq:
354  * CGI.escape(string) -> string
355  *
356  * Returns URL-escaped string.
357  *
358  */
359 static VALUE
360 cgiesc_escape(VALUE self, VALUE str)
361 {
362  StringValue(str);
363 
364  if (rb_enc_str_asciicompat_p(str)) {
365  return optimized_escape(str);
366  }
367  else {
368  return rb_call_super(1, &str);
369  }
370 }
371 
372 static VALUE
373 accept_charset(int argc, VALUE *argv, VALUE self)
374 {
375  if (argc > 0)
376  return argv[0];
377  return rb_cvar_get(CLASS_OF(self), id_accept_charset);
378 }
379 
380 /*
381  * call-seq:
382  * CGI.unescape(string, encoding=@@accept_charset) -> string
383  *
384  * Returns URL-unescaped string.
385  *
386  */
387 static VALUE
388 cgiesc_unescape(int argc, VALUE *argv, VALUE self)
389 {
390  VALUE str = (rb_check_arity(argc, 1, 2), argv[0]);
391 
392  StringValue(str);
393 
394  if (rb_enc_str_asciicompat_p(str)) {
395  VALUE enc = accept_charset(argc-1, argv+1, self);
396  return optimized_unescape(str, enc);
397  }
398  else {
399  return rb_call_super(argc, argv);
400  }
401 }
402 
403 void
405 {
406  id_accept_charset = rb_intern_const("@@accept_charset");
407  InitVM(escape);
408 }
409 
410 void
412 {
413  rb_cCGI = rb_define_class("CGI", rb_cObject);
414  rb_mEscape = rb_define_module_under(rb_cCGI, "Escape");
415  rb_mUtil = rb_define_module_under(rb_cCGI, "Util");
416  rb_define_method(rb_mEscape, "escapeHTML", cgiesc_escape_html, 1);
417  rb_define_method(rb_mEscape, "unescapeHTML", cgiesc_unescape_html, 1);
418  rb_define_method(rb_mEscape, "escape", cgiesc_escape, 1);
419  rb_define_method(rb_mEscape, "unescape", cgiesc_unescape, -1);
420  rb_prepend_module(rb_mUtil, rb_mEscape);
421  rb_extend_object(rb_cCGI, rb_mEscape);
422 }
VALUE rb_cvar_get(VALUE, ID)
Definition: variable.c:2879
#define ISDIGIT(c)
Definition: ruby.h:2150
int rb_enc_get_index(VALUE obj)
Definition: encoding.c:773
unsigned int OnigCodePoint
Definition: onigmo.h:80
#define CLASS_OF(v)
Definition: ruby.h:453
#define InitVM(ext)
Definition: ruby.h:2164
VALUE rb_str_cat(VALUE, const char *, long)
Definition: string.c:2746
#define ENC_CODERANGE_SET(obj, cr)
Definition: encoding.h:106
#define MATCH(s)
rb_encoding * rb_to_encoding(VALUE enc)
Definition: encoding.c:246
#define strcasecmp
Definition: win32.h:191
#define rb_check_arity
Definition: intern.h:298
int rb_enc_str_coderange(VALUE)
Definition: string.c:621
VALUE rb_enc_associate(VALUE obj, rb_encoding *enc)
Definition: encoding.c:854
#define upper_hexdigits
Definition: escape.c:8
RUBY_EXTERN unsigned long ruby_scan_digits(const char *str, ssize_t len, int base, size_t *retlen, int *overflow)
Definition: util.c:84
void rb_prepend_module(VALUE klass, VALUE module)
Definition: class.c:973
#define UNICODE_MAX
Definition: nkf.c:427
int rb_enc_to_index(rb_encoding *enc)
Definition: encoding.c:126
RUBY_EXTERN VALUE rb_cObject
Definition: ruby.h:1893
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition: class.c:646
#define ENC_CODERANGE_CLEAN_P(cr)
Definition: encoding.h:103
void InitVM_escape(void)
Definition: escape.c:411
int argc
Definition: ruby.c:187
#define ENC_CODERANGE_UNKNOWN
Definition: encoding.h:99
VALUE rb_enc_associate_index(VALUE obj, int idx)
Definition: encoding.c:826
#define char_to_number(c)
Definition: escape.c:9
#define RSTRING_LEN(str)
Definition: ruby.h:971
#define RUBY_EXTERN
Definition: missing.h:77
#define rb_enc_name(enc)
Definition: encoding.h:171
unsigned char buf[MIME_BUF_SIZE]
Definition: nkf.c:4309
unsigned long ID
Definition: ruby.h:86
unsigned long VALUE
Definition: ruby.h:85
#define rb_enc_str_asciicompat_p(str)
Definition: encoding.h:251
VALUE rb_call_super(int, const VALUE *)
Definition: vm_eval.c:238
VALUE rb_str_dup(VALUE)
Definition: string.c:1488
register unsigned int len
Definition: zonetab.h:51
VALUE rb_define_module_under(VALUE outer, const char *name)
Definition: class.c:790
#define RSTRING_PTR(str)
Definition: ruby.h:975
rb_encoding * rb_enc_get(VALUE obj)
Definition: encoding.c:860
void Init_escape(void)
Definition: escape.c:404
void rb_extend_object(VALUE obj, VALUE module)
Extend the object with the module.
Definition: eval.c:1596
#define ENC_CODERANGE(obj)
Definition: encoding.h:104
VALUE rb_str_cat_cstr(VALUE, const char *)
Definition: string.c:2756
RUBY_EXTERN const signed char ruby_digit36_to_number_table[]
Definition: escape.c:6
RUBY_EXTERN const char ruby_hexdigits[]
Definition: escape.c:5
#define rb_intern_const(str)
Definition: ruby.h:1777
#define rb_enc_mbcput(c, buf, enc)
Definition: encoding.h:211
VALUE rb_str_buf_new(long)
Definition: string.c:1282
#define ISXDIGIT(c)
Definition: ruby.h:2151
void rb_define_method(VALUE klass, const char *name, VALUE(*func)(ANYARGS), int argc)
Definition: class.c:1515
#define RB_OBJ_INFECT_RAW(x, s)
Definition: ruby.h:1265
char ** argv
Definition: ruby.c:188
#define StringValue(v)
Definition: ruby.h:569