Ruby 2.5.0dev (2017-10-22 revision 60238)
psych_parser.c
Go to the documentation of this file.
1 #include <psych.h>
2 
5 
6 static ID id_read;
7 static ID id_path;
8 static ID id_empty;
9 static ID id_start_stream;
10 static ID id_end_stream;
11 static ID id_start_document;
12 static ID id_end_document;
13 static ID id_alias;
14 static ID id_scalar;
15 static ID id_start_sequence;
16 static ID id_end_sequence;
17 static ID id_start_mapping;
18 static ID id_end_mapping;
19 
/*
 * Associate _str with the encoding index _yaml_enc and, when _internal_enc
 * is non-NULL, replace _str with the result of exporting it to that encoding.
 * _str is assigned to, so it must be a modifiable VALUE lvalue.
 */
#define PSYCH_TRANSCODE(_str, _yaml_enc, _internal_enc) \
    do { \
        rb_enc_associate_index((_str), (_yaml_enc)); \
        if (_internal_enc) { \
            (_str) = rb_str_export_to_enc((_str), (_internal_enc)); \
        } \
    } while (0)
26 
27 static int io_reader(void * data, unsigned char *buf, size_t size, size_t *read)
28 {
29  VALUE io = (VALUE)data;
30  VALUE string = rb_funcall(io, id_read, 1, INT2NUM(size));
31 
32  *read = 0;
33 
34  if(! NIL_P(string)) {
35  void * str = (void *)StringValuePtr(string);
36  *read = (size_t)RSTRING_LEN(string);
37  memcpy(buf, str, *read);
38  }
39 
40  return 1;
41 }
42 
43 static void dealloc(void * ptr)
44 {
45  yaml_parser_t * parser;
46 
47  parser = (yaml_parser_t *)ptr;
48  yaml_parser_delete(parser);
49  xfree(parser);
50 }
51 
#if 0
/*
 * Size callback placeholder; compiled out.  Always reports 0 because the
 * parser's footprint is not computed yet.
 */
static size_t memsize(const void *ptr)
{
    const yaml_parser_t *parser = (const yaml_parser_t *)ptr;
    /* TODO: calculate parser's size */
    return 0;
}
#endif
60 
61 static const rb_data_type_t psych_parser_type = {
62  "Psych/parser",
63  {0, dealloc, 0,},
64  0, 0,
65 #ifdef RUBY_TYPED_FREE_IMMEDIATELY
67 #endif
68 };
69 
70 static VALUE allocate(VALUE klass)
71 {
72  yaml_parser_t * parser;
73  VALUE obj = TypedData_Make_Struct(klass, yaml_parser_t, &psych_parser_type, parser);
74 
75  yaml_parser_initialize(parser);
76 
77  return obj;
78 }
79 
80 static VALUE make_exception(yaml_parser_t * parser, VALUE path)
81 {
82  size_t line, column;
83 
84  line = parser->context_mark.line + 1;
85  column = parser->context_mark.column + 1;
86 
87  return rb_funcall(ePsychSyntaxError, rb_intern("new"), 6,
88  path,
89  INT2NUM(line),
90  INT2NUM(column),
91  INT2NUM(parser->problem_offset),
92  parser->problem ? rb_usascii_str_new2(parser->problem) : Qnil,
93  parser->context ? rb_usascii_str_new2(parser->context) : Qnil);
94 }
95 
96 static VALUE transcode_string(VALUE src, int * parser_encoding)
97 {
98  int utf8 = rb_utf8_encindex();
99  int utf16le = rb_enc_find_index("UTF-16LE");
100  int utf16be = rb_enc_find_index("UTF-16BE");
101  int source_encoding = rb_enc_get_index(src);
102 
103  if (source_encoding == utf8) {
104  *parser_encoding = YAML_UTF8_ENCODING;
105  return src;
106  }
107 
108  if (source_encoding == utf16le) {
109  *parser_encoding = YAML_UTF16LE_ENCODING;
110  return src;
111  }
112 
113  if (source_encoding == utf16be) {
114  *parser_encoding = YAML_UTF16BE_ENCODING;
115  return src;
116  }
117 
119  RB_GC_GUARD(src);
120 
121  *parser_encoding = YAML_UTF8_ENCODING;
122  return src;
123 }
124 
125 static VALUE transcode_io(VALUE src, int * parser_encoding)
126 {
127  VALUE io_external_encoding;
128  int io_external_enc_index;
129 
130  io_external_encoding = rb_funcall(src, rb_intern("external_encoding"), 0);
131 
132  /* if no encoding is returned, assume ascii8bit. */
133  if (NIL_P(io_external_encoding)) {
134  io_external_enc_index = rb_ascii8bit_encindex();
135  } else {
136  io_external_enc_index = rb_to_encoding_index(io_external_encoding);
137  }
138 
139  /* Treat US-ASCII as utf_8 */
140  if (io_external_enc_index == rb_usascii_encindex()) {
141  *parser_encoding = YAML_UTF8_ENCODING;
142  return src;
143  }
144 
145  if (io_external_enc_index == rb_utf8_encindex()) {
146  *parser_encoding = YAML_UTF8_ENCODING;
147  return src;
148  }
149 
150  if (io_external_enc_index == rb_enc_find_index("UTF-16LE")) {
151  *parser_encoding = YAML_UTF16LE_ENCODING;
152  return src;
153  }
154 
155  if (io_external_enc_index == rb_enc_find_index("UTF-16BE")) {
156  *parser_encoding = YAML_UTF16BE_ENCODING;
157  return src;
158  }
159 
160  /* Just guess on ASCII-8BIT */
161  if (io_external_enc_index == rb_ascii8bit_encindex()) {
162  *parser_encoding = YAML_ANY_ENCODING;
163  return src;
164  }
165 
166  /* If the external encoding is something we don't know how to handle,
167  * fall back to YAML_ANY_ENCODING. */
168  *parser_encoding = YAML_ANY_ENCODING;
169 
170  return src;
171 }
172 
173 static VALUE protected_start_stream(VALUE pointer)
174 {
175  VALUE *args = (VALUE *)pointer;
176  return rb_funcall(args[0], id_start_stream, 1, args[1]);
177 }
178 
179 static VALUE protected_start_document(VALUE pointer)
180 {
181  VALUE *args = (VALUE *)pointer;
182  return rb_funcall3(args[0], id_start_document, 3, args + 1);
183 }
184 
185 static VALUE protected_end_document(VALUE pointer)
186 {
187  VALUE *args = (VALUE *)pointer;
188  return rb_funcall(args[0], id_end_document, 1, args[1]);
189 }
190 
191 static VALUE protected_alias(VALUE pointer)
192 {
193  VALUE *args = (VALUE *)pointer;
194  return rb_funcall(args[0], id_alias, 1, args[1]);
195 }
196 
197 static VALUE protected_scalar(VALUE pointer)
198 {
199  VALUE *args = (VALUE *)pointer;
200  return rb_funcall3(args[0], id_scalar, 6, args + 1);
201 }
202 
203 static VALUE protected_start_sequence(VALUE pointer)
204 {
205  VALUE *args = (VALUE *)pointer;
206  return rb_funcall3(args[0], id_start_sequence, 4, args + 1);
207 }
208 
209 static VALUE protected_end_sequence(VALUE handler)
210 {
211  return rb_funcall(handler, id_end_sequence, 0);
212 }
213 
214 static VALUE protected_start_mapping(VALUE pointer)
215 {
216  VALUE *args = (VALUE *)pointer;
217  return rb_funcall3(args[0], id_start_mapping, 4, args + 1);
218 }
219 
220 static VALUE protected_end_mapping(VALUE handler)
221 {
222  return rb_funcall(handler, id_end_mapping, 0);
223 }
224 
225 static VALUE protected_empty(VALUE handler)
226 {
227  return rb_funcall(handler, id_empty, 0);
228 }
229 
230 static VALUE protected_end_stream(VALUE handler)
231 {
232  return rb_funcall(handler, id_end_stream, 0);
233 }
234 
235 /*
236  * call-seq:
237  * parser.parse(yaml)
238  *
239  * Parse the YAML document contained in +yaml+. Events will be called on
240  * the handler set on the parser instance.
241  *
242  * See Psych::Parser and Psych::Parser#handler
243  */
244 static VALUE parse(int argc, VALUE *argv, VALUE self)
245 {
246  VALUE yaml, path;
247  yaml_parser_t * parser;
248  yaml_event_t event;
249  int done = 0;
250  int tainted = 0;
251  int state = 0;
252  int parser_encoding = YAML_ANY_ENCODING;
253  int encoding = rb_utf8_encindex();
254  rb_encoding * internal_enc = rb_default_internal_encoding();
255  VALUE handler = rb_iv_get(self, "@handler");
256 
257  if (rb_scan_args(argc, argv, "11", &yaml, &path) == 1) {
258  if(rb_respond_to(yaml, id_path))
259  path = rb_funcall(yaml, id_path, 0);
260  else
261  path = rb_str_new2("<unknown>");
262  }
263 
264  TypedData_Get_Struct(self, yaml_parser_t, &psych_parser_type, parser);
265 
266  yaml_parser_delete(parser);
267  yaml_parser_initialize(parser);
268 
269  if (OBJ_TAINTED(yaml)) tainted = 1;
270 
271  if (rb_respond_to(yaml, id_read)) {
272  yaml = transcode_io(yaml, &parser_encoding);
273  yaml_parser_set_encoding(parser, parser_encoding);
274  yaml_parser_set_input(parser, io_reader, (void *)yaml);
275  if (RTEST(rb_obj_is_kind_of(yaml, rb_cIO))) tainted = 1;
276  } else {
277  StringValue(yaml);
278  yaml = transcode_string(yaml, &parser_encoding);
279  yaml_parser_set_encoding(parser, parser_encoding);
280  yaml_parser_set_input_string(
281  parser,
282  (const unsigned char *)RSTRING_PTR(yaml),
283  (size_t)RSTRING_LEN(yaml)
284  );
285  }
286 
287  while(!done) {
288  if(!yaml_parser_parse(parser, &event)) {
289  VALUE exception;
290 
291  exception = make_exception(parser, path);
292  yaml_parser_delete(parser);
293  yaml_parser_initialize(parser);
294 
295  rb_exc_raise(exception);
296  }
297 
298  switch(event.type) {
299  case YAML_STREAM_START_EVENT:
300  {
301  VALUE args[2];
302 
303  args[0] = handler;
304  args[1] = INT2NUM((long)event.data.stream_start.encoding);
305  rb_protect(protected_start_stream, (VALUE)args, &state);
306  }
307  break;
308  case YAML_DOCUMENT_START_EVENT:
309  {
310  VALUE args[4];
311  /* Get a list of tag directives (if any) */
312  VALUE tag_directives = rb_ary_new();
313  /* Grab the document version */
314  VALUE version = event.data.document_start.version_directive ?
315  rb_ary_new3(
316  (long)2,
317  INT2NUM((long)event.data.document_start.version_directive->major),
318  INT2NUM((long)event.data.document_start.version_directive->minor)
319  ) : rb_ary_new();
320 
321  if(event.data.document_start.tag_directives.start) {
322  yaml_tag_directive_t *start =
323  event.data.document_start.tag_directives.start;
324  yaml_tag_directive_t *end =
325  event.data.document_start.tag_directives.end;
326  for(; start != end; start++) {
327  VALUE handle = Qnil;
328  VALUE prefix = Qnil;
329  if(start->handle) {
330  handle = rb_str_new2((const char *)start->handle);
331  if (tainted) OBJ_TAINT(handle);
332  PSYCH_TRANSCODE(handle, encoding, internal_enc);
333  }
334 
335  if(start->prefix) {
336  prefix = rb_str_new2((const char *)start->prefix);
337  if (tainted) OBJ_TAINT(prefix);
338  PSYCH_TRANSCODE(prefix, encoding, internal_enc);
339  }
340 
341  rb_ary_push(tag_directives, rb_ary_new3((long)2, handle, prefix));
342  }
343  }
344  args[0] = handler;
345  args[1] = version;
346  args[2] = tag_directives;
347  args[3] = event.data.document_start.implicit == 1 ? Qtrue : Qfalse;
348  rb_protect(protected_start_document, (VALUE)args, &state);
349  }
350  break;
351  case YAML_DOCUMENT_END_EVENT:
352  {
353  VALUE args[2];
354 
355  args[0] = handler;
356  args[1] = event.data.document_end.implicit == 1 ? Qtrue : Qfalse;
357  rb_protect(protected_end_document, (VALUE)args, &state);
358  }
359  break;
360  case YAML_ALIAS_EVENT:
361  {
362  VALUE args[2];
363  VALUE alias = Qnil;
364  if(event.data.alias.anchor) {
365  alias = rb_str_new2((const char *)event.data.alias.anchor);
366  if (tainted) OBJ_TAINT(alias);
367  PSYCH_TRANSCODE(alias, encoding, internal_enc);
368  }
369 
370  args[0] = handler;
371  args[1] = alias;
372  rb_protect(protected_alias, (VALUE)args, &state);
373  }
374  break;
375  case YAML_SCALAR_EVENT:
376  {
377  VALUE args[7];
378  VALUE anchor = Qnil;
379  VALUE tag = Qnil;
380  VALUE plain_implicit, quoted_implicit, style;
381  VALUE val = rb_str_new(
382  (const char *)event.data.scalar.value,
383  (long)event.data.scalar.length
384  );
385  if (tainted) OBJ_TAINT(val);
386 
387  PSYCH_TRANSCODE(val, encoding, internal_enc);
388 
389  if(event.data.scalar.anchor) {
390  anchor = rb_str_new2((const char *)event.data.scalar.anchor);
391  if (tainted) OBJ_TAINT(anchor);
392  PSYCH_TRANSCODE(anchor, encoding, internal_enc);
393  }
394 
395  if(event.data.scalar.tag) {
396  tag = rb_str_new2((const char *)event.data.scalar.tag);
397  if (tainted) OBJ_TAINT(tag);
398  PSYCH_TRANSCODE(tag, encoding, internal_enc);
399  }
400 
401  plain_implicit =
402  event.data.scalar.plain_implicit == 0 ? Qfalse : Qtrue;
403 
404  quoted_implicit =
405  event.data.scalar.quoted_implicit == 0 ? Qfalse : Qtrue;
406 
407  style = INT2NUM((long)event.data.scalar.style);
408 
409  args[0] = handler;
410  args[1] = val;
411  args[2] = anchor;
412  args[3] = tag;
413  args[4] = plain_implicit;
414  args[5] = quoted_implicit;
415  args[6] = style;
416  rb_protect(protected_scalar, (VALUE)args, &state);
417  }
418  break;
419  case YAML_SEQUENCE_START_EVENT:
420  {
421  VALUE args[5];
422  VALUE anchor = Qnil;
423  VALUE tag = Qnil;
424  VALUE implicit, style;
425  if(event.data.sequence_start.anchor) {
426  anchor = rb_str_new2((const char *)event.data.sequence_start.anchor);
427  if (tainted) OBJ_TAINT(anchor);
428  PSYCH_TRANSCODE(anchor, encoding, internal_enc);
429  }
430 
431  tag = Qnil;
432  if(event.data.sequence_start.tag) {
433  tag = rb_str_new2((const char *)event.data.sequence_start.tag);
434  if (tainted) OBJ_TAINT(tag);
435  PSYCH_TRANSCODE(tag, encoding, internal_enc);
436  }
437 
438  implicit =
439  event.data.sequence_start.implicit == 0 ? Qfalse : Qtrue;
440 
441  style = INT2NUM((long)event.data.sequence_start.style);
442 
443  args[0] = handler;
444  args[1] = anchor;
445  args[2] = tag;
446  args[3] = implicit;
447  args[4] = style;
448 
449  rb_protect(protected_start_sequence, (VALUE)args, &state);
450  }
451  break;
452  case YAML_SEQUENCE_END_EVENT:
453  rb_protect(protected_end_sequence, handler, &state);
454  break;
455  case YAML_MAPPING_START_EVENT:
456  {
457  VALUE args[5];
458  VALUE anchor = Qnil;
459  VALUE tag = Qnil;
460  VALUE implicit, style;
461  if(event.data.mapping_start.anchor) {
462  anchor = rb_str_new2((const char *)event.data.mapping_start.anchor);
463  if (tainted) OBJ_TAINT(anchor);
464  PSYCH_TRANSCODE(anchor, encoding, internal_enc);
465  }
466 
467  if(event.data.mapping_start.tag) {
468  tag = rb_str_new2((const char *)event.data.mapping_start.tag);
469  if (tainted) OBJ_TAINT(tag);
470  PSYCH_TRANSCODE(tag, encoding, internal_enc);
471  }
472 
473  implicit =
474  event.data.mapping_start.implicit == 0 ? Qfalse : Qtrue;
475 
476  style = INT2NUM((long)event.data.mapping_start.style);
477 
478  args[0] = handler;
479  args[1] = anchor;
480  args[2] = tag;
481  args[3] = implicit;
482  args[4] = style;
483 
484  rb_protect(protected_start_mapping, (VALUE)args, &state);
485  }
486  break;
487  case YAML_MAPPING_END_EVENT:
488  rb_protect(protected_end_mapping, handler, &state);
489  break;
490  case YAML_NO_EVENT:
491  rb_protect(protected_empty, handler, &state);
492  break;
493  case YAML_STREAM_END_EVENT:
494  rb_protect(protected_end_stream, handler, &state);
495  done = 1;
496  break;
497  }
498  yaml_event_delete(&event);
499  if (state) rb_jump_tag(state);
500  }
501 
502  return self;
503 }
504 
505 /*
506  * call-seq:
507  * parser.mark # => #<Psych::Parser::Mark>
508  *
509  * Returns a Psych::Parser::Mark object that contains line, column, and index
510  * information.
511  */
512 static VALUE mark(VALUE self)
513 {
514  VALUE mark_klass;
515  VALUE args[3];
516  yaml_parser_t * parser;
517 
518  TypedData_Get_Struct(self, yaml_parser_t, &psych_parser_type, parser);
519  mark_klass = rb_const_get_at(cPsychParser, rb_intern("Mark"));
520  args[0] = INT2NUM(parser->mark.index);
521  args[1] = INT2NUM(parser->mark.line);
522  args[2] = INT2NUM(parser->mark.column);
523 
524  return rb_class_new_instance(3, args, mark_klass);
525 }
526 
528 {
529 #if 0
530  mPsych = rb_define_module("Psych");
531 #endif
532 
535 
536  /* Any encoding: Let the parser choose the encoding */
537  rb_define_const(cPsychParser, "ANY", INT2NUM(YAML_ANY_ENCODING));
538 
539  /* UTF-8 Encoding */
540  rb_define_const(cPsychParser, "UTF8", INT2NUM(YAML_UTF8_ENCODING));
541 
542  /* UTF-16-LE Encoding with BOM */
543  rb_define_const(cPsychParser, "UTF16LE", INT2NUM(YAML_UTF16LE_ENCODING));
544 
545  /* UTF-16-BE Encoding with BOM */
546  rb_define_const(cPsychParser, "UTF16BE", INT2NUM(YAML_UTF16BE_ENCODING));
547 
548  rb_require("psych/syntax_error");
549  ePsychSyntaxError = rb_const_get(mPsych, rb_intern("SyntaxError"));
550 
551  rb_define_method(cPsychParser, "parse", parse, -1);
552  rb_define_method(cPsychParser, "mark", mark, 0);
553 
554  id_read = rb_intern("read");
555  id_path = rb_intern("path");
556  id_empty = rb_intern("empty");
557  id_start_stream = rb_intern("start_stream");
558  id_end_stream = rb_intern("end_stream");
559  id_start_document = rb_intern("start_document");
560  id_end_document = rb_intern("end_document");
561  id_alias = rb_intern("alias");
562  id_scalar = rb_intern("scalar");
563  id_start_sequence = rb_intern("start_sequence");
564  id_end_sequence = rb_intern("end_sequence");
565  id_start_mapping = rb_intern("start_mapping");
566  id_end_mapping = rb_intern("end_mapping");
567 }
568 /* vim: set noet sws=4 sw=4: */
int rb_enc_get_index(VALUE obj)
Definition: encoding.c:773
VALUE rb_protect(VALUE(*proc)(VALUE), VALUE data, int *pstate)
Protects a function call from potential global escapes from the function.
Definition: eval.c:992
#define RUBY_TYPED_FREE_IMMEDIATELY
Definition: ruby.h:1138
#define INT2NUM(x)
Definition: ruby.h:1538
ID id_alias
Definition: eventids1.c:4
#define rb_usascii_str_new2
Definition: intern.h:841
void rb_jump_tag(int tag)
Continues the exception caught by rb_protect() and rb_eval_string_protect().
Definition: eval.c:821
#define Qtrue
Definition: ruby.h:437
#define TypedData_Get_Struct(obj, type, data_type, sval)
Definition: ruby.h:1183
rb_encoding * rb_default_internal_encoding(void)
Definition: encoding.c:1510
VALUE rb_ary_push(VALUE ary, VALUE item)
Definition: array.c:924
int rb_usascii_encindex(void)
Definition: encoding.c:1344
VALUE rb_funcall(VALUE, ID, int,...)
Calls a method.
Definition: vm_eval.c:774
VALUE rb_iv_get(VALUE, const char *)
Definition: variable.c:3087
VALUE rb_define_class_under(VALUE outer, const char *name, VALUE super)
Defines a class under the namespace of outer.
Definition: class.c:693
#define RB_GC_GUARD(v)
Definition: ruby.h:552
void rb_define_alloc_func(VALUE, rb_alloc_func_t)
const char * alias
Definition: nkf.c:1151
rb_encoding * rb_utf8_encoding(void)
Definition: encoding.c:1320
VALUE cPsychParser
Definition: psych_parser.c:3
VALUE rb_obj_is_kind_of(VALUE, VALUE)
call-seq: obj.is_a?(class) -> true or false obj.kind_of?(class) -> true or false
Definition: object.c:842
VALUE ePsychSyntaxError
Definition: psych_parser.c:4
VALUE rb_require(const char *)
Definition: load.c:1061
int rb_to_encoding_index(VALUE enc)
Definition: encoding.c:198
void Init_psych_parser(void)
Definition: psych_parser.c:527
#define val
RUBY_EXTERN VALUE rb_cObject
Definition: ruby.h:1893
VALUE rb_ary_new(void)
Definition: array.c:499
int rb_ascii8bit_encindex(void)
Definition: encoding.c:1314
#define NIL_P(v)
Definition: ruby.h:451
void rb_define_const(VALUE, const char *, VALUE)
Definition: variable.c:2691
int argc
Definition: ruby.c:187
#define Qfalse
Definition: ruby.h:436
#define PSYCH_TRANSCODE(_str, _yaml_enc, _internal_enc)
Definition: psych_parser.c:20
#define rb_str_new2
Definition: intern.h:835
VALUE rb_const_get(VALUE, ID)
Definition: variable.c:2292
RUBY_EXTERN VALUE rb_cIO
Definition: ruby.h:1913
#define RSTRING_LEN(str)
Definition: ruby.h:971
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Definition: class.c:1908
unsigned char buf[MIME_BUF_SIZE]
Definition: nkf.c:4309
unsigned long ID
Definition: ruby.h:86
#define Qnil
Definition: ruby.h:438
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition: eval.c:615
unsigned long VALUE
Definition: ruby.h:85
#define OBJ_TAINTED(x)
Definition: ruby.h:1296
int rb_utf8_encindex(void)
Definition: encoding.c:1329
#define rb_ary_new3
Definition: intern.h:91
int rb_respond_to(VALUE, ID)
Definition: vm_method.c:1994
#define RSTRING_PTR(str)
Definition: ruby.h:975
int size
Definition: encoding.c:57
#define rb_funcall3
Definition: ruby.h:1792
#define RTEST(v)
Definition: ruby.h:450
VALUE rb_class_new_instance(int, const VALUE *, VALUE)
Allocates and initializes an instance of klass.
Definition: object.c:2170
VALUE mPsych
Definition: psych.c:21
VALUE rb_str_export_to_enc(VALUE, rb_encoding *)
Definition: string.c:1103
#define TypedData_Make_Struct(klass, type, data_type, sval)
Definition: ruby.h:1175
VALUE rb_const_get_at(VALUE, ID)
Definition: variable.c:2298
#define StringValuePtr(v)
Definition: ruby.h:570
int rb_enc_find_index(const char *name)
Definition: encoding.c:704
void void xfree(void *)
VALUE rb_define_module(const char *name)
Definition: class.c:768
#define rb_intern(str)
#define OBJ_TAINT(x)
Definition: ruby.h:1298
void rb_define_method(VALUE klass, const char *name, VALUE(*func)(ANYARGS), int argc)
Definition: class.c:1515
char ** argv
Definition: ruby.c:188
#define StringValue(v)
Definition: ruby.h:569
VALUE rb_str_new(const char *, long)
Definition: string.c:737