Upload
ujihisa
View
3.774
Download
0
Tags:
Embed Size (px)
DESCRIPTION
Nov 19, 2009
Citation preview
hi
• I'm from Japan
DISCLAIMER
•This presentation is not for super rubyists or ruby committers, but for ordinary programmers.
Hacking parse.y
•Ruby's syntax
Hacking parse.y: Fixing ruby parser to understand ruby
• Introducing new syntax
• {:key :-) "value"}
• 'symbol
• ++i
• def A#b(c)
• {|x| x * 2 }
MRI Inside
•MRI (Matz Ruby Implementation)
•$ ruby -v
ruby 1.9.2dev (2009-11-19 trunk 25862) [i386-darwin9.8.0]
•Written in C
•array.c, vm.c, gc.c, etc...
ruby 1.8 vs 1.9
•~1.8
• Parser: parse.y
• Evaluator: eval.c
•1.9~
• Parser: parse.y
• Evaluator: YARV (vm*.c)
Matz said
•Ugly: eval.c and parse.y (RubyConf2006)
•Now the original evaluator was all replaced with YARV
MRI Parser
•MRI uses yacc(parser generator for C)
•parse.y
bison -d -o y.tab.c parse.y
sed -f ./tool/ytab.sed -e "/^#/s!y\.tab\.c!parse.c!" y.tab.c > parse.c.new
...
parse.y
•One of the darkest sides
•$ wc -l *{c,h,y} | sort -n
...
9961 io.c
10474 parse.y
16367 parse.c # (automatically generated)
188656 total
(Broad) Parser
•Lexer (yylex)
•Bytes → Symbols
•Parser (yyparse)
•Symbols → Syntax Tree
Tokens in Lexer%token tUPLUS /* unary+ */%token tUMINUS /* unary- */%token tPOW /* ** */%token tCMP /* <=> */%token tEQ /* == */%token tEQQ /* === */%token tNEQ /* != */%token tGEQ /* >= */%token tLEQ /* <= */%token tANDOP tOROP /* && and || */%token tMATCH tNMATCH /* =~ and !~ */%token tDOT2 tDOT3 /* .. and ... */%token tAREF tASET /* [] and []= */%token tLSHFT tRSHFT /* << and >> */%token tCOLON2 /* :: */%token tCOLON3 /* :: at EXPR_BEG */
%token <id> tOP_ASGN /* +=, -= etc. */%token tASSOC /* => */%token tLPAREN /* ( */%token tLPAREN_ARG /* ( */%token tRPAREN /* ) */%token tLBRACK /* [ */%token tLBRACE /* { */%token tLBRACE_ARG /* { */%token tSTAR /* * */%token tAMPER /* & */%token tLAMBDA /* -> */%token tSYMBEG tSTRING_BEG tXSTRING_BEG tREGEXP_BEG tWORDS_BEG tQWORDS_BEG%token tSTRING_DBEG tSTRING_DVAR tSTRING_END tLAMBEG
(detour)
• MRI: parse.y (10474 lines)
• JRuby: src/org/jruby/{parser, lexer}/* (24983 lines)
• parser/DefaultRubyParser.y (1880 lines)
• parser/Ruby19Parser.y (2076 lines)
• Rubinius: lib/ext/melbourne/grammar.y (5891 lines) and others
Case 1::-)
•Hash literal
{:key => 'value'}
{:key :-) 'value'}
• :-) is just an alias of =>
Mastering “Colon”
Colons in Ruby
•A::B, ::C
• :symbol, :"sy-m-bol"
•a ? b : c
• {a: b}
•when 1: something (in 1.8)
static intparser_yylex(struct parser_params *parser) { ... switch (c = nextc()) { ... case '#': /* it's a comment */ ... case ':': c = nextc(); if (c == ':') { if (IS_BEG() ||... ... } ... (about 1300 lines)
How does parser deal with colon?
• :: → tCOLON2 or tCOLON3
•tCOLON2 Net::URI
•tCOLON3 ::Kernel
enum lex_state_e { EXPR_BEG, /* ignore newline, +/- is a sign. */ EXPR_END, /* newline significant, +/- is an operator. */ EXPR_ENDARG, /* ditto, and unbound braces. */ EXPR_ARG, /* newline significant, +/- is an operator. */ EXPR_CMDARG, /* newline significant, +/- is an operator. */ EXPR_MID, /* newline significant, +/- is an operator. */ EXPR_FNAME, /* ignore newline, no reserved words. */ EXPR_DOT, /* right after `.' or `::', no reserved words. */ EXPR_CLASS, /* immediate after `class', no here document. */ EXPR_VALUE /* alike EXPR_BEG but label is disallowed. */};
lex_state
case ':': c = nextc(); if (c == ':') { if (IS_BEG() || lex_state == EXPR_CLASS || (IS_ARG() && space_seen)) { lex_state = EXPR_BEG; return tCOLON3; } lex_state = EXPR_DOT; return tCOLON2; }
... if (lex_state == EXPR_END || lex_state == EXPR_ENDARG || (c != -1 && ISSPACE(c))) { pushback(c); lex_state = EXPR_BEG; return ':'; } switch (c) { case '\'': lex_strterm = NEW_STRTERM(str_ssym, c, 0); break; case '"': lex_strterm = NEW_STRTERM(str_dsym, c, 0); break; default: pushback(c); break; } lex_state = EXPR_FNAME; return tSYMBEG;
How does parser deal with colon? (summary)
• :: → tCOLON2 or tCOLON3
•EXPR_END, EXPR_ENDARG, or followed by whitespace → ':' (the "else" of a ? b : c)
•otherwise → tSYMBEG
•:' → str_ssym
• :" → str_dsym
So,
• :-) → tASSOC
•:: → tCOLON2 or tCOLON3
•EXPR_END, EXPR_ENDARG, or followed by whitespace → ':' (the "else" of a ? b : c)
•otherwise → tSYMBEG
•:' → str_ssym
• :" → str_dsym
:-)
DISCLAIMER
•This presentation is not for super rubyists or ruby committers, but for ordinary programmers.
Case 2:Lisp Like Symbol
•Symbol Literal: :vancouver → 'vancouver
•Ad-hoc: p :a, :b → p 'a, 'b
Single Quote(in parser_yylex)
...
case '\'':
lex_strterm = NEW_STRTERM(str_squote, '\'', 0);
return tSTRING_BEG;
...
Single Quote(in parser_yylex)
...
case '\'':
if (??? condition ???) {
lex_state = EXPR_FNAME;
return tSYMBEG;
}
lex_strterm = NEW_STRTERM(str_squote, '\'', 0);
return tSTRING_BEG;
...
(loop (lambda (p 'good)))
Case 3: Pre-Increment Operator
•++i
• i = i.succ (NOT i = i + 1)
Lexer@@ -685,6 +685,7 @@ static void token_info_pop(struct parser_params*, const char *token); %type <val> program reswords then do dot_or_colon %*/ %token tUPLUS /* unary+ */+%token tINCR /* ++var */ %token tUMINUS /* unary- */ %token tPOW /* ** */ %token tCMP /* <=> */
(Actually there are more trivial fixes)
regenerate id.h
• id.h is automatically generated by parse.y in make
•$ rm id.h
$ make
parser examplevariable : tIDENTIFIER | tIVAR | tGVAR | tCONSTANT | tCVAR | keyword_nil {ifndef_ripper($$ = keyword_nil);} | keyword_self {ifndef_ripper($$ = keyword_self);} | keyword_true {ifndef_ripper($$ = keyword_true);} | keyword_false {ifndef_ripper($$ = keyword_false);} | keyword__FILE__ {ifndef_ripper($$ = keyword__FILE__);} | keyword__LINE__ {ifndef_ripper($$ = keyword__LINE__);} | keyword__ENCODING__ {ifndef_ripper($$ = keyword__ENCODING__);} ;
lhs : variable { /*%%%*/ if (!($$ = assignable($1, 0))) $$ = NEW_BEGIN(0); /*% $$ = dispatch1(var_field, $1); %*/ } | primary_value '[' opt_call_args rbracket { /*%%%*/ $$ = aryset($1, $3); /*% $$ = dispatch2(aref_field, $1, escape_Qundef($3)); %*/ } ...
BNF (part)program : compstmt
compstmt : stmts opt_terms
stmts : none | stmt | stmts terms stmt
stmt : kALIAS fitem fitem | kALIAS tGVAR tGVAR : : | expr
expr : kRETURN call_args | kBREAK call_args : : | '!' command_call | arg
arg : lhs '=' arg | var_lhs tOP_ASGN arg | primary_value '[' aref_args ']' tOP_ASGN arg : : | arg '?' arg ':' arg | primary
primary : literal | strings : : | tLPAREN_ARG expr ')' | tLPAREN compstmt ')' : : | kREDO | kRETRY
Assignstmt : ... | mlhs '=' command_call { /*%%%*/ value_expr($3); $1->nd_value = $3; $$ = $1; /*% $$ = dispatch2(massign, $1, $3); %*/ }
mlhsmlhs: mlhs_basic | ...mlhs_basic: mlhs_head | ...mlhs_head: mlhs_item ',' | ...mlhs_item: mlhs_node | ...mlhs_node: variable { $$ = assignable($1, 0); }
Method callblock_command : block_call| block_call '.' operation2 command_args { /*%%%*/ $$ = NEW_CALL($1, $3, $4); /*% $$ = dispatch3(call, $1, ripper_id2sym('.'), $3); $$ = method_arg($$, $4); %*/ }
Mix!var_ref: ...| tINCR variable { /*%%%*/ $$ = assignable($2, 0); $$->nd_value = NEW_CALL(gettable($$->nd_vid), rb_intern("succ"), 0); /*% $$ = dispatch2(unary, ripper_intern("++@"), $2); %*/ }
++ruby
Case 4:def A#b
•A#b: instance method b of class A
•A.b: class method b of class A
A#b
class A
  def b
    ...
  end
end
def A.b
  ...
end
A#b
def A#b
  ...
end
def A.b
  ...
end
#(in parser_yylex)
case '#': /* it's a comment */
/* no magic_comment in shebang line */
if (!parser_magic_comment(parser, lex_p, lex_pend - lex_p)) {
if (comment_at_top(parser)) {
set_file_encoding(parser, lex_p, lex_pend);
}
}
lex_p = lex_pend;
#(in parser_yylex)
case '#': /* it's a comment */
c = nextc();
pushback(c);
if(lex_state == EXPR_END && ISALNUM(c)) return '#';
/* no magic_comment in shebang line */
if (!parser_magic_comment(parser, lex_p, lex_pend - lex_p)) {
if (comment_at_top(parser)) {
set_file_encoding(parser, lex_p, lex_pend);
Primaryprimary: literal | ... | k_def singleton dot_or_colon {lex_state = EXPR_FNAME;} fname { in_single++; lex_state = EXPR_END; /* force for args */ /*%%%*/ local_push(0); /*% %*/ } f_arglist bodystmt k_end { /*%%%*/ NODE *body = remove_begin($8); reduce_nodes(&body); $$ = NEW_DEFS($2, $5, $7, body); fixpos($$, $2); local_pop(); /*% $$ = dispatch5(defs, $2, $3, $5, $7, $8); %*/ in_single--; }
| k_def cname '#' {lex_state = EXPR_FNAME;} fname { $<id>$ = cur_mid; cur_mid = $5; in_def++; /*%%%*/ local_push(0); /*% %*/ } f_arglist bodystmt k_end { /*%%%*/ NODE *body = remove_begin($8); reduce_nodes(&body); $$ = NEW_DEFN($5, $7, body, NOEX_PRIVATE); fixpos($$, $7); fixpos($$->nd_defn, $7); $$ = NEW_CLASS(NEW_COLON3($2), $$, 0); nd_set_line($$, $<num>6); local_pop(); /*% $$ = dispatch4(defi, $2, $5, $7, $8); %*/ in_def--; cur_mid = $<id>6; }
Reference
Rubyソースコード完全解説
青木峰郎 著、まつもとゆきひろ 監修
Minero AOKI, Yukihiro MATSUMOTO "Ruby Hacking Guide"
HTML Version is available
Reference
•My blog: http://ujihisa.blogspot.com
•All patches I showed are there
end
Appendix:Imaginary Numbers
•Matz wrote a patch in [ruby-dev:38843]
•translation:[ruby-core:24730]
• It won't be accepted
Appendix:Imaginary Numbers
> 3i
=> (0 + 3i)
> 3i.class
=> Complex
Appendix
•{you <3 ruby}
• f(x, y) = z (like f[x, y] = z as f.[]=(x, y, z))
•Annotations!