3434
3535/*
3636 TODO:
37-
37+ - remove REOP_char_i and REOP_range_i by precomputing the case folding.
38+ - add specific opcodes for simple unicode property tests so that the
39+ generated bytecode is smaller.
3840 - Add a lock step execution mode (=linear time execution guaranteed)
3941 when the regular expression is "simple" i.e. no backreference nor
4042 complicated lookahead. The opcodes are designed for this execution
@@ -1078,7 +1080,7 @@ static int get_class_atom(REParseState *s, REStringList *cr,
10781080 goto default_escape ;
10791081 if (cr_init_char_range (s , cr , c ))
10801082 return -1 ;
1081- c = CLASS_RANGE_BASE ;
1083+ c + = CLASS_RANGE_BASE ;
10821084 break ;
10831085 case 'c' :
10841086 c = * p ;
@@ -1584,6 +1586,8 @@ static BOOL re_need_check_adv_and_capture_init(BOOL *pneed_capture_init,
15841586 case REOP_char32_i :
15851587 case REOP_dot :
15861588 case REOP_any :
1589+ case REOP_space :
1590+ case REOP_not_space :
15871591 need_check_adv = FALSE;
15881592 break ;
15891593 case REOP_line_start :
@@ -2028,9 +2032,9 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
20282032 case 'b' :
20292033 case 'B' :
20302034 if (p [1 ] != 'b' ) {
2031- re_emit_op (s , s -> ignore_case ? REOP_not_word_boundary_i : REOP_not_word_boundary );
2035+ re_emit_op (s , s -> ignore_case && s -> is_unicode ? REOP_not_word_boundary_i : REOP_not_word_boundary );
20322036 } else {
2033- re_emit_op (s , s -> ignore_case ? REOP_word_boundary_i : REOP_word_boundary );
2037+ re_emit_op (s , s -> ignore_case && s -> is_unicode ? REOP_word_boundary_i : REOP_word_boundary );
20342038 }
20352039 p += 2 ;
20362040 break ;
@@ -2167,8 +2171,15 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
21672171 if (is_backward_dir )
21682172 re_emit_op (s , REOP_prev );
21692173 if (c >= CLASS_RANGE_BASE ) {
2170- int ret ;
2171- ret = re_emit_string_list (s , cr );
2174+ int ret = 0 ;
2175+ /* optimize the common 'space' tests */
2176+ if (c == (CLASS_RANGE_BASE + CHAR_RANGE_s )) {
2177+ re_emit_op (s , REOP_space );
2178+ } else if (c == (CLASS_RANGE_BASE + CHAR_RANGE_S )) {
2179+ re_emit_op (s , REOP_not_space );
2180+ } else {
2181+ ret = re_emit_string_list (s , cr );
2182+ }
21722183 re_string_list_free (cr );
21732184 if (ret )
21742185 return -1 ;
@@ -2607,14 +2618,6 @@ static BOOL is_line_terminator(uint32_t c)
26072618 return (c == '\n' || c == '\r' || c == CP_LS || c == CP_PS );
26082619}
26092620
2610- static BOOL is_word_char (uint32_t c )
2611- {
2612- return ((c >= '0' && c <= '9' ) ||
2613- (c >= 'a' && c <= 'z' ) ||
2614- (c >= 'A' && c <= 'Z' ) ||
2615- (c == '_' ));
2616- }
2617-
26182621#define GET_CHAR (c , cptr , cbuf_end , cbuf_type ) \
26192622 do { \
26202623 if (cbuf_type == 0) { \
@@ -2769,7 +2772,7 @@ static no_inline int stack_realloc(REExecContext *s, size_t n)
27692772
27702773/* return 1 if match, 0 if not match or < 0 if error. */
27712774static intptr_t lre_exec_backtrack (REExecContext * s , uint8_t * * capture ,
2772- uint8_t * * regs , const uint8_t * pc , const uint8_t * cptr )
2775+ const uint8_t * pc , const uint8_t * cptr )
27732776{
27742777 int opcode ;
27752778 int cbuf_type ;
@@ -2809,24 +2812,24 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
28092812 }
28102813
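28112814 /* set capture[idx] to 'value'; the previous value is first pushed on the stack so that it can be restored when backtracking, unless an entry for idx was already saved since the last backtracking point */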
2812- #define SAVE_REG (idx , value ) \
2815+ #define SAVE_CAPTURE_CHECK (idx , value ) \
28132816 { \
28142817 StackElem *sp1; \
28152818 sp1 = sp; \
28162819 for(;;) { \
28172820 if (sp1 > bp) { \
2818- if (sp1[-2].val == -(int)( idx + 1)) \
2821+ if (sp1[-2].val == idx) \
28192822 break; \
28202823 sp1 -= 2; \
28212824 } else { \
28222825 CHECK_STACK_SPACE(2); \
2823- sp[0].val = -(int)( idx + 1); \
2824- sp[1].ptr = regs [idx]; \
2826+ sp[0].val = idx; \
2827+ sp[1].ptr = capture [idx]; \
28252828 sp += 2; \
28262829 break; \
28272830 } \
28282831 } \
2829- regs [idx] = (value); \
2832+ capture [idx] = (value); \
28302833 }
28312834
28322835
@@ -2851,13 +2854,9 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
28512854 REExecStateEnum type ;
28522855 if (bp == s -> stack_buf )
28532856 return 0 ;
2854- /* undo the modifications to capture[] and regs[] */
2857+ /* undo the modifications to capture[] */
28552858 while (sp > bp ) {
2856- intptr_t idx2 = sp [-2 ].val ;
2857- if (idx2 >= 0 )
2858- capture [idx2 ] = sp [-1 ].ptr ;
2859- else
2860- regs [- idx2 - 1 ] = sp [-1 ].ptr ;
2859+ capture [sp [-2 ].val ] = sp [-1 ].ptr ;
28612860 sp -= 2 ;
28622861 }
28632862
@@ -2910,13 +2909,9 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
29102909 for (;;) {
29112910 REExecStateEnum type ;
29122911 type = bp [-1 ].bp .type ;
2913- /* undo the modifications to capture[] and regs[] */
2912+ /* undo the modifications to capture[] */
29142913 while (sp > bp ) {
2915- intptr_t idx2 = sp [-2 ].val ;
2916- if (idx2 >= 0 )
2917- capture [idx2 ] = sp [-1 ].ptr ;
2918- else
2919- regs [- idx2 - 1 ] = sp [-1 ].ptr ;
2914+ capture [sp [-2 ].val ] = sp [-1 ].ptr ;
29202915 sp -= 2 ;
29212916 }
29222917 pc = sp [-3 ].ptr ;
@@ -3019,6 +3014,20 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
30193014 goto no_match ;
30203015 GET_CHAR (c , cptr , cbuf_end , cbuf_type );
30213016 break ;
3017+ case REOP_space :
3018+ if (cptr == cbuf_end )
3019+ goto no_match ;
3020+ GET_CHAR (c , cptr , cbuf_end , cbuf_type );
3021+ if (!lre_is_space (c ))
3022+ goto no_match ;
3023+ break ;
3024+ case REOP_not_space :
3025+ if (cptr == cbuf_end )
3026+ goto no_match ;
3027+ GET_CHAR (c , cptr , cbuf_end , cbuf_type );
3028+ if (lre_is_space (c ))
3029+ goto no_match ;
3030+ break ;
30223031 case REOP_save_start :
30233032 case REOP_save_end :
30243033 val = * pc ++ ;
@@ -3044,20 +3053,20 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
30443053 }
30453054 break ;
30463055 case REOP_set_i32 :
3047- idx = pc [0 ];
3056+ idx = 2 * s -> capture_count + pc [0 ];
30483057 val = get_u32 (pc + 1 );
30493058 pc += 5 ;
3050- SAVE_REG (idx , (void * )(uintptr_t )val );
3059+ SAVE_CAPTURE_CHECK (idx , (void * )(uintptr_t )val );
30513060 break ;
30523061 case REOP_loop :
30533062 {
30543063 uint32_t val2 ;
3055- idx = pc [0 ];
3064+ idx = 2 * s -> capture_count + pc [0 ];
30563065 val = get_u32 (pc + 1 );
30573066 pc += 5 ;
30583067
3059- val2 = (uintptr_t )regs [idx ] - 1 ;
3060- SAVE_REG (idx , (void * )(uintptr_t )val2 );
3068+ val2 = (uintptr_t )capture [idx ] - 1 ;
3069+ SAVE_CAPTURE_CHECK (idx , (void * )(uintptr_t )val2 );
30613070 if (val2 != 0 ) {
30623071 pc += (int )val ;
30633072 if (lre_poll_timeout (s ))
@@ -3072,14 +3081,14 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
30723081 {
30733082 const uint8_t * pc1 ;
30743083 uint32_t val2 , limit ;
3075- idx = pc [0 ];
3084+ idx = 2 * s -> capture_count + pc [0 ];
30763085 limit = get_u32 (pc + 1 );
30773086 val = get_u32 (pc + 5 );
30783087 pc += 9 ;
30793088
30803089 /* decrement the counter */
3081- val2 = (uintptr_t )regs [idx ] - 1 ;
3082- SAVE_REG (idx , (void * )(uintptr_t )val2 );
3090+ val2 = (uintptr_t )capture [idx ] - 1 ;
3091+ SAVE_CAPTURE_CHECK (idx , (void * )(uintptr_t )val2 );
30833092
30843093 if (val2 > limit ) {
30853094 /* normal loop if counter > limit */
@@ -3090,7 +3099,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
30903099 /* check advance */
30913100 if ((opcode == REOP_loop_check_adv_split_goto_first ||
30923101 opcode == REOP_loop_check_adv_split_next_first ) &&
3093- regs [idx + 1 ] == cptr &&
3102+ capture [idx + 1 ] == cptr &&
30943103 val2 != limit ) {
30953104 goto no_match ;
30963105 }
@@ -3116,14 +3125,14 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
31163125 }
31173126 break ;
31183127 case REOP_set_char_pos :
3119- idx = pc [0 ];
3128+ idx = 2 * s -> capture_count + pc [0 ];
31203129 pc ++ ;
3121- SAVE_REG (idx , (uint8_t * )cptr );
3130+ SAVE_CAPTURE_CHECK (idx , (uint8_t * )cptr );
31223131 break ;
31233132 case REOP_check_advance :
3124- idx = pc [0 ];
3133+ idx = 2 * s -> capture_count + pc [0 ];
31253134 pc ++ ;
3126- if (regs [idx ] == cptr )
3135+ if (capture [idx ] == cptr )
31273136 goto no_match ;
31283137 break ;
31293138 case REOP_word_boundary :
@@ -3139,18 +3148,22 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
31393148 v1 = FALSE;
31403149 } else {
31413150 PEEK_PREV_CHAR (c , cptr , s -> cbuf , cbuf_type );
3142- if (ignore_case )
3143- c = lre_canonicalize (c , s -> is_unicode );
3144- v1 = is_word_char (c );
3151+ if (c < 256 ) {
3152+ v1 = (lre_is_word_byte (c ) != 0 );
3153+ } else {
3154+ v1 = ignore_case && (c == 0x017f || c == 0x212a );
3155+ }
31453156 }
31463157 /* current char */
31473158 if (cptr >= cbuf_end ) {
31483159 v2 = FALSE;
31493160 } else {
31503161 PEEK_CHAR (c , cptr , cbuf_end , cbuf_type );
3151- if (ignore_case )
3152- c = lre_canonicalize (c , s -> is_unicode );
3153- v2 = is_word_char (c );
3162+ if (c < 256 ) {
3163+ v2 = (lre_is_word_byte (c ) != 0 );
3164+ } else {
3165+ v2 = ignore_case && (c == 0x017f || c == 0x212a );
3166+ }
31543167 }
31553168 if (v1 ^ v2 ^ is_boundary )
31563169 goto no_match ;
@@ -3315,8 +3328,7 @@ int lre_exec(uint8_t **capture,
33153328 int cbuf_type , void * opaque )
33163329{
33173330 REExecContext s_s , * s = & s_s ;
3318- int re_flags , i , ret , register_count ;
3319- uint8_t * * regs ;
3331+ int re_flags , i , ret ;
33203332 const uint8_t * cptr ;
33213333
33223334 re_flags = lre_get_flags (bc_buf );
@@ -3335,10 +3347,6 @@ int lre_exec(uint8_t **capture,
33353347
33363348 for (i = 0 ; i < s -> capture_count * 2 ; i ++ )
33373349 capture [i ] = NULL ;
3338- /* XXX: modify the API so that the registers are allocated after
3339- the captures to suppress some tests */
3340- register_count = bc_buf [RE_HEADER_REGISTER_COUNT ];
3341- regs = alloca (register_count * sizeof (regs [0 ]));
33423350
33433351 cptr = cbuf + (cindex << cbuf_type );
33443352 if (0 < cindex && cindex < clen && s -> cbuf_type == 2 ) {
@@ -3348,13 +3356,19 @@ int lre_exec(uint8_t **capture,
33483356 }
33493357 }
33503358
3351- ret = lre_exec_backtrack (s , capture , regs , bc_buf + RE_HEADER_LEN ,
3352- cptr );
3359+ ret = lre_exec_backtrack (s , capture , bc_buf + RE_HEADER_LEN , cptr );
3360+
33533361 if (s -> stack_buf != s -> static_stack_buf )
33543362 lre_realloc (s -> opaque , s -> stack_buf , 0 );
33553363 return ret ;
33563364}
33573365
/* Return the number of pointer slots the caller must provide in the
   'capture' array passed to lre_exec(): two slots per capture group
   followed by one slot per register. The matcher addresses register
   number r as capture[2 * capture_count + r]. Both counts are read
   from the bytecode header. */
3366+ int lre_get_alloc_count (const uint8_t * bc_buf )
3367+ {
3368+ return bc_buf [RE_HEADER_CAPTURE_COUNT ] * 2 +
3369+ bc_buf [RE_HEADER_REGISTER_COUNT ];
3370+ }
3371+
33583372int lre_get_capture_count (const uint8_t * bc_buf )
33593373{
33603374 return bc_buf [RE_HEADER_CAPTURE_COUNT ];
@@ -3393,7 +3407,7 @@ int main(int argc, char **argv)
33933407 int len , flags , ret , i ;
33943408 uint8_t * bc ;
33953409 char error_msg [64 ];
3396- uint8_t * capture [ CAPTURE_COUNT_MAX * 2 ] ;
3410+ uint8_t * * capture ; /* array of pointer slots handed to lre_exec(); it must be a pointer-to-pointer so that sizeof(capture[0]) is the size of one slot when the array is allocated with lre_get_alloc_count() */
33973411 const char * input ;
33983412 int input_len , capture_count ;
33993413
@@ -3412,6 +3426,7 @@ int main(int argc, char **argv)
34123426 input = argv [3 ];
34133427 input_len = strlen (input );
34143428
3429+ capture = malloc (sizeof (capture [0 ]) * lre_get_alloc_count (bc ));
34153430 ret = lre_exec (capture , bc , (uint8_t * )input , 0 , input_len , 0 , NULL );
34163431 printf ("ret=%d\n" , ret );
34173432 if (ret == 1 ) {
@@ -3427,6 +3442,7 @@ int main(int argc, char **argv)
34273442 printf ("\n" );
34283443 }
34293444 }
3445+ free (capture );
34303446 return 0 ;
34313447}
34323448#endif
0 commit comments