Skip to content

Commit f113949

Browse files
author
Fabrice Bellard
committed
regexp: removed alloca() in lre_exec() - added specific opcodes for \s and \S to have a smaller bytecode - optimized \b and \B
1 parent 7bd1ae2 commit f113949

File tree

5 files changed

+94
-69
lines changed

5 files changed

+94
-69
lines changed

‎libregexp-opcode.h‎

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ DEF(char32, 5)
3131
DEF(char32_i, 5)
3232
DEF(dot, 1)
3333
DEF(any, 1) /* same as dot but match any character including line terminator */
34+
DEF(space, 1)
35+
DEF(not_space, 1) /* must come after space */
3436
DEF(line_start, 1)
3537
DEF(line_start_m, 1)
3638
DEF(line_end, 1)

‎libregexp.c‎

Lines changed: 76 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,9 @@
3434

3535
/*
3636
TODO:
37-
37+
- remove REOP_char_i and REOP_range_i by precomputing the case folding.
38+
- add specific opcodes for simple unicode property tests so that the
39+
generated bytecode is smaller.
3840
- Add a lock step execution mode (=linear time execution guaranteed)
3941
when the regular expression is "simple" i.e. no backreference nor
4042
complicated lookahead. The opcodes are designed for this execution
@@ -1078,7 +1080,7 @@ static int get_class_atom(REParseState *s, REStringList *cr,
10781080
goto default_escape;
10791081
if (cr_init_char_range(s, cr, c))
10801082
return -1;
1081-
c = CLASS_RANGE_BASE;
1083+
c += CLASS_RANGE_BASE;
10821084
break;
10831085
case 'c':
10841086
c = *p;
@@ -1584,6 +1586,8 @@ static BOOL re_need_check_adv_and_capture_init(BOOL *pneed_capture_init,
15841586
case REOP_char32_i:
15851587
case REOP_dot:
15861588
case REOP_any:
1589+
case REOP_space:
1590+
case REOP_not_space:
15871591
need_check_adv = FALSE;
15881592
break;
15891593
case REOP_line_start:
@@ -2028,9 +2032,9 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
20282032
case 'b':
20292033
case 'B':
20302034
if (p[1] != 'b') {
2031-
re_emit_op(s, s->ignore_case ? REOP_not_word_boundary_i : REOP_not_word_boundary);
2035+
re_emit_op(s, s->ignore_case && s->is_unicode ? REOP_not_word_boundary_i : REOP_not_word_boundary);
20322036
} else {
2033-
re_emit_op(s, s->ignore_case ? REOP_word_boundary_i : REOP_word_boundary);
2037+
re_emit_op(s, s->ignore_case && s->is_unicode ? REOP_word_boundary_i : REOP_word_boundary);
20342038
}
20352039
p += 2;
20362040
break;
@@ -2167,8 +2171,15 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
21672171
if (is_backward_dir)
21682172
re_emit_op(s, REOP_prev);
21692173
if (c >= CLASS_RANGE_BASE) {
2170-
int ret;
2171-
ret = re_emit_string_list(s, cr);
2174+
int ret = 0;
2175+
/* optimize the common 'space' tests */
2176+
if (c == (CLASS_RANGE_BASE + CHAR_RANGE_s)) {
2177+
re_emit_op(s, REOP_space);
2178+
} else if (c == (CLASS_RANGE_BASE + CHAR_RANGE_S)) {
2179+
re_emit_op(s, REOP_not_space);
2180+
} else {
2181+
ret = re_emit_string_list(s, cr);
2182+
}
21722183
re_string_list_free(cr);
21732184
if (ret)
21742185
return -1;
@@ -2607,14 +2618,6 @@ static BOOL is_line_terminator(uint32_t c)
26072618
return (c == '\n' || c == '\r' || c == CP_LS || c == CP_PS);
26082619
}
26092620

2610-
static BOOL is_word_char(uint32_t c)
2611-
{
2612-
return ((c >= '0' && c <= '9') ||
2613-
(c >= 'a' && c <= 'z') ||
2614-
(c >= 'A' && c <= 'Z') ||
2615-
(c == '_'));
2616-
}
2617-
26182621
#define GET_CHAR(c, cptr, cbuf_end, cbuf_type) \
26192622
do { \
26202623
if (cbuf_type == 0) { \
@@ -2769,7 +2772,7 @@ static no_inline int stack_realloc(REExecContext *s, size_t n)
27692772

27702773
/* return 1 if match, 0 if not match or < 0 if error. */
27712774
static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
2772-
uint8_t **regs, const uint8_t *pc, const uint8_t *cptr)
2775+
const uint8_t *pc, const uint8_t *cptr)
27732776
{
27742777
int opcode;
27752778
int cbuf_type;
@@ -2809,24 +2812,24 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
28092812
}
28102813

28112814
/* avoid saving the previous value if already saved */
2812-
#define SAVE_REG(idx, value) \
2815+
#define SAVE_CAPTURE_CHECK(idx, value) \
28132816
{ \
28142817
StackElem *sp1; \
28152818
sp1 = sp; \
28162819
for(;;) { \
28172820
if (sp1 > bp) { \
2818-
if (sp1[-2].val == -(int)(idx + 1)) \
2821+
if (sp1[-2].val == idx) \
28192822
break; \
28202823
sp1 -= 2; \
28212824
} else { \
28222825
CHECK_STACK_SPACE(2); \
2823-
sp[0].val = -(int)(idx + 1); \
2824-
sp[1].ptr = regs[idx]; \
2826+
sp[0].val = idx; \
2827+
sp[1].ptr = capture[idx]; \
28252828
sp += 2; \
28262829
break; \
28272830
} \
28282831
} \
2829-
regs[idx] = (value); \
2832+
capture[idx] = (value); \
28302833
}
28312834

28322835

@@ -2851,13 +2854,9 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
28512854
REExecStateEnum type;
28522855
if (bp == s->stack_buf)
28532856
return 0;
2854-
/* undo the modifications to capture[] and regs[] */
2857+
/* undo the modifications to capture[] */
28552858
while (sp > bp) {
2856-
intptr_t idx2 = sp[-2].val;
2857-
if (idx2 >= 0)
2858-
capture[idx2] = sp[-1].ptr;
2859-
else
2860-
regs[-idx2 - 1] = sp[-1].ptr;
2859+
capture[sp[-2].val] = sp[-1].ptr;
28612860
sp -= 2;
28622861
}
28632862

@@ -2910,13 +2909,9 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
29102909
for(;;) {
29112910
REExecStateEnum type;
29122911
type = bp[-1].bp.type;
2913-
/* undo the modifications to capture[] and regs[] */
2912+
/* undo the modifications to capture[] */
29142913
while (sp > bp) {
2915-
intptr_t idx2 = sp[-2].val;
2916-
if (idx2 >= 0)
2917-
capture[idx2] = sp[-1].ptr;
2918-
else
2919-
regs[-idx2 - 1] = sp[-1].ptr;
2914+
capture[sp[-2].val] = sp[-1].ptr;
29202915
sp -= 2;
29212916
}
29222917
pc = sp[-3].ptr;
@@ -3019,6 +3014,20 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
30193014
goto no_match;
30203015
GET_CHAR(c, cptr, cbuf_end, cbuf_type);
30213016
break;
3017+
case REOP_space:
3018+
if (cptr == cbuf_end)
3019+
goto no_match;
3020+
GET_CHAR(c, cptr, cbuf_end, cbuf_type);
3021+
if (!lre_is_space(c))
3022+
goto no_match;
3023+
break;
3024+
case REOP_not_space:
3025+
if (cptr == cbuf_end)
3026+
goto no_match;
3027+
GET_CHAR(c, cptr, cbuf_end, cbuf_type);
3028+
if (lre_is_space(c))
3029+
goto no_match;
3030+
break;
30223031
case REOP_save_start:
30233032
case REOP_save_end:
30243033
val = *pc++;
@@ -3044,20 +3053,20 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
30443053
}
30453054
break;
30463055
case REOP_set_i32:
3047-
idx = pc[0];
3056+
idx = 2 * s->capture_count + pc[0];
30483057
val = get_u32(pc + 1);
30493058
pc += 5;
3050-
SAVE_REG(idx, (void *)(uintptr_t)val);
3059+
SAVE_CAPTURE_CHECK(idx, (void *)(uintptr_t)val);
30513060
break;
30523061
case REOP_loop:
30533062
{
30543063
uint32_t val2;
3055-
idx = pc[0];
3064+
idx = 2 * s->capture_count + pc[0];
30563065
val = get_u32(pc + 1);
30573066
pc += 5;
30583067

3059-
val2 = (uintptr_t)regs[idx] - 1;
3060-
SAVE_REG(idx, (void *)(uintptr_t)val2);
3068+
val2 = (uintptr_t)capture[idx] - 1;
3069+
SAVE_CAPTURE_CHECK(idx, (void *)(uintptr_t)val2);
30613070
if (val2 != 0) {
30623071
pc += (int)val;
30633072
if (lre_poll_timeout(s))
@@ -3072,14 +3081,14 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
30723081
{
30733082
const uint8_t *pc1;
30743083
uint32_t val2, limit;
3075-
idx = pc[0];
3084+
idx = 2 * s->capture_count + pc[0];
30763085
limit = get_u32(pc + 1);
30773086
val = get_u32(pc + 5);
30783087
pc += 9;
30793088

30803089
/* decrement the counter */
3081-
val2 = (uintptr_t)regs[idx] - 1;
3082-
SAVE_REG(idx, (void *)(uintptr_t)val2);
3090+
val2 = (uintptr_t)capture[idx] - 1;
3091+
SAVE_CAPTURE_CHECK(idx, (void *)(uintptr_t)val2);
30833092

30843093
if (val2 > limit) {
30853094
/* normal loop if counter > limit */
@@ -3090,7 +3099,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
30903099
/* check advance */
30913100
if ((opcode == REOP_loop_check_adv_split_goto_first ||
30923101
opcode == REOP_loop_check_adv_split_next_first) &&
3093-
regs[idx + 1] == cptr &&
3102+
capture[idx + 1] == cptr &&
30943103
val2 != limit) {
30953104
goto no_match;
30963105
}
@@ -3116,14 +3125,14 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
31163125
}
31173126
break;
31183127
case REOP_set_char_pos:
3119-
idx = pc[0];
3128+
idx = 2 * s->capture_count + pc[0];
31203129
pc++;
3121-
SAVE_REG(idx, (uint8_t *)cptr);
3130+
SAVE_CAPTURE_CHECK(idx, (uint8_t *)cptr);
31223131
break;
31233132
case REOP_check_advance:
3124-
idx = pc[0];
3133+
idx = 2 * s->capture_count + pc[0];
31253134
pc++;
3126-
if (regs[idx] == cptr)
3135+
if (capture[idx] == cptr)
31273136
goto no_match;
31283137
break;
31293138
case REOP_word_boundary:
@@ -3139,18 +3148,22 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
31393148
v1 = FALSE;
31403149
} else {
31413150
PEEK_PREV_CHAR(c, cptr, s->cbuf, cbuf_type);
3142-
if (ignore_case)
3143-
c = lre_canonicalize(c, s->is_unicode);
3144-
v1 = is_word_char(c);
3151+
if (c < 256) {
3152+
v1 = (lre_is_word_byte(c) != 0);
3153+
} else {
3154+
v1 = ignore_case && (c == 0x017f || c == 0x212a);
3155+
}
31453156
}
31463157
/* current char */
31473158
if (cptr >= cbuf_end) {
31483159
v2 = FALSE;
31493160
} else {
31503161
PEEK_CHAR(c, cptr, cbuf_end, cbuf_type);
3151-
if (ignore_case)
3152-
c = lre_canonicalize(c, s->is_unicode);
3153-
v2 = is_word_char(c);
3162+
if (c < 256) {
3163+
v2 = (lre_is_word_byte(c) != 0);
3164+
} else {
3165+
v2 = ignore_case && (c == 0x017f || c == 0x212a);
3166+
}
31543167
}
31553168
if (v1 ^ v2 ^ is_boundary)
31563169
goto no_match;
@@ -3315,8 +3328,7 @@ int lre_exec(uint8_t **capture,
33153328
int cbuf_type, void *opaque)
33163329
{
33173330
REExecContext s_s, *s = &s_s;
3318-
int re_flags, i, ret, register_count;
3319-
uint8_t **regs;
3331+
int re_flags, i, ret;
33203332
const uint8_t *cptr;
33213333

33223334
re_flags = lre_get_flags(bc_buf);
@@ -3335,10 +3347,6 @@ int lre_exec(uint8_t **capture,
33353347

33363348
for(i = 0; i < s->capture_count * 2; i++)
33373349
capture[i] = NULL;
3338-
/* XXX: modify the API so that the registers are allocated after
3339-
the captures to suppress some tests */
3340-
register_count = bc_buf[RE_HEADER_REGISTER_COUNT];
3341-
regs = alloca(register_count * sizeof(regs[0]));
33423350

33433351
cptr = cbuf + (cindex << cbuf_type);
33443352
if (0 < cindex && cindex < clen && s->cbuf_type == 2) {
@@ -3348,13 +3356,19 @@ int lre_exec(uint8_t **capture,
33483356
}
33493357
}
33503358

3351-
ret = lre_exec_backtrack(s, capture, regs, bc_buf + RE_HEADER_LEN,
3352-
cptr);
3359+
ret = lre_exec_backtrack(s, capture, bc_buf + RE_HEADER_LEN, cptr);
3360+
33533361
if (s->stack_buf != s->static_stack_buf)
33543362
lre_realloc(s->opaque, s->stack_buf, 0);
33553363
return ret;
33563364
}
33573365

3366+
/* Return the number of pointer slots that the 'capture' array passed to
   lre_exec() must provide for the bytecode 'bc_buf': two slots per
   capture group (RE_HEADER_CAPTURE_COUNT * 2) followed by one slot per
   register (RE_HEADER_REGISTER_COUNT). The register slots are addressed
   by the matcher as capture[2 * capture_count + n]. Both counts are read
   from single bytes of the bytecode header. */
int lre_get_alloc_count(const uint8_t *bc_buf)
3367+
{
3368+
return bc_buf[RE_HEADER_CAPTURE_COUNT] * 2 +
3369+
bc_buf[RE_HEADER_REGISTER_COUNT];
3370+
}
3371+
33583372
int lre_get_capture_count(const uint8_t *bc_buf)
33593373
{
33603374
return bc_buf[RE_HEADER_CAPTURE_COUNT];
@@ -3393,7 +3407,7 @@ int main(int argc, char **argv)
33933407
int len, flags, ret, i;
33943408
uint8_t *bc;
33953409
char error_msg[64];
3396-
uint8_t *capture[CAPTURE_COUNT_MAX * 2];
3410+
uint8_t *capture;
33973411
const char *input;
33983412
int input_len, capture_count;
33993413

@@ -3412,6 +3426,7 @@ int main(int argc, char **argv)
34123426
input = argv[3];
34133427
input_len = strlen(input);
34143428

3429+
capture = malloc(sizeof(capture[0]) * lre_get_alloc_count(bc));
34153430
ret = lre_exec(capture, bc, (uint8_t *)input, 0, input_len, 0, NULL);
34163431
printf("ret=%d\n", ret);
34173432
if (ret == 1) {
@@ -3427,6 +3442,7 @@ int main(int argc, char **argv)
34273442
printf("\n");
34283443
}
34293444
}
3445+
free(capture);
34303446
return 0;
34313447
}
34323448
#endif

‎libregexp.h‎

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size,
4747
const char *buf, size_t buf_len, int re_flags,
4848
void *opaque);
49+
int lre_get_alloc_count(const uint8_t *bc_buf);
4950
int lre_get_capture_count(const uint8_t *bc_buf);
5051
int lre_get_flags(const uint8_t *bc_buf);
5152
const char *lre_get_groupnames(const uint8_t *bc_buf);

‎libunicode.h‎

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,11 @@ static inline int lre_is_id_continue_byte(uint8_t c) {
147147
UNICODE_C_DIGIT);
148148
}
149149

150+
/* Return a non-zero value if the lre_ctype_bits[] entry for the byte 'c'
   has at least one of the UNICODE_C_UPPER, UNICODE_C_LOWER,
   UNICODE_C_UNDER or UNICODE_C_DIGIT bits set, and 0 otherwise. The
   result is the masked bit value, not normalized to 0/1, so callers
   needing a boolean must compare it against 0. */
static inline int lre_is_word_byte(uint8_t c) {
151+
return lre_ctype_bits[c] & (UNICODE_C_UPPER | UNICODE_C_LOWER |
152+
UNICODE_C_UNDER | UNICODE_C_DIGIT);
153+
}
154+
150155
int lre_is_space_non_ascii(uint32_t c);
151156

152157
static inline int lre_is_space(uint32_t c) {

0 commit comments

Comments
 (0)