diff --git a/ChangeLog b/ChangeLog index cca2cb31..9dbf8f97 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,11 @@ +Wed Mar 22 00:14:34 CET 2006 Daniel Veillard + + * xmlregexp.c: applied patch from Youri Golovanov fixing bug + #316338 and adding a couple of optimizations in the regexp + compilation engine. + * test/regexp/bug316338 result/regexp/bug316338: added regression + tests based on the examples provided in the bug report. + Fri Mar 10 08:40:55 EST 2006 Daniel Veillard * c14n.c encoding.c xmlschemas.c xpath.c xpointer.c: fix a few diff --git a/result/regexp/bug316338 b/result/regexp/bug316338 new file mode 100644 index 00000000..1cd1ac0e --- /dev/null +++ b/result/regexp/bug316338 @@ -0,0 +1,20 @@ +Regexp: (((C|c)(([\s]*\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?){3})+[\s]*))+ +C 433: Ok +C 433 12: Fail +C 433 123: Ok +C 433 123 456: Ok +C 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12: Fail +Regexp: (((C|c)(([\s]+\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?){3})+[\s]*))+ +C 433: Fail +C 433 12: Fail +C 433 123: Fail +C 433 123 456: Ok +C 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12: Fail +Regexp: (((C|c)(([\s]+\-?[0-9]+(\.[0-9]+)?[\s]*,[\s]*\-?[0-9]+(\.[0-9]+)?){3})+[\s]*))+ +C 433: Fail +C 433 12: Fail +C 433 123: Fail +C 433 123 456: Fail +C 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12: Fail +Regexp: (((M|m)([\s]+\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?[\s]*,[\s]*\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?)[\s]*)|((L|l)([\s]+\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?[\s]*,[\s]*\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?)+[\s]*)|((H|h)([\s]+\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?)+[\s]*)|((V|v)([\s]+\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?)+[\s]*)|((C|c)(([\s]+\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?[\s]*,[\s]*\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?){3})+[\s]*)|((Q|q)(([\s]+\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?[\s]*,[\s]*\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?){2})+[\s]*)|((S|s)(([\s]+\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?[\s]*,[\s]*\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?){2})+[\s]*)|((A|a)([\s]+\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?[\s]*,[\s]*\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?[\s]*\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?[\s]+[0-1][\s]+[0-1][\s]+\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?[\s]*,[\s]*\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?)+[\s]*)|((Z|z)[\s]*))* +M 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12: Fail diff --git a/test/regexp/bug316338 b/test/regexp/bug316338 new file mode 100644 index 00000000..98c78385 --- /dev/null +++ b/test/regexp/bug316338 @@ -0,0 +1,20 @@ +=>(((C|c)(([\s]*\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?){3})+[\s]*))+ +C 433 +C 433 12 +C 433 123 +C 433 123 456 +C 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 +=>(((C|c)(([\s]+\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?){3})+[\s]*))+ +C 433 +C 433 12 +C 433 123 +C 433 123 456 +C 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 +=>(((C|c)(([\s]+\-?[0-9]+(\.[0-9]+)?[\s]*,[\s]*\-?[0-9]+(\.[0-9]+)?){3})+[\s]*))+ +C 433 +C 433 12 +C 433 123 +C 433 123 456 +C 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 +=>(((M|m)([\s]+\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?[\s]*,[\s]*\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?)[\s]*)|((L|l)([\s]+\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?[\s]*,[\s]*\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?)+[\s]*)|((H|h)([\s]+\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?)+[\s]*)|((V|v)([\s]+\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?)+[\s]*)|((C|c)(([\s]+\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?[\s]*,[\s]*\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?){3})+[\s]*)|((Q|q)(([\s]+\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?[\s]*,[\s]*\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?){2})+[\s]*)|((S|s)(([\s]+\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?[\s]*,[\s]*\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?){2})+[\s]*)|((A|a)([\s]+\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?[\s]*,[\s]*\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?[\s]*\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?[\s]+[0-1][\s]+[0-1][\s]+\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?[\s]*,[\s]*\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?)+[\s]*)|((Z|z)[\s]*))* +M 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 diff --git a/xmlregexp.c b/xmlregexp.c index 8d7ee97f..58f480da 100644 --- a/xmlregexp.c +++ b/xmlregexp.c @@ -1479,7 +1479,13 @@ xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from, switch (atom->quant) { case XML_REGEXP_QUANT_OPT: atom->quant = XML_REGEXP_QUANT_ONCE; - xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop); + /* + * transition done to the state after end of atom. + * 1. set transition from atom start to new state + * 2. set transition from atom end to this state. + */ + xmlFAGenerateEpsilonTransition(ctxt, atom->start, 0); + xmlFAGenerateEpsilonTransition(ctxt, atom->stop, ctxt->state); break; case XML_REGEXP_QUANT_MULT: atom->quant = XML_REGEXP_QUANT_ONCE; @@ -1522,8 +1528,6 @@ xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from, atom->min = 0; atom->max = 0; atom->quant = XML_REGEXP_QUANT_ONCE; - xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop, - atom->start, counter); if (to != NULL) { newstate = to; } else { @@ -1533,6 +1537,13 @@ xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from, ctxt->state = newstate; xmlFAGenerateCountedTransition(ctxt, atom->stop, newstate, counter); + + /* + * first check count and if OK jump forward; + * if checking fail increment count and jump back + */ + xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop, + atom->start, counter); } default: break; @@ -4299,6 +4310,15 @@ xmlRegExecPushChar(xmlRegExecCtxtPtr exec, int UCS) { if (exec->state->nbTrans > exec->transno + 1) { xmlFARegExecSave(exec); } + /* + * restart count for expressions like this ((abc){2})* + */ + if (trans->count >= 0) { +#ifdef DEBUG_REGEXP_EXEC + printf("Reset count %d\n", trans->count); +#endif + exec->counts[trans->count] = 0; + } if (trans->counter >= 0) { #ifdef DEBUG_REGEXP_EXEC printf("Increasing count %d\n", trans->counter); @@ -5112,19 +5132,23 @@ xmlFAParsePiece(xmlRegParserCtxtPtr ctxt) { /** * xmlFAParseBranch: * @ctxt: a regexp parser context + * @to: optional target to the end of the branch + * + * @to is used to optimize by removing duplicate path in automata + * in expressions like (a|b)(c|d) * * [2] branch ::= piece* - 8 */ static int -xmlFAParseBranch(xmlRegParserCtxtPtr ctxt) { +xmlFAParseBranch(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr to) { xmlRegStatePtr previous; int ret; previous = ctxt->state; ret = xmlFAParsePiece(ctxt); if (ret != 0) { - if (xmlFAGenerateTransitions(ctxt, previous, NULL, ctxt->atom) < 0) + if (xmlFAGenerateTransitions(ctxt, previous, + (CUR=='|' || CUR==')') ? to : NULL, ctxt->atom) < 0) return(-1); previous = ctxt->state; ctxt->atom = NULL; @@ -5132,8 +5156,8 @@ xmlFAParseBranch(xmlRegParserCtxtPtr ctxt) { while ((ret != 0) && (ctxt->error == 0)) { ret = xmlFAParsePiece(ctxt); if (ret != 0) { - if (xmlFAGenerateTransitions(ctxt, previous, NULL, - ctxt->atom) < 0) + if (xmlFAGenerateTransitions(ctxt, previous, + (CUR=='|' || CUR==')') ? to : NULL, ctxt->atom) < 0) return(-1); previous = ctxt->state; ctxt->atom = NULL; @@ -5156,7 +5180,7 @@ xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) { /* if not top start should have been generated by an epsilon trans */ start = ctxt->state; ctxt->end = NULL; - xmlFAParseBranch(ctxt); + xmlFAParseBranch(ctxt, NULL); if (top) { #ifdef DEBUG_REGEXP_GRAPH printf("State %d is final\n", ctxt->state->no); @@ -5172,15 +5196,7 @@ xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) { NEXT; ctxt->state = start; ctxt->end = NULL; - xmlFAParseBranch(ctxt); - if (top) { - ctxt->state->type = XML_REGEXP_FINAL_STATE; -#ifdef DEBUG_REGEXP_GRAPH - printf("State %d is final\n", ctxt->state->no); -#endif - } else { - xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, end); - } + xmlFAParseBranch(ctxt, end); } if (!top) { ctxt->state = end;