diff options
Diffstat (limited to 'v_windows/v/vlib/regex/regex.v')
-rw-r--r-- | v_windows/v/vlib/regex/regex.v | 2324 |
1 files changed, 2324 insertions, 0 deletions
diff --git a/v_windows/v/vlib/regex/regex.v b/v_windows/v/vlib/regex/regex.v new file mode 100644 index 0000000..9e630e1 --- /dev/null +++ b/v_windows/v/vlib/regex/regex.v @@ -0,0 +1,2324 @@ +/* +regex 1.0 alpha + +Copyright (c) 2019-2021 Dario Deledda. All rights reserved. +Use of this source code is governed by an MIT license +that can be found in the LICENSE file. + +This file contains regex module + +Know limitation: +- find is implemented in a trivial way +- not full compliant PCRE +- not compliant POSIX ERE +*/ +module regex + +import strings + +pub const ( + v_regex_version = '1.0 alpha' // regex module version + + max_code_len = 256 // default small base code len for the regex programs + max_quantifier = 1073741824 // default max repetitions allowed for the quantifiers = 2^30 + // spaces chars (here only westerns!!) TODO: manage all the spaces from unicode + spaces = [` `, `\t`, `\n`, `\r`, `\v`, `\f`] + // new line chars for now only '\n' + new_line_list = [`\n`, `\r`] + + // Results + no_match_found = -1 + + // Errors + compile_ok = 0 // the regex string compiled, all ok + err_char_unknown = -2 // the char used is unknow to the system + err_undefined = -3 // the compiler symbol is undefined + err_internal_error = -4 // Bug in the regex system!! + err_cc_alloc_overflow = -5 // memory for char class full!! + err_syntax_error = -6 // syntax error in regex compiling + err_groups_overflow = -7 // max number of groups reached + err_groups_max_nested = -8 // max number of nested group reached + err_group_not_balanced = -9 // group not balanced + err_group_qm_notation = -10 // group invalid notation +) + +const ( + //************************************* + // regex program instructions + //************************************* + ist_simple_char = u32(0x7FFFFFFF) // single char instruction, 31 bit available to char + // char class 11 0100 AA xxxxxxxx + // AA = 00 regular class + // AA = 01 Negated class ^ char + ist_char_class = 0xD1000000 // MASK + ist_char_class_pos = 0xD0000000 // char class normal [abc] + ist_char_class_neg = 0xD1000000 // char class negate [^abc] + // dot char 10 0110 xx xxxxxxxx + ist_dot_char = 0x98000000 // match any char except \n + // backslash chars 10 0100 xx xxxxxxxx + ist_bsls_char = 0x90000000 // backslash char + // OR | 10 010Y xx xxxxxxxx + ist_or_branch = 0x91000000 // OR case + // groups 10 010Y xx xxxxxxxx + ist_group_start = 0x92000000 // group start ( + ist_group_end = 0x94000000 // group end ) + // control instructions + ist_prog_end = u32(0x88000000) // 10 0010 xx xxxxxxxx + //************************************* +) + +/* +General Utilities +*/ +// utf8util_char_len calculate the length in bytes of a utf8 char +[inline] +fn utf8util_char_len(b byte) int { + return ((0xe5000000 >> ((b >> 3) & 0x1e)) & 3) + 1 +} + +// get_char get a char from position i and return an u32 with the unicode code +[direct_array_access; inline] +fn (re RE) get_char(in_txt string, i int) (u32, int) { + ini := unsafe { in_txt.str[i] } + // ascii 8 bit + if (re.flag & regex.f_bin) != 0 || ini & 0x80 == 0 { + return u32(ini), 1 + } + // unicode char + char_len := utf8util_char_len(ini) + mut tmp := 0 + mut ch := u32(0) + for tmp < char_len { + ch = (ch << 8) | unsafe { in_txt.str[i + tmp] } + tmp++ + } + return ch, char_len +} + +// get_charb get a char from position i and return an u32 with the unicode code +[direct_array_access; inline] +fn (re RE) get_charb(in_txt &byte, i int) (u32, int) { + // ascii 8 bit + if (re.flag & regex.f_bin) != 0 || unsafe { in_txt[i] } & 0x80 == 0 { + return u32(unsafe { in_txt[i] }), 1 + } + // unicode char + char_len := utf8util_char_len(unsafe { in_txt[i] }) + mut tmp := 0 + mut ch := u32(0) + for tmp < char_len { + ch = (ch << 8) | unsafe { in_txt[i + tmp] } + tmp++ + } + return ch, char_len +} + +[inline] +fn is_alnum(in_char byte) bool { + mut tmp := in_char - `A` + if tmp <= 25 { + return true + } + tmp = in_char - `a` + if tmp <= 25 { + return true + } + tmp = in_char - `0` + if tmp <= 9 { + return true + } + if in_char == `_` { + return true + } + return false +} + +[inline] +fn is_not_alnum(in_char byte) bool { + return !is_alnum(in_char) +} + +[inline] +fn is_space(in_char byte) bool { + return in_char in regex.spaces +} + +[inline] +fn is_not_space(in_char byte) bool { + return !is_space(in_char) +} + +[inline] +fn is_digit(in_char byte) bool { + tmp := in_char - `0` + return tmp <= 0x09 +} + +[inline] +fn is_not_digit(in_char byte) bool { + return !is_digit(in_char) +} + +/* +[inline] +fn is_wordchar(in_char byte) bool { + return is_alnum(in_char) || in_char == `_` +} + +[inline] +fn is_not_wordchar(in_char byte) bool { + return !is_alnum(in_char) +} +*/ + +[inline] +fn is_lower(in_char byte) bool { + tmp := in_char - `a` + return tmp <= 25 +} + +[inline] +fn is_upper(in_char byte) bool { + tmp := in_char - `A` + return tmp <= 25 +} + +pub fn (re RE) get_parse_error_string(err int) string { + match err { + regex.compile_ok { return 'compile_ok' } + regex.no_match_found { return 'no_match_found' } + regex.err_char_unknown { return 'err_char_unknown' } + regex.err_undefined { return 'err_undefined' } + regex.err_internal_error { return 'err_internal_error' } + regex.err_cc_alloc_overflow { return 'err_cc_alloc_overflow' } + regex.err_syntax_error { return 'err_syntax_error' } + regex.err_groups_overflow { return 'err_groups_overflow' } + regex.err_groups_max_nested { return 'err_groups_max_nested' } + regex.err_group_not_balanced { return 'err_group_not_balanced' } + regex.err_group_qm_notation { return 'err_group_qm_notation' } + else { return 'err_unknown' } + } +} + +// utf8_str convert and utf8 sequence to a printable string +[inline] +fn utf8_str(ch rune) string { + mut i := 4 + mut res := '' + for i > 0 { + v := byte((ch >> ((i - 1) * 8)) & 0xFF) + if v != 0 { + res += '${v:1c}' + } + i-- + } + return res +} + +// simple_log default log function +fn simple_log(txt string) { + print(txt) +} + +/****************************************************************************** +* +* Token Structs +* +******************************************************************************/ +pub type FnValidator = fn (byte) bool + +struct Token { +mut: + ist rune + // char + ch rune // char of the token if any + ch_len byte // char len + // Quantifiers / branch + rep_min int // used also for jump next in the OR branch [no match] pc jump + rep_max int // used also for jump next in the OR branch [ match] pc jump + greedy bool // greedy quantifier flag + // Char class + cc_index int = -1 + // counters for quantifier check (repetitions) + rep int + // validator function pointer + validator FnValidator + // groups variables + group_rep int // repetition of the group + group_id int = -1 // id of the group + goto_pc int = -1 // jump to this PC if is needed + // OR flag for the token + next_is_or bool // true if the next token is an OR + // dot_char token variables + dot_check_pc int = -1 // pc of the next token to check + last_dot_flag bool // if true indicate that is the last dot_char in the regex +} + +[inline] +fn (mut tok Token) reset() { + tok.rep = 0 +} + +/****************************************************************************** +* +* Regex struct +* +******************************************************************************/ +pub const ( + f_nl = 0x00000001 // end the match when find a new line symbol + f_ms = 0x00000002 // match true only if the match is at the start of the string + f_me = 0x00000004 // match true only if the match is at the end of the string + + f_efm = 0x00000100 // exit on first token matched, used by search + f_bin = 0x00000200 // work only on bytes, ignore utf-8 + // behaviour modifier flags + f_src = 0x00020000 // search mode enabled +) + +struct StateDotObj { +mut: + i int = -1 // char index in the input buffer + pc int = -1 // program counter saved + mi int = -1 // match_index saved + group_stack_index int = -1 // continuous save on capturing groups +} + +pub type FnLog = fn (string) + +pub struct RE { +pub mut: + prog []Token + prog_len int // regex program len + // char classes storage + cc []CharClass // char class list + cc_index int // index + // groups + group_count int // number of groups in this regex struct + groups []int // groups index results + group_max_nested int = 3 // max nested group + group_max int = 8 // max allowed number of different groups + + state_list []StateObj + + group_csave_flag bool // flag to enable continuous saving + group_csave []int //= []int{} // groups continuous save list + + group_map map[string]int // groups names map + + group_stack []int + group_data []int + // flags + flag int // flag for optional parameters + // Debug/log + debug int // enable in order to have the unroll of the code 0 = NO_DEBUG, 1 = LIGHT 2 = VERBOSE + log_func FnLog = simple_log // log function, can be customized by the user + query string // query string +} + +// Reset RE object +[direct_array_access; inline] +fn (mut re RE) reset() { + re.cc_index = 0 + + mut i := 0 + for i < re.prog_len { + re.prog[i].group_rep = 0 // clear repetition of the group + re.prog[i].rep = 0 // clear repetition of the token + i++ + } + + // init groups array + if re.group_count > 0 { + re.groups = []int{len: re.group_count * 2, init: -1} + } + + // reset group_csave + if re.group_csave_flag == true { + re.group_csave.clear() // = []int{} + } + + // reset state list + re.state_list.clear() + re.group_stack.clear() +} + +// reset for search mode fail +// gcc bug, dont use [inline] or go 5 time slower +//[inline] +[direct_array_access] +fn (mut re RE) reset_src() { + mut i := 0 + for i < re.prog_len { + re.prog[i].group_rep = 0 // clear repetition of the group + re.prog[i].rep = 0 // clear repetition of the token + i++ + } +} + +/****************************************************************************** +* +* Backslashes chars +* +******************************************************************************/ +struct BslsStruct { + ch rune // meta char + validator FnValidator // validator function pointer +} + +const ( + bsls_validator_array = [ + BslsStruct{`w`, is_alnum}, + BslsStruct{`W`, is_not_alnum}, + BslsStruct{`s`, is_space}, + BslsStruct{`S`, is_not_space}, + BslsStruct{`d`, is_digit}, + BslsStruct{`D`, is_not_digit}, + BslsStruct{`a`, is_lower}, + BslsStruct{`A`, is_upper}, + ] + + // these chars are escape if preceded by a \ + bsls_escape_list = [`\\`, `|`, `.`, `:`, `*`, `+`, `-`, `{`, `}`, `[`, `]`, `(`, `)`, `?`, + `^`, `!`] +) + +enum BSLS_parse_state { + start + bsls_found + bsls_char + normal_char +} + +// parse_bsls return (index, str_len) bsls_validator_array index, len of the backslash sequence if present +fn (re RE) parse_bsls(in_txt string, in_i int) (int, int) { + mut status := BSLS_parse_state.start + mut i := in_i + + for i < in_txt.len { + // get our char + char_tmp, char_len := re.get_char(in_txt, i) + ch := byte(char_tmp) + + if status == .start && ch == `\\` { + status = .bsls_found + i += char_len + continue + } + + // check if is our bsls char, for now only one length sequence + if status == .bsls_found { + for c, x in regex.bsls_validator_array { + if x.ch == ch { + return c, i - in_i + 1 + } + } + status = .normal_char + continue + } + + // no BSLS validator, manage as normal escape char char + if status == .normal_char { + if ch in regex.bsls_escape_list { + return regex.no_match_found, i - in_i + 1 + } + return regex.err_syntax_error, i - in_i + 1 + } + + // at the present time we manage only one char after the \ + break + } + // not our bsls return KO + return regex.err_syntax_error, i +} + +/****************************************************************************** +* +* Char class +* +******************************************************************************/ +const ( + cc_null = 0 // empty cc token + cc_char = 1 // simple char: a + cc_int = 2 // char interval: a-z + cc_bsls = 3 // backslash char + cc_end = 4 // cc sequence terminator +) + +struct CharClass { +mut: + cc_type int = regex.cc_null // type of cc token + ch0 rune // first char of the interval a-b a in this case + ch1 rune // second char of the interval a-b b in this case + validator FnValidator // validator function pointer +} + +enum CharClass_parse_state { + start + in_char + in_bsls + separator + finish +} + +fn (re RE) get_char_class(pc int) string { + buf := []byte{len: (re.cc.len)} + mut buf_ptr := unsafe { &byte(&buf) } + + mut cc_i := re.prog[pc].cc_index + mut i := 0 + mut tmp := 0 + for cc_i >= 0 && cc_i < re.cc.len && re.cc[cc_i].cc_type != regex.cc_end { + if re.cc[cc_i].cc_type == regex.cc_bsls { + unsafe { + buf_ptr[i] = `\\` + i++ + buf_ptr[i] = byte(re.cc[cc_i].ch0) + i++ + } + } else if re.cc[cc_i].ch0 == re.cc[cc_i].ch1 { + tmp = 3 + for tmp >= 0 { + x := byte((re.cc[cc_i].ch0 >> (tmp * 8)) & 0xFF) + if x != 0 { + unsafe { + buf_ptr[i] = x + i++ + } + } + tmp-- + } + } else { + tmp = 3 + for tmp >= 0 { + x := byte((re.cc[cc_i].ch0 >> (tmp * 8)) & 0xFF) + if x != 0 { + unsafe { + buf_ptr[i] = x + i++ + } + } + tmp-- + } + unsafe { + buf_ptr[i] = `-` + i++ + } + tmp = 3 + for tmp >= 0 { + x := byte((re.cc[cc_i].ch1 >> (tmp * 8)) & 0xFF) + if x != 0 { + unsafe { + buf_ptr[i] = x + i++ + } + } + tmp-- + } + } + cc_i++ + } + unsafe { + buf_ptr[i] = byte(0) + } + return unsafe { tos_clone(buf_ptr) } +} + +fn (re RE) check_char_class(pc int, ch rune) bool { + mut cc_i := re.prog[pc].cc_index + for cc_i >= 0 && cc_i < re.cc.len && re.cc[cc_i].cc_type != regex.cc_end { + if re.cc[cc_i].cc_type == regex.cc_bsls { + if re.cc[cc_i].validator(byte(ch)) { + return true + } + } else if ch >= re.cc[cc_i].ch0 && ch <= re.cc[cc_i].ch1 { + return true + } + cc_i++ + } + return false +} + +// parse_char_class return (index, str_len, cc_type) of a char class [abcm-p], char class start after the [ char +fn (mut re RE) parse_char_class(in_txt string, in_i int) (int, int, rune) { + mut status := CharClass_parse_state.start + mut i := in_i + + mut tmp_index := re.cc_index + res_index := re.cc_index + + mut cc_type := u32(regex.ist_char_class_pos) + + for i < in_txt.len { + // check if we are out of memory for char classes + if tmp_index >= re.cc.len { + return regex.err_cc_alloc_overflow, 0, u32(0) + } + + // get our char + char_tmp, char_len := re.get_char(in_txt, i) + ch := byte(char_tmp) + + // println("CC #${i:3d} ch: ${ch:c}") + + // negation + if status == .start && ch == `^` { + cc_type = u32(regex.ist_char_class_neg) + i += char_len + continue + } + + // minus symbol + if status == .start && ch == `-` { + re.cc[tmp_index].cc_type = regex.cc_char + re.cc[tmp_index].ch0 = char_tmp + re.cc[tmp_index].ch1 = char_tmp + i += char_len + tmp_index++ + continue + } + + // bsls + if (status == .start || status == .in_char) && ch == `\\` { + // println("CC bsls.") + status = .in_bsls + i += char_len + continue + } + + if status == .in_bsls { + // println("CC bsls validation.") + for c, x in regex.bsls_validator_array { + if x.ch == ch { + // println("CC bsls found [${ch:c}]") + re.cc[tmp_index].cc_type = regex.cc_bsls + re.cc[tmp_index].ch0 = regex.bsls_validator_array[c].ch + re.cc[tmp_index].ch1 = regex.bsls_validator_array[c].ch + re.cc[tmp_index].validator = regex.bsls_validator_array[c].validator + i += char_len + tmp_index++ + status = .in_char + break + } + } + if status == .in_bsls { + // manage as a simple char + // println("CC bsls not found [${ch:c}]") + re.cc[tmp_index].cc_type = regex.cc_char + re.cc[tmp_index].ch0 = char_tmp + re.cc[tmp_index].ch1 = char_tmp + i += char_len + tmp_index++ + status = .in_char + continue + } else { + continue + } + } + + // simple char + if (status == .start || status == .in_char) && ch != `-` && ch != `]` { + status = .in_char + + re.cc[tmp_index].cc_type = regex.cc_char + re.cc[tmp_index].ch0 = char_tmp + re.cc[tmp_index].ch1 = char_tmp + + i += char_len + tmp_index++ + continue + } + + // check range separator + if status == .in_char && ch == `-` { + status = .separator + i += char_len + continue + } + + // check range end + if status == .separator && ch != `]` && ch != `-` { + status = .in_char + re.cc[tmp_index - 1].cc_type = regex.cc_int + re.cc[tmp_index - 1].ch1 = char_tmp + i += char_len + continue + } + + // char class end + if status == .in_char && ch == `]` { + re.cc[tmp_index].cc_type = regex.cc_end + re.cc[tmp_index].ch0 = 0 + re.cc[tmp_index].ch1 = 0 + re.cc_index = tmp_index + 1 + + return res_index, i - in_i + 2, cc_type + } + + i++ + } + return regex.err_syntax_error, 0, u32(0) +} + +/****************************************************************************** +* +* Re Compiler +* +******************************************************************************/ +// +// Quantifier +// +enum Quant_parse_state { + start + min_parse + comma_checked + max_parse + greedy + gredy_parse + finish +} + +// parse_quantifier return (min, max, str_len, greedy_flag) of a {min,max}? quantifier starting after the { char +fn (re RE) parse_quantifier(in_txt string, in_i int) (int, int, int, bool) { + mut status := Quant_parse_state.start + mut i := in_i + + mut q_min := 0 // default min in a {} quantifier is 1 + mut q_max := 0 // deafult max in a {} quantifier is max_quantifier + + mut ch := byte(0) + + for i < in_txt.len { + unsafe { + ch = in_txt.str[i] + } + // println("${ch:c} status: $status") + + // exit on no compatible char with {} quantifier + if utf8util_char_len(ch) != 1 { + return regex.err_syntax_error, i, 0, false + } + + // min parsing skip if comma present + if status == .start && ch == `,` { + q_min = 0 // default min in a {} quantifier is 0 + status = .comma_checked + i++ + continue + } + + if status == .start && is_digit(ch) { + status = .min_parse + q_min *= 10 + q_min += int(ch - `0`) + i++ + continue + } + + if status == .min_parse && is_digit(ch) { + q_min *= 10 + q_min += int(ch - `0`) + i++ + continue + } + + // we have parsed the min, now check the max + if status == .min_parse && ch == `,` { + status = .comma_checked + i++ + continue + } + + // single value {4} + if status == .min_parse && ch == `}` { + q_max = q_min + status = .greedy + continue + } + + // end without max + if status == .comma_checked && ch == `}` { + q_max = regex.max_quantifier + status = .greedy + continue + } + + // start max parsing + if status == .comma_checked && is_digit(ch) { + status = .max_parse + q_max *= 10 + q_max += int(ch - `0`) + i++ + continue + } + + // parse the max + if status == .max_parse && is_digit(ch) { + q_max *= 10 + q_max += int(ch - `0`) + i++ + continue + } + + // finished the quantifier + if status == .max_parse && ch == `}` { + status = .greedy + continue + } + + // check if greedy flag char ? is present + if status == .greedy { + if i + 1 < in_txt.len { + i++ + status = .gredy_parse + continue + } + return q_min, q_max, i - in_i + 2, false + } + + // check the greedy flag + if status == .gredy_parse { + if ch == `?` { + return q_min, q_max, i - in_i + 2, true + } else { + i-- + return q_min, q_max, i - in_i + 2, false + } + } + + // not a {} quantifier, exit + return regex.err_syntax_error, i, 0, false + } + + // not a conform {} quantifier + return regex.err_syntax_error, i, 0, false +} + +// +// Groups +// +enum Group_parse_state { + start + q_mark // (? + q_mark1 // (?:|P checking + p_status // (?P + p_start // (?P< + p_end // (?P<...> + p_in_name // (?P<... + finish +} + +// parse_groups parse a group for ? (question mark) syntax, if found, return (error, capture_flag, name_of_the_group, next_index) +fn (re RE) parse_groups(in_txt string, in_i int) (int, bool, string, int) { + mut status := Group_parse_state.start + mut i := in_i + mut name := '' + + for i < in_txt.len && status != .finish { + // get our char + char_tmp, char_len := re.get_char(in_txt, i) + ch := byte(char_tmp) + + // start + if status == .start && ch == `(` { + status = .q_mark + i += char_len + continue + } + + // check for question marks + if status == .q_mark && ch == `?` { + status = .q_mark1 + i += char_len + continue + } + + // non capturing group + if status == .q_mark1 && ch == `:` { + i += char_len + return 0, false, name, i + } + + // enter in P section + if status == .q_mark1 && ch == `P` { + status = .p_status + i += char_len + continue + } + + // not a valid q mark found + if status == .q_mark1 { + // println("NO VALID Q MARK") + return -2, true, name, i + } + + if status == .p_status && ch == `<` { + status = .p_start + i += char_len + continue + } + + if status == .p_start && ch != `>` { + status = .p_in_name + name += '${ch:1c}' // TODO: manage utf8 chars + i += char_len + continue + } + + // colect name + if status == .p_in_name && ch != `>` && is_alnum(ch) { + name += '${ch:1c}' // TODO: manage utf8 chars + i += char_len + continue + } + + // end name + if status == .p_in_name && ch == `>` { + i += char_len + return 0, true, name, i + } + + // error on name group + if status == .p_in_name { + return -2, true, name, i + } + + // normal group, nothig to do, exit + return 0, true, name, i + } + // UNREACHABLE + // println("ERROR!! NOT MEANT TO BE HERE!!1") + return -2, true, name, i +} + +// +// main compiler +// +// compile return (return code, index) where index is the index of the error in the query string if return code is an error code +fn (mut re RE) impl_compile(in_txt string) (int, int) { + mut i := 0 // input string index + mut pc := 0 // program counter + + // group management variables + mut group_count := -1 + mut group_stack := []int{len: re.group_max_nested, init: 0} + mut group_stack_txt_index := []int{len: re.group_max_nested, init: -1} + mut group_stack_index := -1 + + re.query = in_txt // save the query string + + i = 0 + for i < in_txt.len { + mut char_tmp := u32(0) + mut char_len := 0 + // println("i: ${i:3d} ch: ${in_txt.str[i]:c}") + + char_tmp, char_len = re.get_char(in_txt, i) + + // + // check special cases: $ ^ + // + if char_len == 1 && i == 0 && byte(char_tmp) == `^` { + re.flag = regex.f_ms + i = i + char_len + continue + } + if char_len == 1 && i == (in_txt.len - 1) && byte(char_tmp) == `$` { + re.flag = regex.f_me + i = i + char_len + continue + } + + // ist_group_start + if char_len == 1 && pc >= 0 && byte(char_tmp) == `(` { + // check max groups allowed + if group_count > re.group_max { + return regex.err_groups_overflow, i + 1 + } + group_stack_index++ + + // check max nested groups allowed + if group_stack_index > re.group_max_nested { + return regex.err_groups_max_nested, i + 1 + } + + tmp_res, cgroup_flag, cgroup_name, next_i := re.parse_groups(in_txt, i) + + // manage question mark format error + if tmp_res < -1 { + return regex.err_group_qm_notation, next_i + } + + // println("Parse group: [$tmp_res, $cgroup_flag, ($i,$next_i), '${in_txt[i..next_i]}' ]") + i = next_i + + if cgroup_flag == true { + group_count++ + } + + // calculate the group id + // if it is a named group, recycle the group id + // NOTE: **** the group index is +1 because map return 0 when not found!! **** + mut group_id := group_count + if cgroup_name.len > 0 { + // println("GROUP NAME: ${cgroup_name}") + if cgroup_name in re.group_map { + group_id = re.group_map[cgroup_name] - 1 + group_count-- + } else { + re.group_map[cgroup_name] = group_id + 1 + } + } + + group_stack_txt_index[group_stack_index] = i + group_stack[group_stack_index] = pc + + re.prog[pc].ist = u32(0) | regex.ist_group_start + re.prog[pc].rep_min = 1 + re.prog[pc].rep_max = 1 + + // set the group id + if cgroup_flag == false { + // println("NO CAPTURE GROUP") + re.prog[pc].group_id = -1 + } else { + re.prog[pc].group_id = group_id + } + + pc = pc + 1 + continue + } + + // ist_group_end + if char_len == 1 && pc > 0 && byte(char_tmp) == `)` { + if group_stack_index < 0 { + return regex.err_group_not_balanced, i + 1 + } + + goto_pc := group_stack[group_stack_index] + group_stack_index-- + + re.prog[pc].ist = u32(0) | regex.ist_group_end + re.prog[pc].rep_min = 1 + re.prog[pc].rep_max = 1 + + re.prog[pc].goto_pc = goto_pc // PC where to jump if a group need + re.prog[pc].group_id = re.prog[goto_pc].group_id // id of this group, used for storing data + + re.prog[goto_pc].goto_pc = pc // start goto point to the end group pc + // re.prog[goto_pc].group_id = group_count // id of this group, used for storing data + + pc = pc + 1 + i = i + char_len + continue + } + + // ist_dot_char match any char except the following token + if char_len == 1 && pc >= 0 && byte(char_tmp) == `.` { + re.prog[pc].ist = u32(0) | regex.ist_dot_char + re.prog[pc].rep_min = 1 + re.prog[pc].rep_max = 1 + pc = pc + 1 + i = i + char_len + continue + } + + // OR branch + if char_len == 1 && pc > 0 && byte(char_tmp) == `|` { + // two consecutive ist_dot_char are an error + if pc > 0 && re.prog[pc - 1].ist == regex.ist_or_branch { + return regex.err_syntax_error, i + } + re.prog[pc].ist = u32(0) | regex.ist_or_branch + pc = pc + 1 + i = i + char_len + continue + } + + // Quantifiers + if char_len == 1 && pc > 0 { + mut quant_flag := true + match byte(char_tmp) { + `?` { + // println("q: ${char_tmp:c}") + re.prog[pc - 1].rep_min = 0 + re.prog[pc - 1].rep_max = 1 + } + `+` { + // println("q: ${char_tmp:c}") + re.prog[pc - 1].rep_min = 1 + re.prog[pc - 1].rep_max = regex.max_quantifier + } + `*` { + // println("q: ${char_tmp:c}") + re.prog[pc - 1].rep_min = 0 + re.prog[pc - 1].rep_max = regex.max_quantifier + } + `{` { + min, max, tmp, greedy := re.parse_quantifier(in_txt, i + 1) + // it is a quantifier + if min >= 0 { + // println("{$min,$max}\n str:[${in_txt[i..i+tmp]}] greedy:$greedy") + i = i + tmp + re.prog[pc - 1].rep_min = min + re.prog[pc - 1].rep_max = max + re.prog[pc - 1].greedy = greedy + continue + } else { + return min, i + } + // TODO: decide if the open bracket can be conform without the close bracket + /* + // no conform, parse as normal char + else { + quant_flag = false + } + */ + } + else { + quant_flag = false + } + } + + if quant_flag { + i = i + char_len + continue + } + } + + // IST_CHAR_CLASS_* + if char_len == 1 && pc >= 0 { + if byte(char_tmp) == `[` { + cc_index, tmp, cc_type := re.parse_char_class(in_txt, i + 1) + if cc_index >= 0 { + // println("index: $cc_index str:${in_txt[i..i+tmp]}") + i = i + tmp + re.prog[pc].ist = u32(0) | cc_type + re.prog[pc].cc_index = cc_index + re.prog[pc].rep_min = 1 + re.prog[pc].rep_max = 1 + pc = pc + 1 + continue + } + // cc_class vector memory full + else if cc_index < 0 { + return cc_index, i + } + } + } + + // ist_bsls_char + if char_len == 1 && pc >= 0 { + if byte(char_tmp) == `\\` { + bsls_index, tmp := re.parse_bsls(in_txt, i) + // println("index: $bsls_index str:${in_txt[i..i+tmp]}") + if bsls_index >= 0 { + i = i + tmp + re.prog[pc].ist = u32(0) | regex.ist_bsls_char + re.prog[pc].rep_min = 1 + re.prog[pc].rep_max = 1 + re.prog[pc].validator = regex.bsls_validator_array[bsls_index].validator + re.prog[pc].ch = regex.bsls_validator_array[bsls_index].ch + pc = pc + 1 + continue + } + // this is an escape char, skip the bsls and continue as a normal char + else if bsls_index == regex.no_match_found { + i += char_len + char_tmp, char_len = re.get_char(in_txt, i) + // continue as simple char + } + // if not an escape or a bsls char then it is an error (at least for now!) + else { + return bsls_index, i + tmp + } + } + } + + // ist_simple_char + re.prog[pc].ist = regex.ist_simple_char + re.prog[pc].ch = char_tmp + re.prog[pc].ch_len = byte(char_len) + re.prog[pc].rep_min = 1 + re.prog[pc].rep_max = 1 + // println("char: ${char_tmp:c}") + pc = pc + 1 + + i += char_len + } + + // add end of the program + re.prog[pc].ist = regex.ist_prog_end + re.prog_len = pc + + // check for unbalanced groups + if group_stack_index != -1 { + return regex.err_group_not_balanced, group_stack_txt_index[group_stack_index] + 1 + } + + // check for OR at the end of the program + if pc > 0 && re.prog[pc - 1].ist == regex.ist_or_branch { + return regex.err_syntax_error, in_txt.len + } + + // store the number of groups in the query + re.group_count = group_count + 1 + + //****************************************** + // Post processing + //****************************************** + + // + // manage ist_dot_char + // + + // find the checks for dot chars, if any... + mut pc1 := 0 + mut dot_char_count := 0 + mut last_dot_char_pc := -1 + for pc1 < pc { + if re.prog[pc1].ist == regex.ist_dot_char { + // println("Dot_char pc: $pc1") + last_dot_char_pc = pc1 + dot_char_count++ + mut pc2 := pc1 + 1 + for pc2 < pc { + if re.prog[pc2].ist == regex.ist_dot_char { + return regex.err_syntax_error, 0 + } + if re.prog[pc2].ist !in [rune(regex.ist_prog_end), regex.ist_group_end, + regex.ist_group_start, + ] { + // println("Next dot char check is PC: ${pc2}") + re.prog[pc1].dot_check_pc = pc2 + break + } + pc2++ + } + } + pc1++ + } + + // println("last_dot_char_pc: $last_dot_char_pc") + if last_dot_char_pc >= 0 { + pc1 = last_dot_char_pc + 1 + mut is_last_dot := true + for pc1 < pc { + if re.prog[pc1].ist !in [rune(regex.ist_prog_end), regex.ist_group_end] { + is_last_dot = false + break + } + pc1++ + } + if is_last_dot { + re.prog[last_dot_char_pc].last_dot_flag = true + } + } + + //****************************************** + + // OR branch + // a|b|cd + // d exit point + // a,b,c branches + // set the jump in the right places + pc1 = 0 + for pc1 < pc - 2 { + // println("Here $pc1 ${pc-2}") + // two consecutive OR are a syntax error + if re.prog[pc1 + 1].ist == regex.ist_or_branch + && re.prog[pc1 + 2].ist == regex.ist_or_branch { + return regex.err_syntax_error, i + } + + // manange a|b chains like a|(b)|c|d... + // standard solution + if re.prog[pc1].ist != regex.ist_or_branch && re.prog[pc1 + 1].ist == regex.ist_or_branch + && re.prog[pc1 + 2].ist != regex.ist_or_branch { + re.prog[pc1].next_is_or = true // set that the next token is an OR + re.prog[pc1 + 1].rep_min = pc1 + 2 // failed match jump + + // match jump, if an OR chain the next token will be an OR token + mut pc2 := pc1 + 2 + for pc2 < pc - 1 { + ist := re.prog[pc2].ist + if ist == regex.ist_group_start { + re.prog[pc1 + 1].rep_max = re.prog[pc2].goto_pc + 1 + break + } + if ist != regex.ist_or_branch { + re.prog[pc1 + 1].rep_max = pc2 + 1 + break + } + + pc2++ + } + // special case query of few chars, teh true can't go on the first instruction + if re.prog[pc1 + 1].rep_max == pc1 { + re.prog[pc1 + 1].rep_max = 3 + } + // println("Compile OR postproc. [$pc1,OR ${pc1+1},$pc2]") + pc1 = pc2 + continue + } + + pc1++ + } + + //****************************************** + // DEBUG PRINT REGEX GENERATED CODE + //****************************************** + if re.debug > 0 { + gc := re.get_code() + re.log_func(gc) + } + //****************************************** + + return regex.compile_ok, 0 +} + +// get_code return the compiled code as regex string, note: may be different from the source! +pub fn (re RE) get_code() string { + mut pc1 := 0 + mut res := strings.new_builder(re.cc.len * 2 * re.prog.len) + res.write_string('========================================\nv RegEx compiler v $regex.v_regex_version output:\n') + + mut stop_flag := false + + for pc1 <= re.prog.len { + tk := re.prog[pc1] + res.write_string('PC:${pc1:3d}') + + res.write_string(' ist: ') + res.write_string('${tk.ist:8x}'.replace(' ', '0')) + res.write_string(' ') + ist := tk.ist + if ist == regex.ist_bsls_char { + res.write_string('[\\${tk.ch:1c}] BSLS') + } else if ist == regex.ist_prog_end { + res.write_string('PROG_END') + stop_flag = true + } else if ist == regex.ist_or_branch { + res.write_string('OR ') + } else if ist == regex.ist_char_class_pos { + res.write_string('[${re.get_char_class(pc1)}] CHAR_CLASS_POS') + } else if ist == regex.ist_char_class_neg { + res.write_string('[^${re.get_char_class(pc1)}] CHAR_CLASS_NEG') + } else if ist == regex.ist_dot_char { + res.write_string('. DOT_CHAR nx chk: $tk.dot_check_pc') + if tk.last_dot_flag == true { + res.write_string(' last!') + } + } else if ist == regex.ist_group_start { + res.write_string('( GROUP_START #:$tk.group_id') + if tk.group_id == -1 { + res.write_string(' ?:') + } else { + for x in re.group_map.keys() { + if re.group_map[x] == (tk.group_id + 1) { + res.write_string(' ?P<$x>') + break + } + } + } + } else if ist == regex.ist_group_end { + res.write_string(') GROUP_END #:$tk.group_id') + } else if ist == regex.ist_simple_char { + res.write_string('[${tk.ch:1c}] query_ch') + } + + if tk.rep_max == regex.max_quantifier { + res.write_string(' {${tk.rep_min:3d},MAX}') + } else { + if ist == regex.ist_or_branch { + res.write_string(' if false go: ${tk.rep_min:3d} if true go: ${tk.rep_max:3d}') + } else { + res.write_string(' {${tk.rep_min:3d},${tk.rep_max:3d}}') + } + if tk.greedy == true { + res.write_string('?') + } + } + + res.write_string('\n') + if stop_flag { + break + } + pc1++ + } + + res.write_string('========================================\n') + return res.str() +} + +// get_query return a string with a reconstruction of the query starting from the regex program code +pub fn (re RE) get_query() string { + mut res := strings.new_builder(re.query.len * 2) + + if (re.flag & regex.f_ms) != 0 { + res.write_string('^') + } + + mut i := 0 + for i < re.prog.len && re.prog[i].ist != regex.ist_prog_end && re.prog[i].ist != 0 { + tk := unsafe { &re.prog[i] } + ch := tk.ist + + // GROUP start + if ch == regex.ist_group_start { + if re.debug == 0 { + res.write_string('(') + } else { + if tk.group_id == -1 { + res.write_string('(?:') // non capturing group + } else { + res.write_string('#${tk.group_id}(') + } + } + + for x in re.group_map.keys() { + if re.group_map[x] == (tk.group_id + 1) { + res.write_string('?P<$x>') + break + } + } + + i++ + continue + } + + // GROUP end + if ch == regex.ist_group_end { + res.write_string(')') + } + + // OR branch + if ch == regex.ist_or_branch { + res.write_string('|') + if re.debug > 0 { + res.write_string('{$tk.rep_min,$tk.rep_max}') + } + i++ + continue + } + + // char class + if ch == regex.ist_char_class_neg || ch == regex.ist_char_class_pos { + res.write_string('[') + if ch == regex.ist_char_class_neg { + res.write_string('^') + } + res.write_string('${re.get_char_class(i)}') + res.write_string(']') + } + + // bsls char + if ch == regex.ist_bsls_char { + res.write_string('\\${tk.ch:1c}') + } + + // ist_dot_char + if ch == regex.ist_dot_char { + res.write_string('.') + } + + // char alone + if ch == regex.ist_simple_char { + if byte(ch) in regex.bsls_escape_list { + res.write_string('\\') + } + res.write_string('${tk.ch:c}') + } + + // quantifier + if !(tk.rep_min == 1 && tk.rep_max == 1) { + if tk.rep_min == 0 && tk.rep_max == 1 { + res.write_string('?') + } else if tk.rep_min == 1 && tk.rep_max == regex.max_quantifier { + res.write_string('+') + } else if tk.rep_min == 0 && tk.rep_max == regex.max_quantifier { + res.write_string('*') + } else { + if tk.rep_max == regex.max_quantifier { + res.write_string('{$tk.rep_min,MAX}') + } else { + res.write_string('{$tk.rep_min,$tk.rep_max}') + } + if tk.greedy == true { + res.write_string('?') + } + } + } + i++ + } + if (re.flag & regex.f_me) != 0 { + res.write_string('$') + } + + return res.str() +} + +/****************************************************************************** +* +* Groups saving utilities +* +******************************************************************************/ +[direct_array_access] +fn (mut re RE) group_continuous_save(g_index int) { + if re.group_csave_flag == true { + // continuous save, save until we have space + + // init the first element as counter + if re.group_csave.len == 0 { + re.group_csave << 0 + } + + gi := g_index >> 1 + start := re.groups[g_index] + end := re.groups[g_index + 1] + + // check if we are simply increasing the size ot the found group + if re.group_csave.len >= 4 && gi == re.group_csave[re.group_csave.len - 3] + && start == re.group_csave[re.group_csave.len - 2] { + re.group_csave[re.group_csave.len - 1] = end + return + } + + // otherwise append a new group to the list + + // increment counter + re.group_csave[0]++ + // save the record + re.group_csave << (g_index >> 1) // group id + re.group_csave << re.groups[g_index] // start + re.group_csave << re.groups[g_index + 1] // end + } +} + +/****************************************************************************** +* +* Matching +* +******************************************************************************/ +enum Match_state { + start = 0 + stop + end + new_line + ist_load // load and execute instruction + ist_next // go to next instruction + ist_next_ks // go to next instruction without clenaning the state + ist_quant_p // match positive ,quantifier check + ist_quant_n // match negative, quantifier check + ist_quant_pg // match positive ,group quantifier check + ist_quant_ng // match negative ,group quantifier check +} + +fn state_str(s Match_state) string { + match s { + .start { return 'start' } + .stop { return 'stop' } + .end { return 'end' } + .new_line { return 'new line' } + .ist_load { return 'ist_load' } + .ist_next { return 'ist_next' } + .ist_next_ks { return 'ist_next_ks' } + .ist_quant_p { return 'ist_quant_p' } + .ist_quant_n { return 'ist_quant_n' } + .ist_quant_pg { return 'ist_quant_pg' } + .ist_quant_ng { return 'ist_quant_ng' } + } +} + +struct StateObj { +pub mut: + group_index int = -1 // group id used to know how many groups are open + match_flag bool // indicate if we are in a match condition + match_index int = -1 // index of the last match + first_match int = -1 // index of the first match + pc int = -1 // program counter + i int = -1 // source string index + char_len int // last char legth + last_dot_pc int = -1 // last dot chat pc +} + +[direct_array_access] +pub fn (mut re RE) match_base(in_txt &byte, in_txt_len int) (int, int) { + // result status + mut result := regex.no_match_found // function return + + mut ch := rune(0) // examinated char + mut char_len := 0 // utf8 examinated char len + mut m_state := Match_state.start // start point for the matcher FSM + mut src_end := false + mut last_fnd_pc := -1 + + mut state := StateObj{} // actual state + mut ist := rune(0) // actual instruction + mut l_ist := rune(0) // last matched instruction + + mut step_count := 0 // stats for debug + mut dbg_line := 0 // count debug line printed + + re.reset() + + if re.debug > 0 { + // print header + mut h_buf := strings.new_builder(32) + h_buf.write_string('flags: ') + h_buf.write_string('${re.flag:8x}'.replace(' ', '0')) + h_buf.write_string('\n') + sss := h_buf.str() + re.log_func(sss) + } + + for m_state != .end { + if state.pc >= 0 && state.pc < re.prog.len { + ist = re.prog[state.pc].ist + } else if state.pc >= re.prog.len { + // println("ERROR!! PC overflow!!") + return regex.err_internal_error, state.i + } + + //****************************************** + // DEBUG LOG + //****************************************** + if re.debug > 0 { + mut buf2 := strings.new_builder(re.cc.len + 128) + + // print all the instructions + + // end of the input text + if state.i >= in_txt_len { + buf2.write_string('# ${step_count:3d} END OF INPUT TEXT\n') + sss := buf2.str() + re.log_func(sss) + } else { + // print only the exe instruction + if (re.debug == 1 && m_state == .ist_load) || re.debug == 2 { + if ist == regex.ist_prog_end { + buf2.write_string('# ${step_count:3d} PROG_END\n') + } else if ist == 0 || m_state in [.start, .ist_next, .stop] { + buf2.write_string('# ${step_count:3d} s: ${state_str(m_state):12s} PC: NA\n') + } else { + ch, char_len = re.get_charb(in_txt, state.i) + + buf2.write_string('# ${step_count:3d} s: ${state_str(m_state):12s} PC: ${state.pc:3d}=>') + buf2.write_string('${ist:8x}'.replace(' ', '0')) + buf2.write_string(" i,ch,len:[${state.i:3d},'${utf8_str(ch)}',$char_len] f.m:[${state.first_match:3d},${state.match_index:3d}] ") + + if ist == regex.ist_simple_char { + buf2.write_string('query_ch: [${re.prog[state.pc].ch:1c}]') + } else { + if ist == regex.ist_bsls_char { + buf2.write_string('BSLS [\\${re.prog[state.pc].ch:1c}]') + } else if ist == regex.ist_prog_end { + buf2.write_string('PROG_END') + } else if ist == regex.ist_or_branch { + buf2.write_string('OR') + } else if ist == regex.ist_char_class_pos { + buf2.write_string('CHAR_CLASS_POS[${re.get_char_class(state.pc)}]') + } else if ist == regex.ist_char_class_neg { + buf2.write_string('CHAR_CLASS_NEG[${re.get_char_class(state.pc)}]') + } else if ist == regex.ist_dot_char { + buf2.write_string('DOT_CHAR') + } else if ist == regex.ist_group_start { + tmp_gi := re.prog[state.pc].group_id + tmp_gr := re.prog[re.prog[state.pc].goto_pc].group_rep + buf2.write_string('GROUP_START #:$tmp_gi rep:$tmp_gr ') + } else if ist == regex.ist_group_end { + buf2.write_string('GROUP_END #:${re.prog[state.pc].group_id} deep:$state.group_index') + } + } + if re.prog[state.pc].rep_max == regex.max_quantifier { + buf2.write_string('{${re.prog[state.pc].rep_min},MAX}:${re.prog[state.pc].rep}') + } else { + buf2.write_string('{${re.prog[state.pc].rep_min},${re.prog[state.pc].rep_max}}:${re.prog[state.pc].rep}') + } + if re.prog[state.pc].greedy == true { + buf2.write_string('?') + } + buf2.write_string(' (#$state.group_index)') + + if ist == regex.ist_dot_char { + buf2.write_string(' last!') + } + + buf2.write_string('\n') + } + sss2 := buf2.str() + re.log_func(sss2) + } + } + step_count++ + dbg_line++ + } + //****************************************** + + if ist == regex.ist_prog_end { + // println("HERE we end!") + break + } + + // we're out of text, manage it + if state.i >= in_txt_len || m_state == .new_line { + // println("Finished text!!") + src_end = true + + // manage groups + if state.group_index >= 0 && state.match_index >= 0 { + // println("End text with open groups!") + // close the groups + for state.group_index >= 0 { + tmp_pc := re.group_data[state.group_index] + re.prog[tmp_pc].group_rep++ + // println("Closing group $state.group_index {${re.prog[tmp_pc].rep_min},${re.prog[tmp_pc].rep_max}}:${re.prog[tmp_pc].group_rep}") + + if re.prog[tmp_pc].group_rep >= re.prog[tmp_pc].rep_min + && re.prog[tmp_pc].group_id >= 0 { + start_i := re.group_stack[state.group_index] + re.group_stack[state.group_index] = -1 + + // save group results + g_index := re.prog[tmp_pc].group_id * 2 + if start_i >= 0 { + re.groups[g_index] = start_i + } else { + re.groups[g_index] = 0 + } + // we have fished the text, we must manage out pf bound indexes + if state.i >= in_txt_len { + state.i = in_txt_len - 1 + } + re.groups[g_index + 1] = state.i + + if re.groups[g_index + 1] >= in_txt_len { + // println("clamp group on stop!") + re.groups[g_index + 1] = in_txt_len - 1 + } + + // continuous save, save until we have space + re.group_continuous_save(g_index) + } + state.group_index-- + } + } + + // the text is finished and the groups closed and we are the last group, ok exit + if ist == regex.ist_group_end && re.prog[state.pc + 1].ist == regex.ist_prog_end { + // println("Last group end") + return state.first_match, state.i + } + + if state.pc == -1 { + state.pc = last_fnd_pc + } + + // println("Finished text!!") + // println("Instruction: ${ist:08x} pc: $state.pc") + // println("min_rep: ${re.prog[state.pc].rep_min} max_rep: ${re.prog[state.pc].rep_max} rep: ${re.prog[state.pc].rep}") + + // program end + if ist == regex.ist_prog_end { + // println("Program end on end of text!") + return state.first_match, state.i + } + + // we are in a last dot_ char case + if l_ist == regex.ist_dot_char { + // println("***** We have a last dot_char") + // println("PC: ${state.pc} last_dot_flag:${re.prog[state.pc].last_dot_flag}") + // println("rep: ${re.prog[state.pc].group_rep} min: ${re.prog[state.pc].rep_min} max: ${re.prog[state.pc].rep_max}") + // println("first match: ${state.first_match}") + if re.prog[state.pc].last_dot_flag == true + && re.prog[state.pc].rep >= re.prog[state.pc].rep_min + && re.prog[state.pc].rep <= re.prog[state.pc].rep_max { + return state.first_match, state.i + } + // println("Not fitted!!") + } + + // m_state = .end + // break + return regex.no_match_found, 0 + } + + // starting and init + if m_state == .start { + state.pc = -1 + state.i = 0 + m_state = .ist_next + continue + } + // ist_next, next instruction reseting its state + else if m_state == .ist_next { + state.pc = state.pc + 1 + re.prog[state.pc].reset() + // check if we are in the program bounds + if state.pc < 0 || state.pc > re.prog.len { + // println("ERROR!! PC overflow!!") + return regex.err_internal_error, state.i + } + m_state = .ist_load + continue + } + // ist_next_ks, next instruction keeping its state + else if m_state == .ist_next_ks { + state.pc = state.pc + 1 + // check if we are in the program bounds + if state.pc < 0 || state.pc > re.prog.len { + // println("ERROR!! PC overflow!!") + return regex.err_internal_error, state.i + } + m_state = .ist_load + continue + } + + // load the char + ch, char_len = re.get_charb(in_txt, state.i) + + // check new line if flag f_nl enabled + if (re.flag & regex.f_nl) != 0 && char_len == 1 && byte(ch) in regex.new_line_list { + m_state = .new_line + continue + } + // check if stop + else if m_state == .stop { + // we are in search mode, don't exit until the end + if ((re.flag & regex.f_src) != 0) && (ist != regex.ist_prog_end) { + last_fnd_pc = state.pc + state.pc = -1 + state.i += char_len + + m_state = .ist_next + re.reset_src() + state.match_index = -1 + state.first_match = -1 + + // reset state list + re.reset() + + continue + } + + if ist == regex.ist_prog_end { + return state.first_match, state.i + } + + // manage here dot char + + if re.state_list.len > 0 { + // println("Here we are, with stop: state buffer: [${re.state_list.len}]") + state = re.state_list.pop() + + state.match_flag = true + l_ist = u32(regex.ist_dot_char) + + if state.first_match < 0 { + state.first_match = state.i + } + state.match_index = state.i + re.prog[state.pc].rep++ // increase repetitions + + state.i += char_len + m_state = .ist_quant_p + continue + } + + // exit on no match + return result, 0 + } + // ist_load + else if m_state == .ist_load { + // program end + if ist == regex.ist_prog_end { + // if we are in match exit well + + if state.group_index >= 0 && state.match_index >= 0 { + state.group_index = -1 + } + + m_state = .stop + continue + } + // check GROUP start, no quantifier is checkd for this token!! + else if ist == regex.ist_group_start { + state.group_index++ + re.group_data[state.group_index] = re.prog[state.pc].goto_pc // save where is ist_group_end, we will use it for escape + re.group_stack[state.group_index] = state.i // index where we start to manage + // println("group_index $state.group_index rep ${re.prog[re.prog[state.pc].goto_pc].group_rep}") + + m_state = .ist_next + continue + } + // check GROUP end + else if ist == regex.ist_group_end { + // we are in matching streak + // println("Group END!! last ist: ${l_ist:08x}") + if state.match_index >= 0 { + // restore txt index stack and save the group data + + // println("g.id: ${re.prog[state.pc].group_id} group_index: ${state.group_index}") + if state.group_index >= 0 && re.prog[state.pc].group_id >= 0 { + start_i := re.group_stack[state.group_index] + + // save group results + g_index := re.prog[state.pc].group_id * 2 + + if start_i >= 0 { + re.groups[g_index] = start_i + } else { + re.groups[g_index] = 0 + } + + re.groups[g_index + 1] = state.i + + if g_index > 0 && re.groups[g_index] <= re.groups[g_index - 1] { + re.groups[g_index] = re.groups[g_index - 1] + } + + if re.groups[g_index + 1] >= in_txt_len { + // println("clamp group!") + re.groups[g_index + 1] = in_txt_len - 1 + } + + // println("GROUP ${re.prog[state.pc].group_id} END [${re.groups[g_index]}, ${re.groups[g_index+1]}] i: $state.i in_txt_len: $in_txt_len") + + // continuous save, save until we have space + re.group_continuous_save(g_index) + } + + re.prog[state.pc].group_rep++ // increase repetitions + // println("GROUP $group_index END ${re.prog[state.pc].group_rep}") + m_state = .ist_quant_pg + continue + } + + m_state = .ist_quant_ng + continue + } + // check OR + else if ist == regex.ist_or_branch { + if state.match_index >= 0 { + state.pc = re.prog[state.pc].rep_max + // println("ist_or_branch True pc: $state.pc") + } else { + state.pc = re.prog[state.pc].rep_min + // println("ist_or_branch False pc: $state.pc") + } + re.prog[state.pc].reset() + m_state = .ist_load + continue + } + // check ist_dot_char + else if ist == regex.ist_dot_char { + // println("ist_dot_char rep: ${re.prog[state.pc].rep}") + + // check next token to be false + mut next_check_flag := false + + // if we are done with max go on dot char are dedicated case!! + if re.prog[state.pc].rep >= re.prog[state.pc].rep_max { + re.state_list.pop() + m_state = .ist_next + continue + } + + if re.prog[state.pc].dot_check_pc >= 0 + && re.prog[state.pc].rep >= re.prog[state.pc].rep_min { + // load the char + // ch_t, _ := re.get_charb(in_txt, state.i+char_len) + ch_t := ch + chk_pc := re.prog[state.pc].dot_check_pc + + // simple char + if re.prog[chk_pc].ist == regex.ist_simple_char { + if re.prog[chk_pc].ch == ch_t { + next_check_flag = true + } + // println("Check [ist_simple_char] [${re.prog[chk_pc].ch}]==[${ch_t:c}] => $next_check_flag") + } + // char char_class + else if re.prog[chk_pc].ist == regex.ist_char_class_pos + || re.prog[chk_pc].ist == regex.ist_char_class_neg { + mut cc_neg := false + if re.prog[chk_pc].ist == regex.ist_char_class_neg { + cc_neg = true + } + mut cc_res := re.check_char_class(chk_pc, ch_t) + + if cc_neg { + cc_res = !cc_res + } + next_check_flag = cc_res + // println("Check [ist_char_class] => $next_check_flag") + } + // check bsls + else if re.prog[chk_pc].ist == regex.ist_bsls_char { + next_check_flag = re.prog[chk_pc].validator(byte(ch_t)) + // println("Check [ist_bsls_char] => $next_check_flag") + } + } + + // check if we must continue or pass to the next IST + if next_check_flag == true && re.prog[state.pc + 1].ist != regex.ist_prog_end { + // println("save the state!!") + mut dot_state := StateObj{ + group_index: state.group_index + match_flag: state.match_flag + match_index: state.match_index + first_match: state.first_match + pc: state.pc + i: state.i + char_len + char_len: char_len + last_dot_pc: state.pc + } + // if we are mananging a .* stay on the same char on return + if re.prog[state.pc].rep_min == 0 { + dot_state.i -= char_len + } + + re.state_list << dot_state + + m_state = .ist_quant_n + // println("dot_char stack len: ${re.state_list.len}") + continue + } + + state.match_flag = true + l_ist = u32(regex.ist_dot_char) + + if state.first_match < 0 { + state.first_match = state.i + } + state.match_index = state.i + re.prog[state.pc].rep++ // increase repetitions + + state.i += char_len + m_state = .ist_quant_p + continue + } + // char class IST + else if ist == regex.ist_char_class_pos || ist == regex.ist_char_class_neg { + state.match_flag = false + mut cc_neg := false + + if ist == regex.ist_char_class_neg { + cc_neg = true + } + mut cc_res := re.check_char_class(state.pc, ch) + + if cc_neg { + cc_res = !cc_res + } + + if cc_res { + state.match_flag = true + l_ist = u32(regex.ist_char_class_pos) + + if state.first_match < 0 { + state.first_match = state.i + } + + state.match_index = state.i + + re.prog[state.pc].rep++ // increase repetitions + state.i += char_len // next char + m_state = .ist_quant_p + continue + } + m_state = .ist_quant_n + continue + } + // check bsls + else if ist == regex.ist_bsls_char { + state.match_flag = false + tmp_res := re.prog[state.pc].validator(byte(ch)) + // println("BSLS in_ch: ${ch:c} res: $tmp_res") + if tmp_res { + state.match_flag = true + l_ist = u32(regex.ist_bsls_char) + + if state.first_match < 0 { + state.first_match = state.i + } + + state.match_index = state.i + + re.prog[state.pc].rep++ // increase repetitions + state.i += char_len // next char + m_state = .ist_quant_p + continue + } + m_state = .ist_quant_n + continue + } + // simple char IST + else if ist == regex.ist_simple_char { + // println("ist_simple_char") + state.match_flag = false + + if re.prog[state.pc].ch == ch { + state.match_flag = true + l_ist = regex.ist_simple_char + + if state.first_match < 0 { + state.first_match = state.i + } + // println("state.match_index: ${state.match_index}") + state.match_index = state.i + + re.prog[state.pc].rep++ // increase repetitions + state.i += char_len // next char + m_state = .ist_quant_p + continue + } + m_state = .ist_quant_n + continue + } + // UNREACHABLE + // println("PANIC2!! state: $m_state") + return regex.err_internal_error, state.i + } + /*********************************** + * Quantifier management + ***********************************/ + // ist_quant_ng => quantifier negative test on group + else if m_state == .ist_quant_ng { + // we are finished here + if state.group_index < 0 { + // println("Early stop!") + result = regex.no_match_found + m_state = .stop + continue + } + + tmp_pc := re.group_data[state.group_index] // PC to the end of the group token + rep := re.prog[tmp_pc].group_rep // use a temp variable + re.prog[tmp_pc].group_rep = 0 // clear the repetitions + + // println(".ist_quant_ng group_pc_end: $tmp_pc rep: $rep") + + if rep >= re.prog[tmp_pc].rep_min { + // println("ist_quant_ng GROUP CLOSED OK group_index: $state.group_index") + + state.i = re.group_stack[state.group_index] + state.pc = tmp_pc + state.group_index-- + m_state = .ist_next + continue + } else if re.prog[tmp_pc].next_is_or { + // println("ist_quant_ng OR Negative branch") + + state.i = re.group_stack[state.group_index] + state.pc = re.prog[tmp_pc + 1].rep_min - 1 + state.group_index-- + m_state = .ist_next + continue + } else if rep > 0 && rep < re.prog[tmp_pc].rep_min { + // println("ist_quant_ng UNDER THE MINIMUM g.i: $state.group_index") + + // check if we are inside a group, if yes exit from the nested groups + if state.group_index > 0 { + state.group_index-- + state.pc = tmp_pc + m_state = .ist_quant_ng //.ist_next + continue + } + + if state.group_index == 0 { + state.group_index-- + state.pc = tmp_pc // TEST + m_state = .ist_next + continue + } + + result = regex.no_match_found + m_state = .stop + continue + } else if rep == 0 && rep < re.prog[tmp_pc].rep_min { + // println("ist_quant_ng c_zero UNDER THE MINIMUM g.i: $state.group_index") + + if state.group_index > 0 { + state.group_index-- + state.pc = tmp_pc + m_state = .ist_quant_ng //.ist_next + continue + } + + result = regex.no_match_found + m_state = .stop + continue + } + + // println("DO NOT STAY HERE!! {${re.prog[tmp_pc].rep_min},${re.prog[tmp_pc].rep_max}}:$rep") + // UNREACHABLE + return regex.err_internal_error, state.i + } + // ist_quant_pg => quantifier positive test on group + else if m_state == .ist_quant_pg { + // println(".ist_quant_pg") + mut tmp_pc := state.pc + if state.group_index >= 0 { + tmp_pc = re.group_data[state.group_index] + } + + rep := re.prog[tmp_pc].group_rep + + if rep < re.prog[tmp_pc].rep_min { + // println("ist_quant_pg UNDER RANGE") + state.pc = re.prog[tmp_pc].goto_pc + m_state = .ist_next + continue + } else if rep == re.prog[tmp_pc].rep_max { + // println("ist_quant_pg MAX RANGE") + re.prog[tmp_pc].group_rep = 0 // clear the repetitions + state.group_index-- + m_state = .ist_next + + continue + } else if rep >= re.prog[tmp_pc].rep_min { + // println("ist_quant_pg IN RANGE group_index:$state.group_index") + + // check greedy flag, if true exit on minimum + if re.prog[tmp_pc].greedy == true { + re.prog[tmp_pc].group_rep = 0 // clear the repetitions + state.group_index-- + m_state = .ist_next + continue + } + + state.pc = re.prog[tmp_pc].goto_pc - 1 + state.group_index-- + m_state = .ist_next + continue + } + + // UNREACHABLE + // println("PANIC3!! state: $m_state") + return regex.err_internal_error, state.i + } + // ist_quant_n => quantifier negative test on token + else if m_state == .ist_quant_n { + rep := re.prog[state.pc].rep + // println("Here!! PC $state.pc is_next_or: ${re.prog[state.pc].next_is_or}") + + // zero quantifier * or ? + if rep == 0 && re.prog[state.pc].rep_min == 0 { + // println("ist_quant_n c_zero RANGE MIN") + m_state = .ist_next // go to next ist + continue + } + // match + or * + else if rep >= re.prog[state.pc].rep_min { + // println("ist_quant_n MATCH RANGE") + m_state = .ist_next + continue + } + + // check the OR if present + if re.prog[state.pc].next_is_or { + // println("OR present on failing") + state.match_index = -1 + m_state = .ist_next + continue + } + + // we are in a group manage no match from here + if state.group_index >= 0 { + // println("ist_quant_n FAILED insied a GROUP group_index:$state.group_index") + m_state = .ist_quant_ng + continue + } + + // no other options + // println("ist_quant_n no_match_found") + result = regex.no_match_found + m_state = .stop + continue + // return no_match_found, 0 + } + // ist_quant_p => quantifier positive test on token + else if m_state == .ist_quant_p { + // exit on first match + if (re.flag & regex.f_efm) != 0 { + return state.i, state.i + 1 + } + + rep := re.prog[state.pc].rep + + // under range + if rep > 0 && rep < re.prog[state.pc].rep_min { + // println("ist_quant_p UNDER RANGE") + m_state = .ist_load // continue the loop + continue + } + // range ok, continue loop + else if rep >= re.prog[state.pc].rep_min && rep < re.prog[state.pc].rep_max { + // println("ist_quant_p IN RANGE") + + // check greedy flag, if true exit on minimum + if re.prog[state.pc].greedy == true { + m_state = .ist_next + continue + } + m_state = .ist_load + continue + } + // max reached + else if rep == re.prog[state.pc].rep_max { + // println("ist_quant_p MAX RANGE") + m_state = .ist_next + continue + } + } + // UNREACHABLE + // println("PANIC4!! state: $m_state") + return regex.err_internal_error, state.i + } + + // println("Check end of text!") + // Check the results + if state.match_index >= 0 { + if state.group_index < 0 { + if re.prog[state.pc].ist == regex.ist_prog_end { + // println("program ended!!") + + if (re.flag & regex.f_src) != 0 { + // println("find return") + return state.first_match, state.i + } else { + // println("Here!!") + return 0, state.i + } + } + + // println("No Group here, natural end [$state.first_match,$state.i] state: ${state_str(m_state)} ist: $ist pgr_end: $re.prog.len") + + if re.prog[state.pc + 1].ist == regex.ist_prog_end + || re.prog[state.pc].ist == regex.ist_prog_end { + rep := re.prog[state.pc].rep + // println("rep: $rep re.prog[state.pc].rep_min: ${re.prog[state.pc].rep_min} re.prog[state.pc].rep_max: ${re.prog[state.pc].rep_max}") + if rep >= re.prog[state.pc].rep_min && rep <= re.prog[state.pc].rep_max { + return state.first_match, state.i + } + // println("Program not finished! ") + return regex.no_match_found, 0 + } + if src_end { + // println("program end") + return state.first_match, state.i + } + // print("No match found!!") + return regex.no_match_found, 0 + } else { + // println("Group match! OK") + // println("first_match: $state.first_match, i: $state.i") + + // println("Skip last group") + return state.first_match, state.i + // return state.first_match,re.group_stack[state.group_index--] + } + } + // println("no_match_found, natural end") + return regex.no_match_found, 0 +} |