diff options
Diffstat (limited to 'v_windows/v/vlib/regex')
-rw-r--r-- | v_windows/v/vlib/regex/README.md | 874 | ||||
-rw-r--r-- | v_windows/v/vlib/regex/regex.v | 2324 | ||||
-rw-r--r-- | v_windows/v/vlib/regex/regex_opt.v | 53 | ||||
-rw-r--r-- | v_windows/v/vlib/regex/regex_test.v | 608 | ||||
-rw-r--r-- | v_windows/v/vlib/regex/regex_util.v | 436 |
5 files changed, 4295 insertions, 0 deletions
diff --git a/v_windows/v/vlib/regex/README.md b/v_windows/v/vlib/regex/README.md new file mode 100644 index 0000000..0faa833 --- /dev/null +++ b/v_windows/v/vlib/regex/README.md @@ -0,0 +1,874 @@ +# V RegEx (Regular expression) 1.0 alpha + +[TOC] + +## Introduction + +Here are the assumptions made during the writing of the implementation, that +are valid for all the `regex` module features: + +1. The matching stops at the end of the string, *not* at newline characters. + +2. The basic atomic elements of this regex engine are the tokens. +In a query string a simple character is a token. + + +## Differences with PCRE: + +NB: We must point out that the **V-Regex module is not PCRE compliant** and thus +some behaviour will be different. This difference is due to the V philosophy, +to have one way and keep it simple. + +The main differences can be summarized in the following points: + +- The basic element **is the token not the sequence of symbols**, and the most +simple token, is a single character. + +- `|` **the OR operator acts on tokens,** for example `abc|ebc` is not +`abc` OR `ebc`. Instead it is evaluated like `ab`, followed by `c OR e`, +followed by `bc`, because the **token is the base element**, +not the sequence of symbols. + +- The **match operation stops at the end of the string**. It does *NOT* stop +at new line characters. + + +## Tokens + +The tokens are the atomic units, used by this regex engine. +They can be one of the following: + + +### Simple char + +This token is a simple single character like `a` or `b` etc. + + +### Match positional delimiters + +`^` Matches the start of the string. + +`$` Matches the end of the string. + + +### Char class (cc) + +The character classes match all the chars specified inside. Use square +brackets `[ ]` to enclose them. + +The sequence of the chars in the character class, is evaluated with an OR op. + +For example, the cc `[abc]`, matches any character, that is `a` or `b` or `c`, +but it doesn't match `C` or `z`. + +Inside a cc, it is possible to specify a "range" of characters, for example +`[ad-h]` is equivalent to writing `[adefgh]`. + +A cc can have different ranges at the same time, for example `[a-zA-z0-9]` +matches all the latin lowercase, uppercase and numeric characters. + +It is possible to negate the meaning of a cc, using the caret char at the +start of the cc like this: `[^abc]` . That matches every char that is NOT +`a` or `b` or `c`. + +A cc can contain meta-chars like: `[a-z\d]`, that match all the lowercase +latin chars `a-z` and all the digits `\d`. + +It is possible to mix all the properties of the char class together. + +NB: In order to match the `-` (minus) char, it must be preceded by + a backslash in the cc, for example `[\-_\d\a]` will match: + `-` minus, + `_` underscore, + `\d` numeric chars, + `\a` lower case chars. + +### Meta-chars + +A meta-char is specified by a backslash, before a character. +For example `\w` is the meta-char `w`. + +A meta-char can match different types of characters. + +* `\w` matches a word char char `[a-zA-Z0-9_]` +* `\W` matches a non word char +* `\d` matches a digit `[0-9]` +* `\D` matches a non digit +* `\s` matches a space char, one of `[' ','\t','\n','\r','\v','\f']` +* `\S` matches a non space char +* `\a` matches only a lowercase char `[a-z]` +* `\A` matches only an uppercase char `[A-Z]` + +### Quantifier + +Each token can have a quantifier, that specifies how many times the character +must be matched. + +#### **Short quantifiers** + +- `?` matches 0 or 1 time, `a?b` matches both `ab` or `b` +- `+` matches *at least* 1 time, for example, `a+` matches both `aaa` or `a` +- `*` matches 0 or more times, for example, `a*b` matches `aaab`, `ab` or `b` + +#### **Long quantifiers** + +- `{x}` matches exactly x times, `a{2}` matches `aa`, but not `aaa` or `a` +- `{min,}` matches at least min times, `a{2,}` matches `aaa` or `aa`, not `a` +- `{,max}` matches at least 0 times and at maximum max times, + for example, `a{,2}` matches `a` and `aa`, but doesn't match `aaa` +- `{min,max}` matches from min times, to max times, for example + `a{2,3}` matches `aa` and `aaa`, but doesn't match `a` or `aaaa` + +A long quantifier, may have a `greedy off` flag, that is the `?` +character after the brackets. `{2,4}?` means to match the minimum +number of possible tokens, in this case 2. + +### Dot char + +The dot is a particular meta-char, that matches "any char". + +It is simpler to explain it with an example: + +Suppose you have `abccc ddeef` as a source string, that you want to parse +with a regex. The following table show the query strings and the result of +parsing source string. + +| query string | result | +|--------------|-------------| +| `.*c` | `abc` | +| `.*dd` | `abcc dd` | +| `ab.*e` | `abccc dde` | +| `ab.{3} .*e` | `abccc dde` | +The dot matches any character, until the next token match is satisfied. + +**Important Note:** *Consecutive dots, for example `...`, are not allowed.* +*This will cause a syntax error. Use a quantifier instead.* + +### OR token + +The token `|`, means a logic OR operation between two consecutive tokens, +i.e. `a|b` matches a character that is `a` or `b`. + +The OR token can work in a "chained way": `a|(b)|cd ` means test first `a`, +if the char is not `a`, then test the group `(b)`, and if the group doesn't +match too, finally test the token `c`. + +NB: ** unlike in PCRE, the OR operation works at token level!** +It doesn't work at concatenation level! + +That also means, that a query string like `abc|bde` is not equal to +`(abc)|(bde)`, but instead to `ab(c|b)de. +The OR operation works only for `c|b`, not at char concatenation level. + +### Groups + +Groups are a method to create complex patterns with repetitions of blocks +of tokens. The groups are delimited by round brackets `( )`. Groups can be +nested. Like all other tokens, groups can have a quantifier too. + +`c(pa)+z` match `cpapaz` or `cpaz` or `cpapapaz` . + +`(c(pa)+z ?)+` matches `cpaz cpapaz cpapapaz` or `cpapaz` + +Lets analyze this last case, first we have the group `#0`, that is the most +outer round brackets `(...)+`. This group has a quantifier `+`, that say to +match its content *at least one time*. + +Then we have a simple char token `c`, and a second group `#1`: `(pa)+`. +This group also tries to match the sequence `pa`, *at least one time*, +as specified by the `+` quantifier. + +Then, we have another simple token `z` and another simple token ` ?`, +i.e. the space char (ascii code 32) followed by the `?` quantifier, +which means that the preceding space should be matched 0 or 1 time. + +This explains why the `(c(pa)+z ?)+` query string, +can match `cpaz cpapaz cpapapaz` . + +In this implementation the groups are "capture groups". This means that the +last temporal result for each group, can be retrieved from the `RE` struct. + +The "capture groups" are stored as indexes in the field `groups`, +that is an `[]int` inside the `RE` struct. + +**example:** + +```v oksyntax +text := 'cpaz cpapaz cpapapaz' +query := r'(c(pa)+z ?)+' +mut re := regex.regex_opt(query) or { panic(err) } +println(re.get_query()) +// #0(c#1(pa)+z ?)+ +// #0 and #1 are the ids of the groups, are shown if re.debug is 1 or 2 +start, end := re.match_string(text) +// [start=0, end=20] match => [cpaz cpapaz cpapapaz] +mut gi := 0 +for gi < re.groups.len { + if re.groups[gi] >= 0 { + println('${gi / 2} :[${text[re.groups[gi]..re.groups[gi + 1]]}]') + } + gi += 2 +} +// groups captured +// 0 :[cpapapaz] +// 1 :[pa] +``` + +**note:** *to show the `group id number` in the result of the `get_query()`* +*the flag `debug` of the RE object must be `1` or `2`* + +In order to simplify the use of the captured groups, it possible to use the +utility function: `get_group_list`. + +This function return a list of groups using this support struct: + +```v oksyntax +pub struct Re_group { +pub: + start int = -1 + end int = -1 +} +``` + +Here an example of use: + +```v oksyntax +/* +This simple function converts an HTML RGB value with 3 or 6 hex digits to +an u32 value, this function is not optimized and it is only for didatical +purpose. Example: #A0B0CC #A9F +*/ +fn convert_html_rgb(in_col string) u32 { + mut n_digit := if in_col.len == 4 { 1 } else { 2 } + mut col_mul := if in_col.len == 4 { 4 } else { 0 } + // this is the regex query, it use the V string interpolation to customize the regex query + // NOTE: if you want use escaped code you must use the r"" (raw) strings, + // *** please remember that the V interpoaltion doesn't work on raw strings. *** + query := '#([a-fA-F0-9]{$n_digit})([a-fA-F0-9]{$n_digit})([a-fA-F0-9]{$n_digit})' + mut re := regex.regex_opt(query) or { panic(err) } + start, end := re.match_string(in_col) + println('start: $start, end: $end') + mut res := u32(0) + if start >= 0 { + group_list := re.get_group_list() // this is the utility function + r := ('0x' + in_col[group_list[0].start..group_list[0].end]).int() << col_mul + g := ('0x' + in_col[group_list[1].start..group_list[1].end]).int() << col_mul + b := ('0x' + in_col[group_list[2].start..group_list[2].end]).int() << col_mul + println('r: $r g: $g b: $b') + res = u32(r) << 16 | u32(g) << 8 | u32(b) + } + return res +} +``` + +Others utility functions are `get_group_by_id` and `get_group_bounds_by_id` +that get directly the string of a group using its `id`: + +```v ignore +txt := "my used string...." +for g_index := 0; g_index < re.group_count ; g_index++ { + println("#${g_index} [${re.get_group_by_id(txt, g_index)}] \ + bounds: ${re.get_group_bounds_by_id(g_index)}") +} +``` + +More helper functions are listed in the **Groups query functions** section. + +### Groups Continuous saving + +In particular situations, it is useful to have a continuous group saving. +This is possible by initializing the `group_csave` field in the `RE` struct. + +This feature allows you to collect data in a continuous/streaming way. + +In the example, we can pass a text, followed by an integer list, +that we wish to collect. To achieve this task, we can use the continuous +group saving, by enabling the right flag: `re.group_csave_flag = true`. + +The `.group_csave` array will be filled then, following this logic: + +`re.group_csave[0]` - number of total saved records +`re.group_csave[1+n*3]` - id of the saved group +`re.group_csave[1+n*3]` - start index in the source string of the saved group +`re.group_csave[1+n*3]` - end index in the source string of the saved group + +The regex will save groups, until it finishes, or finds that the array has no +more space. If the space ends, no error is raised, and further records will +not be saved. + +```v ignore +import regex +fn main(){ + txt := "http://www.ciao.mondo/hello/pippo12_/pera.html" + query := r"(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+" + + mut re := regex.regex_opt(query) or { panic(err) } + //println(re.get_code()) // uncomment to see the print of the regex execution code + re.debug=2 // enable maximum log + println("String: ${txt}") + println("Query : ${re.get_query()}") + re.debug=0 // disable log + re.group_csave_flag = true + start, end := re.match_string(txt) + if start >= 0 { + println("Match ($start, $end) => [${txt[start..end]}]") + } else { + println("No Match") + } + + if re.group_csave_flag == true && start >= 0 && re.group_csave.len > 0{ + println("cg: $re.group_csave") + mut cs_i := 1 + for cs_i < re.group_csave[0]*3 { + g_id := re.group_csave[cs_i] + st := re.group_csave[cs_i+1] + en := re.group_csave[cs_i+2] + println("cg[$g_id] $st $en:[${txt[st..en]}]") + cs_i += 3 + } + } +} +``` + +The output will be: + +``` +String: http://www.ciao.mondo/hello/pippo12_/pera.html +Query : #0(?P<format>https?)|{8,14}#0(?P<format>ftps?)://#1(?P<token>[\w_]+.)+ +Match (0, 46) => [http://www.ciao.mondo/hello/pippo12_/pera.html] +cg: [8, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42, 1, 42, 46] +cg[0] 0 4:[http] +cg[1] 7 11:[www.] +cg[1] 11 16:[ciao.] +cg[1] 16 22:[mondo/] +cg[1] 22 28:[hello/] +cg[1] 28 37:[pippo12_/] +cg[1] 37 42:[pera.] +cg[1] 42 46:[html] +``` + +### Named capturing groups + +This regex module supports partially the question mark `?` PCRE syntax for groups. + +`(?:abcd)` **non capturing group**: the content of the group will not be saved. + +`(?P<mygroup>abcdef)` **named group:** the group content is saved and labeled +as `mygroup`. + +The label of the groups is saved in the `group_map` of the `RE` struct, +that is a map from `string` to `int`, where the value is the index in +`group_csave` list of indexes. + +Here is an example for how to use them: +```v ignore +import regex +fn main(){ + txt := "http://www.ciao.mondo/hello/pippo12_/pera.html" + query := r"(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+" + + mut re := regex.regex_opt(query) or { panic(err) } + //println(re.get_code()) // uncomment to see the print of the regex execution code + re.debug=2 // enable maximum log + println("String: ${txt}") + println("Query : ${re.get_query()}") + re.debug=0 // disable log + start, end := re.match_string(txt) + if start >= 0 { + println("Match ($start, $end) => [${txt[start..end]}]") + } else { + println("No Match") + } + + for name in re.group_map.keys() { + println("group:'$name' \t=> [${re.get_group_by_name(txt, name)}] \ + bounds: ${re.get_group_bounds_by_name(name)}") + } +} +``` + +Output: + +``` +String: http://www.ciao.mondo/hello/pippo12_/pera.html +Query : #0(?P<format>https?)|{8,14}#0(?P<format>ftps?)://#1(?P<token>[\w_]+.)+ +Match (0, 46) => [http://www.ciao.mondo/hello/pippo12_/pera.html] +group:'format' => [http] bounds: (0, 4) +group:'token' => [html] bounds: (42, 46) +``` + +In order to simplify the use of the named groups, it is possible to +use a name map in the `re` struct, using the function `re.get_group_by_name`. + +Here is a more complex example of using them: +```v oksyntax +// This function demostrate the use of the named groups +fn convert_html_rgb_n(in_col string) u32 { + mut n_digit := if in_col.len == 4 { 1 } else { 2 } + mut col_mul := if in_col.len == 4 { 4 } else { 0 } + query := '#(?P<red>[a-fA-F0-9]{$n_digit})' + '(?P<green>[a-fA-F0-9]{$n_digit})' + + '(?P<blue>[a-fA-F0-9]{$n_digit})' + mut re := regex.regex_opt(query) or { panic(err) } + start, end := re.match_string(in_col) + println('start: $start, end: $end') + mut res := u32(0) + if start >= 0 { + red_s, red_e := re.get_group_by_name('red') + r := ('0x' + in_col[red_s..red_e]).int() << col_mul + green_s, green_e := re.get_group_by_name('green') + g := ('0x' + in_col[green_s..green_e]).int() << col_mul + blue_s, blue_e := re.get_group_by_name('blue') + b := ('0x' + in_col[blue_s..blue_e]).int() << col_mul + println('r: $r g: $g b: $b') + res = u32(r) << 16 | u32(g) << 8 | u32(b) + } + return res +} +``` + +Other utilities are `get_group_by_name` and `get_group_bounds_by_name`, +that return the string of a group using its `name`: + +```v ignore +txt := "my used string...." +for name in re.group_map.keys() { + println("group:'$name' \t=> [${re.get_group_by_name(txt, name)}] \ + bounds: ${re.get_group_bounds_by_name(name)}") +} +``` + + + +### Groups query functions + +These functions are helpers to query the captured groups + +```v ignore +// get_group_bounds_by_name get a group boundaries by its name +pub fn (re RE) get_group_bounds_by_name(group_name string) (int, int) + +// get_group_by_name get a group string by its name +pub fn (re RE) get_group_by_name(group_name string) string + +// get_group_by_id get a group boundaries by its id +pub fn (re RE) get_group_bounds_by_id(group_id int) (int,int) + +// get_group_by_id get a group string by its id +pub fn (re RE) get_group_by_id(in_txt string, group_id int) string + +struct Re_group { +pub: + start int = -1 + end int = -1 +} + +// get_group_list return a list of Re_group for the found groups +pub fn (re RE) get_group_list() []Re_group +``` + +## Flags + +It is possible to set some flags in the regex parser, that change +the behavior of the parser itself. + +```v ignore +// example of flag settings +mut re := regex.new() +re.flag = regex.F_BIN +``` + +- `F_BIN`: parse a string as bytes, utf-8 management disabled. + +- `F_EFM`: exit on the first char matches in the query, used by the + find function. + +- `F_MS`: matches only if the index of the start match is 0, + same as `^` at the start of the query string. + +- `F_ME`: matches only if the end index of the match is the last char + of the input string, same as `$` end of query string. + +- `F_NL`: stop the matching if found a new line char `\n` or `\r` + +## Functions + +### Initializer + +These functions are helper that create the `RE` struct, +a `RE` struct can be created manually if you needed. + +#### **Simplified initializer** + +```v ignore +// regex create a regex object from the query string and compile it +pub fn regex_opt(in_query string) ?RE +``` + +#### **Base initializer** + +```v ignore +// new_regex create a REgex of small size, usually sufficient for ordinary use +pub fn new() RE + +``` +#### **Custom initialization** +For some particular needs, it is possible to initialize a fully customized regex: +```v ignore +pattern = r"ab(.*)(ac)" +// init custom regex +mut re := regex.RE{} +// max program length, can not be longer then the pattern +re.prog = []Token {len: pattern.len + 1} +// can not be more char class the the length of the pattern +re.cc = []CharClass{len: pattern.len} + +re.group_csave_flag = false // true enable continuos group saving if needed +re.group_max_nested = 128 // set max 128 group nested possible +re.group_max = pattern.len>>1 // we can't have more groups than the half of the pattern legth +re.group_stack = []int{len: re.group_max, init: -1} +re.group_data = []int{len: re.group_max, init: -1} +``` +### Compiling + +After an initializer is used, the regex expression must be compiled with: + +```v ignore +// compile compiles the REgex returning an error if the compilation fails +pub fn (re mut RE) compile_opt(in_txt string) ? +``` + +### Matching Functions + +These are the matching functions + +```v ignore +// match_string try to match the input string, return start and end index if found else start is -1 +pub fn (re mut RE) match_string(in_txt string) (int,int) + +``` + +## Find and Replace + +There are the following find and replace functions: + +#### Find functions + +```v ignore +// find try to find the first match in the input string +// return start and end index if found else start is -1 +pub fn (re mut RE) find(in_txt string) (int,int) + +// find_all find all the "non overlapping" occurrences of the matching pattern +// return a list of start end indexes like: [3,4,6,8] +// the matches are [3,4] and [6,8] +pub fn (re mut RE) find_all(in_txt string) []int + +// find_all find all the "non overlapping" occurrences of the matching pattern +// return a list of strings +// the result is like ["first match","secon match"] +pub fn (mut re RE) find_all_str(in_txt string) []string +``` + +#### Replace functions + +```v ignore +// replace return a string where the matches are replaced with the repl_str string, +// this function support groups in the replace string +pub fn (re mut RE) replace(in_txt string, repl string) string +``` + +replace string can include groups references: + +```v ignore +txt := "Today it is a good day." +query := r'(a\w)[ ,.]' +mut re := regex.regex_opt(query)? +res := re.replace(txt, r"__[\0]__") +``` + +in this example we used the group `0` in the replace string: `\0`, the result will be: + +``` +Today it is a good day. => Tod__[ay]__it is a good d__[ay]__ +``` + +**Note:** in the replace strings can be used only groups from `0` to `9`. + +If the usage of `groups` in the replace process, is not needed, it is possible +to use a quick function: + +```v ignore +// replace_simple return a string where the matches are replaced with the replace string +pub fn (mut re RE) replace_simple(in_txt string, repl string) string +``` + +#### Custom replace function + +For complex find and replace operations, you can use `replace_by_fn` . +The `replace_by_fn`, uses a custom replace callback function, thus +allowing customizations. The custom callback function is called for +every non overlapped find. + +The custom callback function must be of the type: + +```v ignore +// type of function used for custom replace +// in_txt source text +// start index of the start of the match in in_txt +// end index of the end of the match in in_txt +// --- the match is in in_txt[start..end] --- +fn (re RE, in_txt string, start int, end int) string +``` + +The following example will clarify its usage: + +```v ignore +import regex +// customized replace functions +// it will be called on each non overlapped find +fn my_repl(re regex.RE, in_txt string, start int, end int) string { + g0 := re.get_group_by_id(in_txt, 0) + g1 := re.get_group_by_id(in_txt, 1) + g2 := re.get_group_by_id(in_txt, 2) + return "*$g0*$g1*$g2*" +} + +fn main(){ + txt := "today [John] is gone to his house with (Jack) and [Marie]." + query := r"(.)(\A\w+)(.)" + + mut re := regex.regex_opt(query) or { panic(err) } + + result := re.replace_by_fn(txt, my_repl) + println(result) +} +``` + +Output: + +``` +today *[*John*]* is gone to his house with *(*Jack*)* and *[*Marie*]*. +``` + + + +## Debugging + +This module has few small utilities to you write regex patterns. + +### **Syntax errors highlight** + +The next example code shows how to visualize regex pattern syntax errors +in the compilation phase: + +```v oksyntax +query := r'ciao da ab[ab-]' +// there is an error, a range not closed!! +mut re := new() +re.compile_opt(query) or { println(err) } +// output!! +// query: ciao da ab[ab-] +// err : ----------^ +// ERROR: ERR_SYNTAX_ERROR +``` + +### **Compiled code** + +It is possible to view the compiled code calling the function `get_query()`. +The result will be something like this: + +``` +======================================== +v RegEx compiler v 1.0 alpha output: +PC: 0 ist: 92000000 ( GROUP_START #:0 { 1, 1} +PC: 1 ist: 98000000 . DOT_CHAR nx chk: 4 { 1, 1} +PC: 2 ist: 94000000 ) GROUP_END #:0 { 1, 1} +PC: 3 ist: 92000000 ( GROUP_START #:1 { 1, 1} +PC: 4 ist: 90000000 [\A] BSLS { 1, 1} +PC: 5 ist: 90000000 [\w] BSLS { 1,MAX} +PC: 6 ist: 94000000 ) GROUP_END #:1 { 1, 1} +PC: 7 ist: 92000000 ( GROUP_START #:2 { 1, 1} +PC: 8 ist: 98000000 . DOT_CHAR nx chk: -1 last! { 1, 1} +PC: 9 ist: 94000000 ) GROUP_END #:2 { 1, 1} +PC: 10 ist: 88000000 PROG_END { 0, 0} +======================================== + +``` + +`PC`:`int` is the program counter or step of execution, each single step is a token. + +`ist`:`hex` is the token instruction id. + +`[a]` is the char used by the token. + +`query_ch` is the type of token. + +`{m,n}` is the quantifier, the greedy off flag `?` will be showed if present in the token + +### **Log debug** + +The log debugger allow to print the status of the regex parser when the +parser is running. It is possible to have two different levels of +debug information: 1 is normal, while 2 is verbose. + +Here is an example: + +*normal* - list only the token instruction with their values + +```ignore +// re.flag = 1 // log level normal +flags: 00000000 +# 2 s: ist_load PC: i,ch,len:[ 0,'a',1] f.m:[ -1, -1] query_ch: [a]{1,1}:0 (#-1) +# 5 s: ist_load PC: i,ch,len:[ 1,'b',1] f.m:[ 0, 0] query_ch: [b]{2,3}:0? (#-1) +# 7 s: ist_load PC: i,ch,len:[ 2,'b',1] f.m:[ 0, 1] query_ch: [b]{2,3}:1? (#-1) +# 10 PROG_END +``` + +*verbose* - list all the instructions and states of the parser + +```ignore +flags: 00000000 +# 0 s: start PC: NA +# 1 s: ist_next PC: NA +# 2 s: ist_load PC: i,ch,len:[ 0,'a',1] f.m:[ -1, -1] query_ch: [a]{1,1}:0 (#-1) +# 3 s: ist_quant_p PC: i,ch,len:[ 1,'b',1] f.m:[ 0, 0] query_ch: [a]{1,1}:1 (#-1) +# 4 s: ist_next PC: NA +# 5 s: ist_load PC: i,ch,len:[ 1,'b',1] f.m:[ 0, 0] query_ch: [b]{2,3}:0? (#-1) +# 6 s: ist_quant_p PC: i,ch,len:[ 2,'b',1] f.m:[ 0, 1] query_ch: [b]{2,3}:1? (#-1) +# 7 s: ist_load PC: i,ch,len:[ 2,'b',1] f.m:[ 0, 1] query_ch: [b]{2,3}:1? (#-1) +# 8 s: ist_quant_p PC: i,ch,len:[ 3,'b',1] f.m:[ 0, 2] query_ch: [b]{2,3}:2? (#-1) +# 9 s: ist_next PC: NA +# 10 PROG_END +# 11 PROG_END +``` + +the columns have the following meaning: + +`# 2` number of actual steps from the start of parsing + +`s: ist_next` state of the present step + +`PC: 1` program counter of the step + +`=>7fffffff ` hex code of the instruction + +`i,ch,len:[ 0,'a',1]` `i` index in the source string, `ch` the char parsed, +`len` the length in byte of the char parsed + +`f.m:[ 0, 1]` `f` index of the first match in the source string, `m` index that is actual matching + +`query_ch: [b]` token in use and its char + +`{2,3}:1?` quantifier `{min,max}`, `:1` is the actual counter of repetition, +`?` is the greedy off flag if present. + +### **Custom Logger output** + +The debug functions output uses the `stdout` as default, +it is possible to provide an alternative output, by setting a custom +output function: + +```v oksyntax +// custom print function, the input will be the regex debug string +fn custom_print(txt string) { + println('my log: $txt') +} + +mut re := new() +re.log_func = custom_print +// every debug output from now will call this function +``` + +## Example code + +Here an example that perform some basically match of strings + +```v ignore +import regex + +fn main(){ + txt := "http://www.ciao.mondo/hello/pippo12_/pera.html" + query := r"(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+" + + mut re := regex.regex_opt(query) or { panic(err) } + + start, end := re.match_string(txt) + if start >= 0 { + println("Match ($start, $end) => [${txt[start..end]}]") + for g_index := 0; g_index < re.group_count ; g_index++ { + println("#${g_index} [${re.get_group_by_id(txt, g_index)}] \ + bounds: ${re.get_group_bounds_by_id(g_index)}") + } + for name in re.group_map.keys() { + println("group:'$name' \t=> [${re.get_group_by_name(txt, name)}] \ + bounds: ${re.get_group_bounds_by_name(name)}") + } + } else { + println("No Match") + } +} +``` +Here an example of total customization of the regex environment creation: +```v ignore +import regex + +fn main(){ + txt := "today John is gone to his house with Jack and Marie." + query := r"(?:(?P<word>\A\w+)|(?:\a\w+)[\s.]?)+" + + // init regex + mut re := regex.RE{} + // max program length, can not be longer then the query + re.prog = []regex.Token {len: query.len + 1} + // can not be more char class the the length of the query + re.cc = []regex.CharClass{len: query.len} + re.prog = []regex.Token {len: query.len+1} + // enable continuos group saving + re.group_csave_flag = true + // set max 128 group nested + re.group_max_nested = 128 + // we can't have more groups than the half of the query legth + re.group_max = query.len>>1 + + // compile the query + re.compile_opt(query) or { panic(err) } + + start, end := re.match_string(txt) + if start >= 0 { + println("Match ($start, $end) => [${txt[start..end]}]") + } else { + println("No Match") + } + + // show results for continuos group saving + if re.group_csave_flag == true && start >= 0 && re.group_csave.len > 0{ + println("cg: $re.group_csave") + mut cs_i := 1 + for cs_i < re.group_csave[0]*3 { + g_id := re.group_csave[cs_i] + st := re.group_csave[cs_i+1] + en := re.group_csave[cs_i+2] + println("cg[$g_id] $st $en:[${txt[st..en]}]") + cs_i += 3 + } + } + + // show results for captured groups + if start >= 0 { + println("Match ($start, $end) => [${txt[start..end]}]") + for g_index := 0; g_index < re.group_count ; g_index++ { + println("#${g_index} [${re.get_group_by_id(txt, g_index)}] \ + bounds: ${re.get_group_bounds_by_id(g_index)}") + } + for name in re.group_map.keys() { + println("group:'$name' \t=> [${re.get_group_by_name(txt, name)}] \ + bounds: ${re.get_group_bounds_by_name(name)}") + } + } else { + println("No Match") + } +} +``` + +More examples are available in the test code for the `regex` module, +see `vlib/regex/regex_test.v`. diff --git a/v_windows/v/vlib/regex/regex.v b/v_windows/v/vlib/regex/regex.v new file mode 100644 index 0000000..9e630e1 --- /dev/null +++ b/v_windows/v/vlib/regex/regex.v @@ -0,0 +1,2324 @@ +/* +regex 1.0 alpha + +Copyright (c) 2019-2021 Dario Deledda. All rights reserved. +Use of this source code is governed by an MIT license +that can be found in the LICENSE file. + +This file contains regex module + +Know limitation: +- find is implemented in a trivial way +- not full compliant PCRE +- not compliant POSIX ERE +*/ +module regex + +import strings + +pub const ( + v_regex_version = '1.0 alpha' // regex module version + + max_code_len = 256 // default small base code len for the regex programs + max_quantifier = 1073741824 // default max repetitions allowed for the quantifiers = 2^30 + // spaces chars (here only westerns!!) TODO: manage all the spaces from unicode + spaces = [` `, `\t`, `\n`, `\r`, `\v`, `\f`] + // new line chars for now only '\n' + new_line_list = [`\n`, `\r`] + + // Results + no_match_found = -1 + + // Errors + compile_ok = 0 // the regex string compiled, all ok + err_char_unknown = -2 // the char used is unknow to the system + err_undefined = -3 // the compiler symbol is undefined + err_internal_error = -4 // Bug in the regex system!! + err_cc_alloc_overflow = -5 // memory for char class full!! + err_syntax_error = -6 // syntax error in regex compiling + err_groups_overflow = -7 // max number of groups reached + err_groups_max_nested = -8 // max number of nested group reached + err_group_not_balanced = -9 // group not balanced + err_group_qm_notation = -10 // group invalid notation +) + +const ( + //************************************* + // regex program instructions + //************************************* + ist_simple_char = u32(0x7FFFFFFF) // single char instruction, 31 bit available to char + // char class 11 0100 AA xxxxxxxx + // AA = 00 regular class + // AA = 01 Negated class ^ char + ist_char_class = 0xD1000000 // MASK + ist_char_class_pos = 0xD0000000 // char class normal [abc] + ist_char_class_neg = 0xD1000000 // char class negate [^abc] + // dot char 10 0110 xx xxxxxxxx + ist_dot_char = 0x98000000 // match any char except \n + // backslash chars 10 0100 xx xxxxxxxx + ist_bsls_char = 0x90000000 // backslash char + // OR | 10 010Y xx xxxxxxxx + ist_or_branch = 0x91000000 // OR case + // groups 10 010Y xx xxxxxxxx + ist_group_start = 0x92000000 // group start ( + ist_group_end = 0x94000000 // group end ) + // control instructions + ist_prog_end = u32(0x88000000) // 10 0010 xx xxxxxxxx + //************************************* +) + +/* +General Utilities +*/ +// utf8util_char_len calculate the length in bytes of a utf8 char +[inline] +fn utf8util_char_len(b byte) int { + return ((0xe5000000 >> ((b >> 3) & 0x1e)) & 3) + 1 +} + +// get_char get a char from position i and return an u32 with the unicode code +[direct_array_access; inline] +fn (re RE) get_char(in_txt string, i int) (u32, int) { + ini := unsafe { in_txt.str[i] } + // ascii 8 bit + if (re.flag & regex.f_bin) != 0 || ini & 0x80 == 0 { + return u32(ini), 1 + } + // unicode char + char_len := utf8util_char_len(ini) + mut tmp := 0 + mut ch := u32(0) + for tmp < char_len { + ch = (ch << 8) | unsafe { in_txt.str[i + tmp] } + tmp++ + } + return ch, char_len +} + +// get_charb get a char from position i and return an u32 with the unicode code +[direct_array_access; inline] +fn (re RE) get_charb(in_txt &byte, i int) (u32, int) { + // ascii 8 bit + if (re.flag & regex.f_bin) != 0 || unsafe { in_txt[i] } & 0x80 == 0 { + return u32(unsafe { in_txt[i] }), 1 + } + // unicode char + char_len := utf8util_char_len(unsafe { in_txt[i] }) + mut tmp := 0 + mut ch := u32(0) + for tmp < char_len { + ch = (ch << 8) | unsafe { in_txt[i + tmp] } + tmp++ + } + return ch, char_len +} + +[inline] +fn is_alnum(in_char byte) bool { + mut tmp := in_char - `A` + if tmp <= 25 { + return true + } + tmp = in_char - `a` + if tmp <= 25 { + return true + } + tmp = in_char - `0` + if tmp <= 9 { + return true + } + if in_char == `_` { + return true + } + return false +} + +[inline] +fn is_not_alnum(in_char byte) bool { + return !is_alnum(in_char) +} + +[inline] +fn is_space(in_char byte) bool { + return in_char in regex.spaces +} + +[inline] +fn is_not_space(in_char byte) bool { + return !is_space(in_char) +} + +[inline] +fn is_digit(in_char byte) bool { + tmp := in_char - `0` + return tmp <= 0x09 +} + +[inline] +fn is_not_digit(in_char byte) bool { + return !is_digit(in_char) +} + +/* +[inline] +fn is_wordchar(in_char byte) bool { + return is_alnum(in_char) || in_char == `_` +} + +[inline] +fn is_not_wordchar(in_char byte) bool { + return !is_alnum(in_char) +} +*/ + +[inline] +fn is_lower(in_char byte) bool { + tmp := in_char - `a` + return tmp <= 25 +} + +[inline] +fn is_upper(in_char byte) bool { + tmp := in_char - `A` + return tmp <= 25 +} + +pub fn (re RE) get_parse_error_string(err int) string { + match err { + regex.compile_ok { return 'compile_ok' } + regex.no_match_found { return 'no_match_found' } + regex.err_char_unknown { return 'err_char_unknown' } + regex.err_undefined { return 'err_undefined' } + regex.err_internal_error { return 'err_internal_error' } + regex.err_cc_alloc_overflow { return 'err_cc_alloc_overflow' } + regex.err_syntax_error { return 'err_syntax_error' } + regex.err_groups_overflow { return 'err_groups_overflow' } + regex.err_groups_max_nested { return 'err_groups_max_nested' } + regex.err_group_not_balanced { return 'err_group_not_balanced' } + regex.err_group_qm_notation { return 'err_group_qm_notation' } + else { return 'err_unknown' } + } +} + +// utf8_str convert and utf8 sequence to a printable string +[inline] +fn utf8_str(ch rune) string { + mut i := 4 + mut res := '' + for i > 0 { + v := byte((ch >> ((i - 1) * 8)) & 0xFF) + if v != 0 { + res += '${v:1c}' + } + i-- + } + return res +} + +// simple_log default log function +fn simple_log(txt string) { + print(txt) +} + +/****************************************************************************** +* +* Token Structs +* +******************************************************************************/ +pub type FnValidator = fn (byte) bool + +struct Token { +mut: + ist rune + // char + ch rune // char of the token if any + ch_len byte // char len + // Quantifiers / branch + rep_min int // used also for jump next in the OR branch [no match] pc jump + rep_max int // used also for jump next in the OR branch [ match] pc jump + greedy bool // greedy quantifier flag + // Char class + cc_index int = -1 + // counters for quantifier check (repetitions) + rep int + // validator function pointer + validator FnValidator + // groups variables + group_rep int // repetition of the group + group_id int = -1 // id of the group + goto_pc int = -1 // jump to this PC if is needed + // OR flag for the token + next_is_or bool // true if the next token is an OR + // dot_char token variables + dot_check_pc int = -1 // pc of the next token to check + last_dot_flag bool // if true indicate that is the last dot_char in the regex +} + +[inline] +fn (mut tok Token) reset() { + tok.rep = 0 +} + +/****************************************************************************** +* +* Regex struct +* +******************************************************************************/ +pub const ( + f_nl = 0x00000001 // end the match when find a new line symbol + f_ms = 0x00000002 // match true only if the match is at the start of the string + f_me = 0x00000004 // match true only if the match is at the end of the string + + f_efm = 0x00000100 // exit on first token matched, used by search + f_bin = 0x00000200 // work only on bytes, ignore utf-8 + // behaviour modifier flags + f_src = 0x00020000 // search mode enabled +) + +struct StateDotObj { +mut: + i int = -1 // char index in the input buffer + pc int = -1 // program counter saved + mi int = -1 // match_index saved + group_stack_index int = -1 // continuous save on capturing groups +} + +pub type FnLog = fn (string) + +pub struct RE { +pub mut: + prog []Token + prog_len int // regex program len + // char classes storage + cc []CharClass // char class list + cc_index int // index + // groups + group_count int // number of groups in this regex struct + groups []int // groups index results + group_max_nested int = 3 // max nested group + group_max int = 8 // max allowed number of different groups + + state_list []StateObj + + group_csave_flag bool // flag to enable continuous saving + group_csave []int //= []int{} // groups continuous save list + + group_map map[string]int // groups names map + + group_stack []int + group_data []int + // flags + flag int // flag for optional parameters + // Debug/log + debug int // enable in order to have the unroll of the code 0 = NO_DEBUG, 1 = LIGHT 2 = VERBOSE + log_func FnLog = simple_log // log function, can be customized by the user + query string // query string +} + +// Reset RE object +[direct_array_access; inline] +fn (mut re RE) reset() { + re.cc_index = 0 + + mut i := 0 + for i < re.prog_len { + re.prog[i].group_rep = 0 // clear repetition of the group + re.prog[i].rep = 0 // clear repetition of the token + i++ + } + + // init groups array + if re.group_count > 0 { + re.groups = []int{len: re.group_count * 2, init: -1} + } + + // reset group_csave + if re.group_csave_flag == true { + re.group_csave.clear() // = []int{} + } + + // reset state list + re.state_list.clear() + re.group_stack.clear() +} + +// reset for search mode fail +// gcc bug, dont use [inline] or go 5 time slower +//[inline] +[direct_array_access] +fn (mut re RE) reset_src() { + mut i := 0 + for i < re.prog_len { + re.prog[i].group_rep = 0 // clear repetition of the group + re.prog[i].rep = 0 // clear repetition of the token + i++ + } +} + +/****************************************************************************** +* +* Backslashes chars +* +******************************************************************************/ +struct BslsStruct { + ch rune // meta char + validator FnValidator // validator function pointer +} + +const ( + bsls_validator_array = [ + BslsStruct{`w`, is_alnum}, + BslsStruct{`W`, is_not_alnum}, + BslsStruct{`s`, is_space}, + BslsStruct{`S`, is_not_space}, + BslsStruct{`d`, is_digit}, + BslsStruct{`D`, is_not_digit}, + BslsStruct{`a`, is_lower}, + BslsStruct{`A`, is_upper}, + ] + + // these chars are escape if preceded by a \ + bsls_escape_list = [`\\`, `|`, `.`, `:`, `*`, `+`, `-`, `{`, `}`, `[`, `]`, `(`, `)`, `?`, + `^`, `!`] +) + +enum BSLS_parse_state { + start + bsls_found + bsls_char + normal_char +} + +// parse_bsls return (index, str_len) bsls_validator_array index, len of the backslash sequence if present +fn (re RE) parse_bsls(in_txt string, in_i int) (int, int) { + mut status := BSLS_parse_state.start + mut i := in_i + + for i < in_txt.len { + // get our char + char_tmp, char_len := re.get_char(in_txt, i) + ch := byte(char_tmp) + + if status == .start && ch == `\\` { + status = .bsls_found + i += char_len + continue + } + + // check if is our bsls char, for now only one length sequence + if status == .bsls_found { + for c, x in regex.bsls_validator_array { + if x.ch == ch { + return c, i - in_i + 1 + } + } + status = .normal_char + continue + } + + // no BSLS validator, manage as normal escape char char + if status == .normal_char { + if ch in regex.bsls_escape_list { + return regex.no_match_found, i - in_i + 1 + } + return regex.err_syntax_error, i - in_i + 1 + } + + // at the present time we manage only one char after the \ + break + } + // not our bsls return KO + return regex.err_syntax_error, i +} + +/****************************************************************************** +* +* Char class +* +******************************************************************************/ +const ( + cc_null = 0 // empty cc token + cc_char = 1 // simple char: a + cc_int = 2 // char interval: a-z + cc_bsls = 3 // backslash char + cc_end = 4 // cc sequence terminator +) + +struct CharClass { +mut: + cc_type int = regex.cc_null // type of cc token + ch0 rune // first char of the interval a-b a in this case + ch1 rune // second char of the interval a-b b in this case + validator FnValidator // validator function pointer +} + +enum CharClass_parse_state { + start + in_char + in_bsls + separator + finish +} + +fn (re RE) get_char_class(pc int) string { + buf := []byte{len: (re.cc.len)} + mut buf_ptr := unsafe { &byte(&buf) } + + mut cc_i := re.prog[pc].cc_index + mut i := 0 + mut tmp := 0 + for cc_i >= 0 && cc_i < re.cc.len && re.cc[cc_i].cc_type != regex.cc_end { + if re.cc[cc_i].cc_type == regex.cc_bsls { + unsafe { + buf_ptr[i] = `\\` + i++ + buf_ptr[i] = byte(re.cc[cc_i].ch0) + i++ + } + } else if re.cc[cc_i].ch0 == re.cc[cc_i].ch1 { + tmp = 3 + for tmp >= 0 { + x := byte((re.cc[cc_i].ch0 >> (tmp * 8)) & 0xFF) + if x != 0 { + unsafe { + buf_ptr[i] = x + i++ + } + } + tmp-- + } + } else { + tmp = 3 + for tmp >= 0 { + x := byte((re.cc[cc_i].ch0 >> (tmp * 8)) & 0xFF) + if x != 0 { + unsafe { + buf_ptr[i] = x + i++ + } + } + tmp-- + } + unsafe { + buf_ptr[i] = `-` + i++ + } + tmp = 3 + for tmp >= 0 { + x := byte((re.cc[cc_i].ch1 >> (tmp * 8)) & 0xFF) + if x != 0 { + unsafe { + buf_ptr[i] = x + i++ + } + } + tmp-- + } + } + cc_i++ + } + unsafe { + buf_ptr[i] = byte(0) + } + return unsafe { tos_clone(buf_ptr) } +} + +fn (re RE) check_char_class(pc int, ch rune) bool { + mut cc_i := re.prog[pc].cc_index + for cc_i >= 0 && cc_i < re.cc.len && re.cc[cc_i].cc_type != regex.cc_end { + if re.cc[cc_i].cc_type == regex.cc_bsls { + if re.cc[cc_i].validator(byte(ch)) { + return true + } + } else if ch >= re.cc[cc_i].ch0 && ch <= re.cc[cc_i].ch1 { + return true + } + cc_i++ + } + return false +} + +// parse_char_class return (index, str_len, cc_type) of a char class [abcm-p], char class start after the [ char +fn (mut re RE) parse_char_class(in_txt string, in_i int) (int, int, rune) { + mut status := CharClass_parse_state.start + mut i := in_i + + mut tmp_index := re.cc_index + res_index := re.cc_index + + mut cc_type := u32(regex.ist_char_class_pos) + + for i < in_txt.len { + // check if we are out of memory for char classes + if tmp_index >= re.cc.len { + return regex.err_cc_alloc_overflow, 0, u32(0) + } + + // get our char + char_tmp, char_len := re.get_char(in_txt, i) + ch := byte(char_tmp) + + // println("CC #${i:3d} ch: ${ch:c}") + + // negation + if status == .start && ch == `^` { + cc_type = u32(regex.ist_char_class_neg) + i += char_len + continue + } + + // minus symbol + if status == .start && ch == `-` { + re.cc[tmp_index].cc_type = regex.cc_char + re.cc[tmp_index].ch0 = char_tmp + re.cc[tmp_index].ch1 = char_tmp + i += char_len + tmp_index++ + continue + } + + // bsls + if (status == .start || status == .in_char) && ch == `\\` { + // println("CC bsls.") + status = .in_bsls + i += char_len + continue + } + + if status == .in_bsls { + // println("CC bsls validation.") + for c, x in regex.bsls_validator_array { + if x.ch == ch { + // println("CC bsls found [${ch:c}]") + re.cc[tmp_index].cc_type = regex.cc_bsls + re.cc[tmp_index].ch0 = regex.bsls_validator_array[c].ch + re.cc[tmp_index].ch1 = regex.bsls_validator_array[c].ch + re.cc[tmp_index].validator = regex.bsls_validator_array[c].validator + i += char_len + tmp_index++ + status = .in_char + break + } + } + if status == .in_bsls { + // manage as a simple char + // println("CC bsls not found [${ch:c}]") + re.cc[tmp_index].cc_type = regex.cc_char + re.cc[tmp_index].ch0 = char_tmp + re.cc[tmp_index].ch1 = char_tmp + i += char_len + tmp_index++ + status = .in_char + continue + } else { + continue + } + } + + // simple char + if (status == .start || status == .in_char) && ch != `-` && ch != `]` { + status = .in_char + + re.cc[tmp_index].cc_type = regex.cc_char + re.cc[tmp_index].ch0 = char_tmp + re.cc[tmp_index].ch1 = char_tmp + + i += char_len + tmp_index++ + continue + } + + // check range separator + if status == .in_char && ch == `-` { + status = .separator + i += char_len + continue + } + + // check range end + if status == .separator && ch != `]` && ch != `-` { + status = .in_char + re.cc[tmp_index - 1].cc_type = regex.cc_int + re.cc[tmp_index - 1].ch1 = char_tmp + i += char_len + continue + } + + // char class end + if status == .in_char && ch == `]` { + re.cc[tmp_index].cc_type = regex.cc_end + re.cc[tmp_index].ch0 = 0 + re.cc[tmp_index].ch1 = 0 + re.cc_index = tmp_index + 1 + + return res_index, i - in_i + 2, cc_type + } + + i++ + } + return regex.err_syntax_error, 0, u32(0) +} + +/****************************************************************************** +* +* Re Compiler +* +******************************************************************************/ +// +// Quantifier +// +enum Quant_parse_state { + start + min_parse + comma_checked + max_parse + greedy + gredy_parse + finish +} + +// parse_quantifier return (min, max, str_len, greedy_flag) of a {min,max}? quantifier starting after the { char +fn (re RE) parse_quantifier(in_txt string, in_i int) (int, int, int, bool) { + mut status := Quant_parse_state.start + mut i := in_i + + mut q_min := 0 // default min in a {} quantifier is 1 + mut q_max := 0 // deafult max in a {} quantifier is max_quantifier + + mut ch := byte(0) + + for i < in_txt.len { + unsafe { + ch = in_txt.str[i] + } + // println("${ch:c} status: $status") + + // exit on no compatible char with {} quantifier + if utf8util_char_len(ch) != 1 { + return regex.err_syntax_error, i, 0, false + } + + // min parsing skip if comma present + if status == .start && ch == `,` { + q_min = 0 // default min in a {} quantifier is 0 + status = .comma_checked + i++ + continue + } + + if status == .start && is_digit(ch) { + status = .min_parse + q_min *= 10 + q_min += int(ch - `0`) + i++ + continue + } + + if status == .min_parse && is_digit(ch) { + q_min *= 10 + q_min += int(ch - `0`) + i++ + continue + } + + // we have parsed the min, now check the max + if status == .min_parse && ch == `,` { + status = .comma_checked + i++ + continue + } + + // single value {4} + if status == .min_parse && ch == `}` { + q_max = q_min + status = .greedy + continue + } + + // end without max + if status == .comma_checked && ch == `}` { + q_max = regex.max_quantifier + status = .greedy + continue + } + + // start max parsing + if status == .comma_checked && is_digit(ch) { + status = .max_parse + q_max *= 10 + q_max += int(ch - `0`) + i++ + continue + } + + // parse the max + if status == .max_parse && is_digit(ch) { + q_max *= 10 + q_max += int(ch - `0`) + i++ + continue + } + + // finished the quantifier + if status == .max_parse && ch == `}` { + status = .greedy + continue + } + + // check if greedy flag char ? is present + if status == .greedy { + if i + 1 < in_txt.len { + i++ + status = .gredy_parse + continue + } + return q_min, q_max, i - in_i + 2, false + } + + // check the greedy flag + if status == .gredy_parse { + if ch == `?` { + return q_min, q_max, i - in_i + 2, true + } else { + i-- + return q_min, q_max, i - in_i + 2, false + } + } + + // not a {} quantifier, exit + return regex.err_syntax_error, i, 0, false + } + + // not a conform {} quantifier + return regex.err_syntax_error, i, 0, false +} + +// +// Groups +// +enum Group_parse_state { + start + q_mark // (? + q_mark1 // (?:|P checking + p_status // (?P + p_start // (?P< + p_end // (?P<...> + p_in_name // (?P<... + finish +} + +// parse_groups parse a group for ? (question mark) syntax, if found, return (error, capture_flag, name_of_the_group, next_index) +fn (re RE) parse_groups(in_txt string, in_i int) (int, bool, string, int) { + mut status := Group_parse_state.start + mut i := in_i + mut name := '' + + for i < in_txt.len && status != .finish { + // get our char + char_tmp, char_len := re.get_char(in_txt, i) + ch := byte(char_tmp) + + // start + if status == .start && ch == `(` { + status = .q_mark + i += char_len + continue + } + + // check for question marks + if status == .q_mark && ch == `?` { + status = .q_mark1 + i += char_len + continue + } + + // non capturing group + if status == .q_mark1 && ch == `:` { + i += char_len + return 0, false, name, i + } + + // enter in P section + if status == .q_mark1 && ch == `P` { + status = .p_status + i += char_len + continue + } + + // not a valid q mark found + if status == .q_mark1 { + // println("NO VALID Q MARK") + return -2, true, name, i + } + + if status == .p_status && ch == `<` { + status = .p_start + i += char_len + continue + } + + if status == .p_start && ch != `>` { + status = .p_in_name + name += '${ch:1c}' // TODO: manage utf8 chars + i += char_len + continue + } + + // colect name + if status == .p_in_name && ch != `>` && is_alnum(ch) { + name += '${ch:1c}' // TODO: manage utf8 chars + i += char_len + continue + } + + // end name + if status == .p_in_name && ch == `>` { + i += char_len + return 0, true, name, i + } + + // error on name group + if status == .p_in_name { + return -2, true, name, i + } + + // normal group, nothig to do, exit + return 0, true, name, i + } + // UNREACHABLE + // println("ERROR!! NOT MEANT TO BE HERE!!1") + return -2, true, name, i +} + +// +// main compiler +// +// compile return (return code, index) where index is the index of the error in the query string if return code is an error code +fn (mut re RE) impl_compile(in_txt string) (int, int) { + mut i := 0 // input string index + mut pc := 0 // program counter + + // group management variables + mut group_count := -1 + mut group_stack := []int{len: re.group_max_nested, init: 0} + mut group_stack_txt_index := []int{len: re.group_max_nested, init: -1} + mut group_stack_index := -1 + + re.query = in_txt // save the query string + + i = 0 + for i < in_txt.len { + mut char_tmp := u32(0) + mut char_len := 0 + // println("i: ${i:3d} ch: ${in_txt.str[i]:c}") + + char_tmp, char_len = re.get_char(in_txt, i) + + // + // check special cases: $ ^ + // + if char_len == 1 && i == 0 && byte(char_tmp) == `^` { + re.flag = regex.f_ms + i = i + char_len + continue + } + if char_len == 1 && i == (in_txt.len - 1) && byte(char_tmp) == `$` { + re.flag = regex.f_me + i = i + char_len + continue + } + + // ist_group_start + if char_len == 1 && pc >= 0 && byte(char_tmp) == `(` { + // check max groups allowed + if group_count > re.group_max { + return regex.err_groups_overflow, i + 1 + } + group_stack_index++ + + // check max nested groups allowed + if group_stack_index > re.group_max_nested { + return regex.err_groups_max_nested, i + 1 + } + + tmp_res, cgroup_flag, cgroup_name, next_i := re.parse_groups(in_txt, i) + + // manage question mark format error + if tmp_res < -1 { + return regex.err_group_qm_notation, next_i + } + + // println("Parse group: [$tmp_res, $cgroup_flag, ($i,$next_i), '${in_txt[i..next_i]}' ]") + i = next_i + + if cgroup_flag == true { + group_count++ + } + + // calculate the group id + // if it is a named group, recycle the group id + // NOTE: **** the group index is +1 because map return 0 when not found!! **** + mut group_id := group_count + if cgroup_name.len > 0 { + // println("GROUP NAME: ${cgroup_name}") + if cgroup_name in re.group_map { + group_id = re.group_map[cgroup_name] - 1 + group_count-- + } else { + re.group_map[cgroup_name] = group_id + 1 + } + } + + group_stack_txt_index[group_stack_index] = i + group_stack[group_stack_index] = pc + + re.prog[pc].ist = u32(0) | regex.ist_group_start + re.prog[pc].rep_min = 1 + re.prog[pc].rep_max = 1 + + // set the group id + if cgroup_flag == false { + // println("NO CAPTURE GROUP") + re.prog[pc].group_id = -1 + } else { + re.prog[pc].group_id = group_id + } + + pc = pc + 1 + continue + } + + // ist_group_end + if char_len == 1 && pc > 0 && byte(char_tmp) == `)` { + if group_stack_index < 0 { + return regex.err_group_not_balanced, i + 1 + } + + goto_pc := group_stack[group_stack_index] + group_stack_index-- + + re.prog[pc].ist = u32(0) | regex.ist_group_end + re.prog[pc].rep_min = 1 + re.prog[pc].rep_max = 1 + + re.prog[pc].goto_pc = goto_pc // PC where to jump if a group need + re.prog[pc].group_id = re.prog[goto_pc].group_id // id of this group, used for storing data + + re.prog[goto_pc].goto_pc = pc // start goto point to the end group pc + // re.prog[goto_pc].group_id = group_count // id of this group, used for storing data + + pc = pc + 1 + i = i + char_len + continue + } + + // ist_dot_char match any char except the following token + if char_len == 1 && pc >= 0 && byte(char_tmp) == `.` { + re.prog[pc].ist = u32(0) | regex.ist_dot_char + re.prog[pc].rep_min = 1 + re.prog[pc].rep_max = 1 + pc = pc + 1 + i = i + char_len + continue + } + + // OR branch + if char_len == 1 && pc > 0 && byte(char_tmp) == `|` { + // two consecutive ist_dot_char are an error + if pc > 0 && re.prog[pc - 1].ist == regex.ist_or_branch { + return regex.err_syntax_error, i + } + re.prog[pc].ist = u32(0) | regex.ist_or_branch + pc = pc + 1 + i = i + char_len + continue + } + + // Quantifiers + if char_len == 1 && pc > 0 { + mut quant_flag := true + match byte(char_tmp) { + `?` { + // println("q: ${char_tmp:c}") + re.prog[pc - 1].rep_min = 0 + re.prog[pc - 1].rep_max = 1 + } + `+` { + // println("q: ${char_tmp:c}") + re.prog[pc - 1].rep_min = 1 + re.prog[pc - 1].rep_max = regex.max_quantifier + } + `*` { + // println("q: ${char_tmp:c}") + re.prog[pc - 1].rep_min = 0 + re.prog[pc - 1].rep_max = regex.max_quantifier + } + `{` { + min, max, tmp, greedy := re.parse_quantifier(in_txt, i + 1) + // it is a quantifier + if min >= 0 { + // println("{$min,$max}\n str:[${in_txt[i..i+tmp]}] greedy:$greedy") + i = i + tmp + re.prog[pc - 1].rep_min = min + re.prog[pc - 1].rep_max = max + re.prog[pc - 1].greedy = greedy + continue + } else { + return min, i + } + // TODO: decide if the open bracket can be conform without the close bracket + /* + // no conform, parse as normal char + else { + quant_flag = false + } + */ + } + else { + quant_flag = false + } + } + + if quant_flag { + i = i + char_len + continue + } + } + + // IST_CHAR_CLASS_* + if char_len == 1 && pc >= 0 { + if byte(char_tmp) == `[` { + cc_index, tmp, cc_type := re.parse_char_class(in_txt, i + 1) + if cc_index >= 0 { + // println("index: $cc_index str:${in_txt[i..i+tmp]}") + i = i + tmp + re.prog[pc].ist = u32(0) | cc_type + re.prog[pc].cc_index = cc_index + re.prog[pc].rep_min = 1 + re.prog[pc].rep_max = 1 + pc = pc + 1 + continue + } + // cc_class vector memory full + else if cc_index < 0 { + return cc_index, i + } + } + } + + // ist_bsls_char + if char_len == 1 && pc >= 0 { + if byte(char_tmp) == `\\` { + bsls_index, tmp := re.parse_bsls(in_txt, i) + // println("index: $bsls_index str:${in_txt[i..i+tmp]}") + if bsls_index >= 0 { + i = i + tmp + re.prog[pc].ist = u32(0) | regex.ist_bsls_char + re.prog[pc].rep_min = 1 + re.prog[pc].rep_max = 1 + re.prog[pc].validator = regex.bsls_validator_array[bsls_index].validator + re.prog[pc].ch = regex.bsls_validator_array[bsls_index].ch + pc = pc + 1 + continue + } + // this is an escape char, skip the bsls and continue as a normal char + else if bsls_index == regex.no_match_found { + i += char_len + char_tmp, char_len = re.get_char(in_txt, i) + // continue as simple char + } + // if not an escape or a bsls char then it is an error (at least for now!) + else { + return bsls_index, i + tmp + } + } + } + + // ist_simple_char + re.prog[pc].ist = regex.ist_simple_char + re.prog[pc].ch = char_tmp + re.prog[pc].ch_len = byte(char_len) + re.prog[pc].rep_min = 1 + re.prog[pc].rep_max = 1 + // println("char: ${char_tmp:c}") + pc = pc + 1 + + i += char_len + } + + // add end of the program + re.prog[pc].ist = regex.ist_prog_end + re.prog_len = pc + + // check for unbalanced groups + if group_stack_index != -1 { + return regex.err_group_not_balanced, group_stack_txt_index[group_stack_index] + 1 + } + + // check for OR at the end of the program + if pc > 0 && re.prog[pc - 1].ist == regex.ist_or_branch { + return regex.err_syntax_error, in_txt.len + } + + // store the number of groups in the query + re.group_count = group_count + 1 + + //****************************************** + // Post processing + //****************************************** + + // + // manage ist_dot_char + // + + // find the checks for dot chars, if any... + mut pc1 := 0 + mut dot_char_count := 0 + mut last_dot_char_pc := -1 + for pc1 < pc { + if re.prog[pc1].ist == regex.ist_dot_char { + // println("Dot_char pc: $pc1") + last_dot_char_pc = pc1 + dot_char_count++ + mut pc2 := pc1 + 1 + for pc2 < pc { + if re.prog[pc2].ist == regex.ist_dot_char { + return regex.err_syntax_error, 0 + } + if re.prog[pc2].ist !in [rune(regex.ist_prog_end), regex.ist_group_end, + regex.ist_group_start, + ] { + // println("Next dot char check is PC: ${pc2}") + re.prog[pc1].dot_check_pc = pc2 + break + } + pc2++ + } + } + pc1++ + } + + // println("last_dot_char_pc: $last_dot_char_pc") + if last_dot_char_pc >= 0 { + pc1 = last_dot_char_pc + 1 + mut is_last_dot := true + for pc1 < pc { + if re.prog[pc1].ist !in [rune(regex.ist_prog_end), regex.ist_group_end] { + is_last_dot = false + break + } + pc1++ + } + if is_last_dot { + re.prog[last_dot_char_pc].last_dot_flag = true + } + } + + //****************************************** + + // OR branch + // a|b|cd + // d exit point + // a,b,c branches + // set the jump in the right places + pc1 = 0 + for pc1 < pc - 2 { + // println("Here $pc1 ${pc-2}") + // two consecutive OR are a syntax error + if re.prog[pc1 + 1].ist == regex.ist_or_branch + && re.prog[pc1 + 2].ist == regex.ist_or_branch { + return regex.err_syntax_error, i + } + + // manange a|b chains like a|(b)|c|d... + // standard solution + if re.prog[pc1].ist != regex.ist_or_branch && re.prog[pc1 + 1].ist == regex.ist_or_branch + && re.prog[pc1 + 2].ist != regex.ist_or_branch { + re.prog[pc1].next_is_or = true // set that the next token is an OR + re.prog[pc1 + 1].rep_min = pc1 + 2 // failed match jump + + // match jump, if an OR chain the next token will be an OR token + mut pc2 := pc1 + 2 + for pc2 < pc - 1 { + ist := re.prog[pc2].ist + if ist == regex.ist_group_start { + re.prog[pc1 + 1].rep_max = re.prog[pc2].goto_pc + 1 + break + } + if ist != regex.ist_or_branch { + re.prog[pc1 + 1].rep_max = pc2 + 1 + break + } + + pc2++ + } + // special case query of few chars, teh true can't go on the first instruction + if re.prog[pc1 + 1].rep_max == pc1 { + re.prog[pc1 + 1].rep_max = 3 + } + // println("Compile OR postproc. [$pc1,OR ${pc1+1},$pc2]") + pc1 = pc2 + continue + } + + pc1++ + } + + //****************************************** + // DEBUG PRINT REGEX GENERATED CODE + //****************************************** + if re.debug > 0 { + gc := re.get_code() + re.log_func(gc) + } + //****************************************** + + return regex.compile_ok, 0 +} + +// get_code return the compiled code as regex string, note: may be different from the source! +pub fn (re RE) get_code() string { + mut pc1 := 0 + mut res := strings.new_builder(re.cc.len * 2 * re.prog.len) + res.write_string('========================================\nv RegEx compiler v $regex.v_regex_version output:\n') + + mut stop_flag := false + + for pc1 <= re.prog.len { + tk := re.prog[pc1] + res.write_string('PC:${pc1:3d}') + + res.write_string(' ist: ') + res.write_string('${tk.ist:8x}'.replace(' ', '0')) + res.write_string(' ') + ist := tk.ist + if ist == regex.ist_bsls_char { + res.write_string('[\\${tk.ch:1c}] BSLS') + } else if ist == regex.ist_prog_end { + res.write_string('PROG_END') + stop_flag = true + } else if ist == regex.ist_or_branch { + res.write_string('OR ') + } else if ist == regex.ist_char_class_pos { + res.write_string('[${re.get_char_class(pc1)}] CHAR_CLASS_POS') + } else if ist == regex.ist_char_class_neg { + res.write_string('[^${re.get_char_class(pc1)}] CHAR_CLASS_NEG') + } else if ist == regex.ist_dot_char { + res.write_string('. DOT_CHAR nx chk: $tk.dot_check_pc') + if tk.last_dot_flag == true { + res.write_string(' last!') + } + } else if ist == regex.ist_group_start { + res.write_string('( GROUP_START #:$tk.group_id') + if tk.group_id == -1 { + res.write_string(' ?:') + } else { + for x in re.group_map.keys() { + if re.group_map[x] == (tk.group_id + 1) { + res.write_string(' ?P<$x>') + break + } + } + } + } else if ist == regex.ist_group_end { + res.write_string(') GROUP_END #:$tk.group_id') + } else if ist == regex.ist_simple_char { + res.write_string('[${tk.ch:1c}] query_ch') + } + + if tk.rep_max == regex.max_quantifier { + res.write_string(' {${tk.rep_min:3d},MAX}') + } else { + if ist == regex.ist_or_branch { + res.write_string(' if false go: ${tk.rep_min:3d} if true go: ${tk.rep_max:3d}') + } else { + res.write_string(' {${tk.rep_min:3d},${tk.rep_max:3d}}') + } + if tk.greedy == true { + res.write_string('?') + } + } + + res.write_string('\n') + if stop_flag { + break + } + pc1++ + } + + res.write_string('========================================\n') + return res.str() +} + +// get_query return a string with a reconstruction of the query starting from the regex program code +pub fn (re RE) get_query() string { + mut res := strings.new_builder(re.query.len * 2) + + if (re.flag & regex.f_ms) != 0 { + res.write_string('^') + } + + mut i := 0 + for i < re.prog.len && re.prog[i].ist != regex.ist_prog_end && re.prog[i].ist != 0 { + tk := unsafe { &re.prog[i] } + ch := tk.ist + + // GROUP start + if ch == regex.ist_group_start { + if re.debug == 0 { + res.write_string('(') + } else { + if tk.group_id == -1 { + res.write_string('(?:') // non capturing group + } else { + res.write_string('#${tk.group_id}(') + } + } + + for x in re.group_map.keys() { + if re.group_map[x] == (tk.group_id + 1) { + res.write_string('?P<$x>') + break + } + } + + i++ + continue + } + + // GROUP end + if ch == regex.ist_group_end { + res.write_string(')') + } + + // OR branch + if ch == regex.ist_or_branch { + res.write_string('|') + if re.debug > 0 { + res.write_string('{$tk.rep_min,$tk.rep_max}') + } + i++ + continue + } + + // char class + if ch == regex.ist_char_class_neg || ch == regex.ist_char_class_pos { + res.write_string('[') + if ch == regex.ist_char_class_neg { + res.write_string('^') + } + res.write_string('${re.get_char_class(i)}') + res.write_string(']') + } + + // bsls char + if ch == regex.ist_bsls_char { + res.write_string('\\${tk.ch:1c}') + } + + // ist_dot_char + if ch == regex.ist_dot_char { + res.write_string('.') + } + + // char alone + if ch == regex.ist_simple_char { + if byte(ch) in regex.bsls_escape_list { + res.write_string('\\') + } + res.write_string('${tk.ch:c}') + } + + // quantifier + if !(tk.rep_min == 1 && tk.rep_max == 1) { + if tk.rep_min == 0 && tk.rep_max == 1 { + res.write_string('?') + } else if tk.rep_min == 1 && tk.rep_max == regex.max_quantifier { + res.write_string('+') + } else if tk.rep_min == 0 && tk.rep_max == regex.max_quantifier { + res.write_string('*') + } else { + if tk.rep_max == regex.max_quantifier { + res.write_string('{$tk.rep_min,MAX}') + } else { + res.write_string('{$tk.rep_min,$tk.rep_max}') + } + if tk.greedy == true { + res.write_string('?') + } + } + } + i++ + } + if (re.flag & regex.f_me) != 0 { + res.write_string('$') + } + + return res.str() +} + +/****************************************************************************** +* +* Groups saving utilities +* +******************************************************************************/ +[direct_array_access] +fn (mut re RE) group_continuous_save(g_index int) { + if re.group_csave_flag == true { + // continuous save, save until we have space + + // init the first element as counter + if re.group_csave.len == 0 { + re.group_csave << 0 + } + + gi := g_index >> 1 + start := re.groups[g_index] + end := re.groups[g_index + 1] + + // check if we are simply increasing the size ot the found group + if re.group_csave.len >= 4 && gi == re.group_csave[re.group_csave.len - 3] + && start == re.group_csave[re.group_csave.len - 2] { + re.group_csave[re.group_csave.len - 1] = end + return + } + + // otherwise append a new group to the list + + // increment counter + re.group_csave[0]++ + // save the record + re.group_csave << (g_index >> 1) // group id + re.group_csave << re.groups[g_index] // start + re.group_csave << re.groups[g_index + 1] // end + } +} + +/****************************************************************************** +* +* Matching +* +******************************************************************************/ +enum Match_state { + start = 0 + stop + end + new_line + ist_load // load and execute instruction + ist_next // go to next instruction + ist_next_ks // go to next instruction without clenaning the state + ist_quant_p // match positive ,quantifier check + ist_quant_n // match negative, quantifier check + ist_quant_pg // match positive ,group quantifier check + ist_quant_ng // match negative ,group quantifier check +} + +fn state_str(s Match_state) string { + match s { + .start { return 'start' } + .stop { return 'stop' } + .end { return 'end' } + .new_line { return 'new line' } + .ist_load { return 'ist_load' } + .ist_next { return 'ist_next' } + .ist_next_ks { return 'ist_next_ks' } + .ist_quant_p { return 'ist_quant_p' } + .ist_quant_n { return 'ist_quant_n' } + .ist_quant_pg { return 'ist_quant_pg' } + .ist_quant_ng { return 'ist_quant_ng' } + } +} + +struct StateObj { +pub mut: + group_index int = -1 // group id used to know how many groups are open + match_flag bool // indicate if we are in a match condition + match_index int = -1 // index of the last match + first_match int = -1 // index of the first match + pc int = -1 // program counter + i int = -1 // source string index + char_len int // last char legth + last_dot_pc int = -1 // last dot chat pc +} + +[direct_array_access] +pub fn (mut re RE) match_base(in_txt &byte, in_txt_len int) (int, int) { + // result status + mut result := regex.no_match_found // function return + + mut ch := rune(0) // examinated char + mut char_len := 0 // utf8 examinated char len + mut m_state := Match_state.start // start point for the matcher FSM + mut src_end := false + mut last_fnd_pc := -1 + + mut state := StateObj{} // actual state + mut ist := rune(0) // actual instruction + mut l_ist := rune(0) // last matched instruction + + mut step_count := 0 // stats for debug + mut dbg_line := 0 // count debug line printed + + re.reset() + + if re.debug > 0 { + // print header + mut h_buf := strings.new_builder(32) + h_buf.write_string('flags: ') + h_buf.write_string('${re.flag:8x}'.replace(' ', '0')) + h_buf.write_string('\n') + sss := h_buf.str() + re.log_func(sss) + } + + for m_state != .end { + if state.pc >= 0 && state.pc < re.prog.len { + ist = re.prog[state.pc].ist + } else if state.pc >= re.prog.len { + // println("ERROR!! PC overflow!!") + return regex.err_internal_error, state.i + } + + //****************************************** + // DEBUG LOG + //****************************************** + if re.debug > 0 { + mut buf2 := strings.new_builder(re.cc.len + 128) + + // print all the instructions + + // end of the input text + if state.i >= in_txt_len { + buf2.write_string('# ${step_count:3d} END OF INPUT TEXT\n') + sss := buf2.str() + re.log_func(sss) + } else { + // print only the exe instruction + if (re.debug == 1 && m_state == .ist_load) || re.debug == 2 { + if ist == regex.ist_prog_end { + buf2.write_string('# ${step_count:3d} PROG_END\n') + } else if ist == 0 || m_state in [.start, .ist_next, .stop] { + buf2.write_string('# ${step_count:3d} s: ${state_str(m_state):12s} PC: NA\n') + } else { + ch, char_len = re.get_charb(in_txt, state.i) + + buf2.write_string('# ${step_count:3d} s: ${state_str(m_state):12s} PC: ${state.pc:3d}=>') + buf2.write_string('${ist:8x}'.replace(' ', '0')) + buf2.write_string(" i,ch,len:[${state.i:3d},'${utf8_str(ch)}',$char_len] f.m:[${state.first_match:3d},${state.match_index:3d}] ") + + if ist == regex.ist_simple_char { + buf2.write_string('query_ch: [${re.prog[state.pc].ch:1c}]') + } else { + if ist == regex.ist_bsls_char { + buf2.write_string('BSLS [\\${re.prog[state.pc].ch:1c}]') + } else if ist == regex.ist_prog_end { + buf2.write_string('PROG_END') + } else if ist == regex.ist_or_branch { + buf2.write_string('OR') + } else if ist == regex.ist_char_class_pos { + buf2.write_string('CHAR_CLASS_POS[${re.get_char_class(state.pc)}]') + } else if ist == regex.ist_char_class_neg { + buf2.write_string('CHAR_CLASS_NEG[${re.get_char_class(state.pc)}]') + } else if ist == regex.ist_dot_char { + buf2.write_string('DOT_CHAR') + } else if ist == regex.ist_group_start { + tmp_gi := re.prog[state.pc].group_id + tmp_gr := re.prog[re.prog[state.pc].goto_pc].group_rep + buf2.write_string('GROUP_START #:$tmp_gi rep:$tmp_gr ') + } else if ist == regex.ist_group_end { + buf2.write_string('GROUP_END #:${re.prog[state.pc].group_id} deep:$state.group_index') + } + } + if re.prog[state.pc].rep_max == regex.max_quantifier { + buf2.write_string('{${re.prog[state.pc].rep_min},MAX}:${re.prog[state.pc].rep}') + } else { + buf2.write_string('{${re.prog[state.pc].rep_min},${re.prog[state.pc].rep_max}}:${re.prog[state.pc].rep}') + } + if re.prog[state.pc].greedy == true { + buf2.write_string('?') + } + buf2.write_string(' (#$state.group_index)') + + if ist == regex.ist_dot_char { + buf2.write_string(' last!') + } + + buf2.write_string('\n') + } + sss2 := buf2.str() + re.log_func(sss2) + } + } + step_count++ + dbg_line++ + } + //****************************************** + + if ist == regex.ist_prog_end { + // println("HERE we end!") + break + } + + // we're out of text, manage it + if state.i >= in_txt_len || m_state == .new_line { + // println("Finished text!!") + src_end = true + + // manage groups + if state.group_index >= 0 && state.match_index >= 0 { + // println("End text with open groups!") + // close the groups + for state.group_index >= 0 { + tmp_pc := re.group_data[state.group_index] + re.prog[tmp_pc].group_rep++ + // println("Closing group $state.group_index {${re.prog[tmp_pc].rep_min},${re.prog[tmp_pc].rep_max}}:${re.prog[tmp_pc].group_rep}") + + if re.prog[tmp_pc].group_rep >= re.prog[tmp_pc].rep_min + && re.prog[tmp_pc].group_id >= 0 { + start_i := re.group_stack[state.group_index] + re.group_stack[state.group_index] = -1 + + // save group results + g_index := re.prog[tmp_pc].group_id * 2 + if start_i >= 0 { + re.groups[g_index] = start_i + } else { + re.groups[g_index] = 0 + } + // we have fished the text, we must manage out pf bound indexes + if state.i >= in_txt_len { + state.i = in_txt_len - 1 + } + re.groups[g_index + 1] = state.i + + if re.groups[g_index + 1] >= in_txt_len { + // println("clamp group on stop!") + re.groups[g_index + 1] = in_txt_len - 1 + } + + // continuous save, save until we have space + re.group_continuous_save(g_index) + } + state.group_index-- + } + } + + // the text is finished and the groups closed and we are the last group, ok exit + if ist == regex.ist_group_end && re.prog[state.pc + 1].ist == regex.ist_prog_end { + // println("Last group end") + return state.first_match, state.i + } + + if state.pc == -1 { + state.pc = last_fnd_pc + } + + // println("Finished text!!") + // println("Instruction: ${ist:08x} pc: $state.pc") + // println("min_rep: ${re.prog[state.pc].rep_min} max_rep: ${re.prog[state.pc].rep_max} rep: ${re.prog[state.pc].rep}") + + // program end + if ist == regex.ist_prog_end { + // println("Program end on end of text!") + return state.first_match, state.i + } + + // we are in a last dot_ char case + if l_ist == regex.ist_dot_char { + // println("***** We have a last dot_char") + // println("PC: ${state.pc} last_dot_flag:${re.prog[state.pc].last_dot_flag}") + // println("rep: ${re.prog[state.pc].group_rep} min: ${re.prog[state.pc].rep_min} max: ${re.prog[state.pc].rep_max}") + // println("first match: ${state.first_match}") + if re.prog[state.pc].last_dot_flag == true + && re.prog[state.pc].rep >= re.prog[state.pc].rep_min + && re.prog[state.pc].rep <= re.prog[state.pc].rep_max { + return state.first_match, state.i + } + // println("Not fitted!!") + } + + // m_state = .end + // break + return regex.no_match_found, 0 + } + + // starting and init + if m_state == .start { + state.pc = -1 + state.i = 0 + m_state = .ist_next + continue + } + // ist_next, next instruction reseting its state + else if m_state == .ist_next { + state.pc = state.pc + 1 + re.prog[state.pc].reset() + // check if we are in the program bounds + if state.pc < 0 || state.pc > re.prog.len { + // println("ERROR!! PC overflow!!") + return regex.err_internal_error, state.i + } + m_state = .ist_load + continue + } + // ist_next_ks, next instruction keeping its state + else if m_state == .ist_next_ks { + state.pc = state.pc + 1 + // check if we are in the program bounds + if state.pc < 0 || state.pc > re.prog.len { + // println("ERROR!! PC overflow!!") + return regex.err_internal_error, state.i + } + m_state = .ist_load + continue + } + + // load the char + ch, char_len = re.get_charb(in_txt, state.i) + + // check new line if flag f_nl enabled + if (re.flag & regex.f_nl) != 0 && char_len == 1 && byte(ch) in regex.new_line_list { + m_state = .new_line + continue + } + // check if stop + else if m_state == .stop { + // we are in search mode, don't exit until the end + if ((re.flag & regex.f_src) != 0) && (ist != regex.ist_prog_end) { + last_fnd_pc = state.pc + state.pc = -1 + state.i += char_len + + m_state = .ist_next + re.reset_src() + state.match_index = -1 + state.first_match = -1 + + // reset state list + re.reset() + + continue + } + + if ist == regex.ist_prog_end { + return state.first_match, state.i + } + + // manage here dot char + + if re.state_list.len > 0 { + // println("Here we are, with stop: state buffer: [${re.state_list.len}]") + state = re.state_list.pop() + + state.match_flag = true + l_ist = u32(regex.ist_dot_char) + + if state.first_match < 0 { + state.first_match = state.i + } + state.match_index = state.i + re.prog[state.pc].rep++ // increase repetitions + + state.i += char_len + m_state = .ist_quant_p + continue + } + + // exit on no match + return result, 0 + } + // ist_load + else if m_state == .ist_load { + // program end + if ist == regex.ist_prog_end { + // if we are in match exit well + + if state.group_index >= 0 && state.match_index >= 0 { + state.group_index = -1 + } + + m_state = .stop + continue + } + // check GROUP start, no quantifier is checkd for this token!! + else if ist == regex.ist_group_start { + state.group_index++ + re.group_data[state.group_index] = re.prog[state.pc].goto_pc // save where is ist_group_end, we will use it for escape + re.group_stack[state.group_index] = state.i // index where we start to manage + // println("group_index $state.group_index rep ${re.prog[re.prog[state.pc].goto_pc].group_rep}") + + m_state = .ist_next + continue + } + // check GROUP end + else if ist == regex.ist_group_end { + // we are in matching streak + // println("Group END!! last ist: ${l_ist:08x}") + if state.match_index >= 0 { + // restore txt index stack and save the group data + + // println("g.id: ${re.prog[state.pc].group_id} group_index: ${state.group_index}") + if state.group_index >= 0 && re.prog[state.pc].group_id >= 0 { + start_i := re.group_stack[state.group_index] + + // save group results + g_index := re.prog[state.pc].group_id * 2 + + if start_i >= 0 { + re.groups[g_index] = start_i + } else { + re.groups[g_index] = 0 + } + + re.groups[g_index + 1] = state.i + + if g_index > 0 && re.groups[g_index] <= re.groups[g_index - 1] { + re.groups[g_index] = re.groups[g_index - 1] + } + + if re.groups[g_index + 1] >= in_txt_len { + // println("clamp group!") + re.groups[g_index + 1] = in_txt_len - 1 + } + + // println("GROUP ${re.prog[state.pc].group_id} END [${re.groups[g_index]}, ${re.groups[g_index+1]}] i: $state.i in_txt_len: $in_txt_len") + + // continuous save, save until we have space + re.group_continuous_save(g_index) + } + + re.prog[state.pc].group_rep++ // increase repetitions + // println("GROUP $group_index END ${re.prog[state.pc].group_rep}") + m_state = .ist_quant_pg + continue + } + + m_state = .ist_quant_ng + continue + } + // check OR + else if ist == regex.ist_or_branch { + if state.match_index >= 0 { + state.pc = re.prog[state.pc].rep_max + // println("ist_or_branch True pc: $state.pc") + } else { + state.pc = re.prog[state.pc].rep_min + // println("ist_or_branch False pc: $state.pc") + } + re.prog[state.pc].reset() + m_state = .ist_load + continue + } + // check ist_dot_char + else if ist == regex.ist_dot_char { + // println("ist_dot_char rep: ${re.prog[state.pc].rep}") + + // check next token to be false + mut next_check_flag := false + + // if we are done with max go on dot char are dedicated case!! + if re.prog[state.pc].rep >= re.prog[state.pc].rep_max { + re.state_list.pop() + m_state = .ist_next + continue + } + + if re.prog[state.pc].dot_check_pc >= 0 + && re.prog[state.pc].rep >= re.prog[state.pc].rep_min { + // load the char + // ch_t, _ := re.get_charb(in_txt, state.i+char_len) + ch_t := ch + chk_pc := re.prog[state.pc].dot_check_pc + + // simple char + if re.prog[chk_pc].ist == regex.ist_simple_char { + if re.prog[chk_pc].ch == ch_t { + next_check_flag = true + } + // println("Check [ist_simple_char] [${re.prog[chk_pc].ch}]==[${ch_t:c}] => $next_check_flag") + } + // char char_class + else if re.prog[chk_pc].ist == regex.ist_char_class_pos + || re.prog[chk_pc].ist == regex.ist_char_class_neg { + mut cc_neg := false + if re.prog[chk_pc].ist == regex.ist_char_class_neg { + cc_neg = true + } + mut cc_res := re.check_char_class(chk_pc, ch_t) + + if cc_neg { + cc_res = !cc_res + } + next_check_flag = cc_res + // println("Check [ist_char_class] => $next_check_flag") + } + // check bsls + else if re.prog[chk_pc].ist == regex.ist_bsls_char { + next_check_flag = re.prog[chk_pc].validator(byte(ch_t)) + // println("Check [ist_bsls_char] => $next_check_flag") + } + } + + // check if we must continue or pass to the next IST + if next_check_flag == true && re.prog[state.pc + 1].ist != regex.ist_prog_end { + // println("save the state!!") + mut dot_state := StateObj{ + group_index: state.group_index + match_flag: state.match_flag + match_index: state.match_index + first_match: state.first_match + pc: state.pc + i: state.i + char_len + char_len: char_len + last_dot_pc: state.pc + } + // if we are mananging a .* stay on the same char on return + if re.prog[state.pc].rep_min == 0 { + dot_state.i -= char_len + } + + re.state_list << dot_state + + m_state = .ist_quant_n + // println("dot_char stack len: ${re.state_list.len}") + continue + } + + state.match_flag = true + l_ist = u32(regex.ist_dot_char) + + if state.first_match < 0 { + state.first_match = state.i + } + state.match_index = state.i + re.prog[state.pc].rep++ // increase repetitions + + state.i += char_len + m_state = .ist_quant_p + continue + } + // char class IST + else if ist == regex.ist_char_class_pos || ist == regex.ist_char_class_neg { + state.match_flag = false + mut cc_neg := false + + if ist == regex.ist_char_class_neg { + cc_neg = true + } + mut cc_res := re.check_char_class(state.pc, ch) + + if cc_neg { + cc_res = !cc_res + } + + if cc_res { + state.match_flag = true + l_ist = u32(regex.ist_char_class_pos) + + if state.first_match < 0 { + state.first_match = state.i + } + + state.match_index = state.i + + re.prog[state.pc].rep++ // increase repetitions + state.i += char_len // next char + m_state = .ist_quant_p + continue + } + m_state = .ist_quant_n + continue + } + // check bsls + else if ist == regex.ist_bsls_char { + state.match_flag = false + tmp_res := re.prog[state.pc].validator(byte(ch)) + // println("BSLS in_ch: ${ch:c} res: $tmp_res") + if tmp_res { + state.match_flag = true + l_ist = u32(regex.ist_bsls_char) + + if state.first_match < 0 { + state.first_match = state.i + } + + state.match_index = state.i + + re.prog[state.pc].rep++ // increase repetitions + state.i += char_len // next char + m_state = .ist_quant_p + continue + } + m_state = .ist_quant_n + continue + } + // simple char IST + else if ist == regex.ist_simple_char { + // println("ist_simple_char") + state.match_flag = false + + if re.prog[state.pc].ch == ch { + state.match_flag = true + l_ist = regex.ist_simple_char + + if state.first_match < 0 { + state.first_match = state.i + } + // println("state.match_index: ${state.match_index}") + state.match_index = state.i + + re.prog[state.pc].rep++ // increase repetitions + state.i += char_len // next char + m_state = .ist_quant_p + continue + } + m_state = .ist_quant_n + continue + } + // UNREACHABLE + // println("PANIC2!! state: $m_state") + return regex.err_internal_error, state.i + } + /*********************************** + * Quantifier management + ***********************************/ + // ist_quant_ng => quantifier negative test on group + else if m_state == .ist_quant_ng { + // we are finished here + if state.group_index < 0 { + // println("Early stop!") + result = regex.no_match_found + m_state = .stop + continue + } + + tmp_pc := re.group_data[state.group_index] // PC to the end of the group token + rep := re.prog[tmp_pc].group_rep // use a temp variable + re.prog[tmp_pc].group_rep = 0 // clear the repetitions + + // println(".ist_quant_ng group_pc_end: $tmp_pc rep: $rep") + + if rep >= re.prog[tmp_pc].rep_min { + // println("ist_quant_ng GROUP CLOSED OK group_index: $state.group_index") + + state.i = re.group_stack[state.group_index] + state.pc = tmp_pc + state.group_index-- + m_state = .ist_next + continue + } else if re.prog[tmp_pc].next_is_or { + // println("ist_quant_ng OR Negative branch") + + state.i = re.group_stack[state.group_index] + state.pc = re.prog[tmp_pc + 1].rep_min - 1 + state.group_index-- + m_state = .ist_next + continue + } else if rep > 0 && rep < re.prog[tmp_pc].rep_min { + // println("ist_quant_ng UNDER THE MINIMUM g.i: $state.group_index") + + // check if we are inside a group, if yes exit from the nested groups + if state.group_index > 0 { + state.group_index-- + state.pc = tmp_pc + m_state = .ist_quant_ng //.ist_next + continue + } + + if state.group_index == 0 { + state.group_index-- + state.pc = tmp_pc // TEST + m_state = .ist_next + continue + } + + result = regex.no_match_found + m_state = .stop + continue + } else if rep == 0 && rep < re.prog[tmp_pc].rep_min { + // println("ist_quant_ng c_zero UNDER THE MINIMUM g.i: $state.group_index") + + if state.group_index > 0 { + state.group_index-- + state.pc = tmp_pc + m_state = .ist_quant_ng //.ist_next + continue + } + + result = regex.no_match_found + m_state = .stop + continue + } + + // println("DO NOT STAY HERE!! {${re.prog[tmp_pc].rep_min},${re.prog[tmp_pc].rep_max}}:$rep") + // UNREACHABLE + return regex.err_internal_error, state.i + } + // ist_quant_pg => quantifier positive test on group + else if m_state == .ist_quant_pg { + // println(".ist_quant_pg") + mut tmp_pc := state.pc + if state.group_index >= 0 { + tmp_pc = re.group_data[state.group_index] + } + + rep := re.prog[tmp_pc].group_rep + + if rep < re.prog[tmp_pc].rep_min { + // println("ist_quant_pg UNDER RANGE") + state.pc = re.prog[tmp_pc].goto_pc + m_state = .ist_next + continue + } else if rep == re.prog[tmp_pc].rep_max { + // println("ist_quant_pg MAX RANGE") + re.prog[tmp_pc].group_rep = 0 // clear the repetitions + state.group_index-- + m_state = .ist_next + + continue + } else if rep >= re.prog[tmp_pc].rep_min { + // println("ist_quant_pg IN RANGE group_index:$state.group_index") + + // check greedy flag, if true exit on minimum + if re.prog[tmp_pc].greedy == true { + re.prog[tmp_pc].group_rep = 0 // clear the repetitions + state.group_index-- + m_state = .ist_next + continue + } + + state.pc = re.prog[tmp_pc].goto_pc - 1 + state.group_index-- + m_state = .ist_next + continue + } + + // UNREACHABLE + // println("PANIC3!! state: $m_state") + return regex.err_internal_error, state.i + } + // ist_quant_n => quantifier negative test on token + else if m_state == .ist_quant_n { + rep := re.prog[state.pc].rep + // println("Here!! PC $state.pc is_next_or: ${re.prog[state.pc].next_is_or}") + + // zero quantifier * or ? + if rep == 0 && re.prog[state.pc].rep_min == 0 { + // println("ist_quant_n c_zero RANGE MIN") + m_state = .ist_next // go to next ist + continue + } + // match + or * + else if rep >= re.prog[state.pc].rep_min { + // println("ist_quant_n MATCH RANGE") + m_state = .ist_next + continue + } + + // check the OR if present + if re.prog[state.pc].next_is_or { + // println("OR present on failing") + state.match_index = -1 + m_state = .ist_next + continue + } + + // we are in a group manage no match from here + if state.group_index >= 0 { + // println("ist_quant_n FAILED insied a GROUP group_index:$state.group_index") + m_state = .ist_quant_ng + continue + } + + // no other options + // println("ist_quant_n no_match_found") + result = regex.no_match_found + m_state = .stop + continue + // return no_match_found, 0 + } + // ist_quant_p => quantifier positive test on token + else if m_state == .ist_quant_p { + // exit on first match + if (re.flag & regex.f_efm) != 0 { + return state.i, state.i + 1 + } + + rep := re.prog[state.pc].rep + + // under range + if rep > 0 && rep < re.prog[state.pc].rep_min { + // println("ist_quant_p UNDER RANGE") + m_state = .ist_load // continue the loop + continue + } + // range ok, continue loop + else if rep >= re.prog[state.pc].rep_min && rep < re.prog[state.pc].rep_max { + // println("ist_quant_p IN RANGE") + + // check greedy flag, if true exit on minimum + if re.prog[state.pc].greedy == true { + m_state = .ist_next + continue + } + m_state = .ist_load + continue + } + // max reached + else if rep == re.prog[state.pc].rep_max { + // println("ist_quant_p MAX RANGE") + m_state = .ist_next + continue + } + } + // UNREACHABLE + // println("PANIC4!! state: $m_state") + return regex.err_internal_error, state.i + } + + // println("Check end of text!") + // Check the results + if state.match_index >= 0 { + if state.group_index < 0 { + if re.prog[state.pc].ist == regex.ist_prog_end { + // println("program ended!!") + + if (re.flag & regex.f_src) != 0 { + // println("find return") + return state.first_match, state.i + } else { + // println("Here!!") + return 0, state.i + } + } + + // println("No Group here, natural end [$state.first_match,$state.i] state: ${state_str(m_state)} ist: $ist pgr_end: $re.prog.len") + + if re.prog[state.pc + 1].ist == regex.ist_prog_end + || re.prog[state.pc].ist == regex.ist_prog_end { + rep := re.prog[state.pc].rep + // println("rep: $rep re.prog[state.pc].rep_min: ${re.prog[state.pc].rep_min} re.prog[state.pc].rep_max: ${re.prog[state.pc].rep_max}") + if rep >= re.prog[state.pc].rep_min && rep <= re.prog[state.pc].rep_max { + return state.first_match, state.i + } + // println("Program not finished! ") + return regex.no_match_found, 0 + } + if src_end { + // println("program end") + return state.first_match, state.i + } + // print("No match found!!") + return regex.no_match_found, 0 + } else { + // println("Group match! OK") + // println("first_match: $state.first_match, i: $state.i") + + // println("Skip last group") + return state.first_match, state.i + // return state.first_match,re.group_stack[state.group_index--] + } + } + // println("no_match_found, natural end") + return regex.no_match_found, 0 +} diff --git a/v_windows/v/vlib/regex/regex_opt.v b/v_windows/v/vlib/regex/regex_opt.v new file mode 100644 index 0000000..2aaa88f --- /dev/null +++ b/v_windows/v/vlib/regex/regex_opt.v @@ -0,0 +1,53 @@ +module regex + +import strings + +// compile_opt compile RE pattern string +pub fn (mut re RE) compile_opt(pattern string) ? { + re_err, err_pos := re.impl_compile(pattern) + + if re_err != compile_ok { + mut err_msg := strings.new_builder(300) + err_msg.write_string('\nquery: $pattern\n') + line := '-'.repeat(err_pos) + err_msg.write_string('err : $line^\n') + err_str := re.get_parse_error_string(re_err) + err_msg.write_string('ERROR: $err_str\n') + return error_with_code(err_msg.str(), re_err) + } +} + +// new_regex create a RE of small size, usually sufficient for ordinary use +pub fn new() RE { + // init regex + mut re := RE{} + re.prog = []Token{len: max_code_len + 1} // max program length, can not be longer then the pattern + re.cc = []CharClass{len: max_code_len} // can not be more char class the the length of the pattern + re.group_csave_flag = false // enable continuos group saving + re.group_max_nested = 128 // set max 128 group nested + re.group_max = max_code_len >> 1 // we can't have more groups than the half of the pattern legth + + re.group_stack = []int{len: re.group_max, init: -1} + re.group_data = []int{len: re.group_max, init: -1} + + return re +} + +// regex_opt create new RE object from RE pattern string +pub fn regex_opt(pattern string) ?RE { + // init regex + mut re := RE{} + re.prog = []Token{len: pattern.len + 1} // max program length, can not be longer then the pattern + re.cc = []CharClass{len: pattern.len} // can not be more char class the the length of the pattern + re.group_csave_flag = false // enable continuos group saving + re.group_max_nested = 128 // set max 128 group nested + re.group_max = pattern.len >> 1 // we can't have more groups than the half of the pattern legth + + re.group_stack = []int{len: re.group_max, init: -1} + re.group_data = []int{len: re.group_max, init: -1} + + // compile the pattern + re.compile_opt(pattern) ? + + return re +} diff --git a/v_windows/v/vlib/regex/regex_test.v b/v_windows/v/vlib/regex/regex_test.v new file mode 100644 index 0000000..aa6bf79 --- /dev/null +++ b/v_windows/v/vlib/regex/regex_test.v @@ -0,0 +1,608 @@ +import regex +import rand + +/****************************************************************************** +* +* Test section +* +******************************************************************************/ +struct TestItem { + src string + q string + s int + e int +} + +const( +match_test_suite = [ + // minus in CC + TestItem{"d.def",r"abc.\.[\w\-]{,100}",-1,0}, + TestItem{"abc12345.asd",r"abc.\.[\w\-]{,100}",-1,0}, + TestItem{"abca.exe",r"abc.\.[\w\-]{,100}",0,8}, + TestItem{"abc2.exe-test_12",r"abc.\.[\w\-]{,100}",0,16}, + TestItem{"abcdefGHK",r"[a-f]+\A+",0,9}, + TestItem{"ab-cd-efGHK",r"[a-f\-g]+\A+",0,11}, + + // base OR + TestItem{"a",r"a|b",0,1}, + TestItem{"a",r"b|a",0,1}, + TestItem{"b",r"a|b",0,1}, + TestItem{"b",r"b|a",0,1}, + TestItem{"c",r"b|a",-1,0}, + + // test base + TestItem{"[ciao]",r"(.)ciao(.)",0,6}, + TestItem{"[ciao] da me",r"(.)ciao(.)",0,6}, + + // positive + TestItem{"this is a good.",r"this",0,4}, + TestItem{"this is a good.",r"good",10,14}, + TestItem{"this is a good.",r"go+d",10,14}, + TestItem{"this is a good.",r"g[oae]+d",10,14}, + TestItem{"this is a goed.",r"g[oae]+d",10,14}, + TestItem{"this is a good.",r"g[oae]*d",10,14}, + TestItem{"this is a goaezd.",r"g[ea-cm-z]*d",10,16}, + TestItem{"this is a good.",r"this (\w+) a",0,9}, + TestItem{"this is a good.",r"this( \w+){2} g",0,11}, + TestItem{"this is a good.",r"( ?\w+){,1}",0,4}, + TestItem{"this is a good.",r"( ?\w+)+",0,14}, + TestItem{"this is a good.",r"this( \w+)+",0,14}, + TestItem{"this is a good sample.",r"( ?\w+){,2}",0,7}, + TestItem{"this is a good sample.",r"( ?\w+){,3}",0,9}, + TestItem{"this is a good sample.",r"( ?\w+){,4}",0,14}, + TestItem{"this is a good sample.",r"( ?\w+){,5}",0,21}, + TestItem{"this is a good sample.",r"( ?\w+){2,3}",0,9}, + TestItem{"this is a good sample.",r"(\s?\w+){2,3}",0,9}, + TestItem{"this these those.",r"(th[ei]se?\s|\.)+",0,11}, + TestItem{"this these those ",r"(th[eio]se? ?)+",0,17}, + TestItem{"this these those ",r"(th[eio]se? )+",0,17}, + TestItem{"this,these,those. over",r"(th[eio]se?[,. ])+",0,17}, + TestItem{"soday,this,these,those. over",r".+(th[eio]se?[,. ])+",0,23}, + + TestItem{"cpapaz",r"(c(pa)+z)",0,6}, + TestItem{"this is a cpapaz over",r"(c(pa)+z)",10,16}, + TestItem{"this is a cpapapez over",r"(c(p[ae])+z)",10,18}, + TestItem{"test@post.pip.com",r"[a-z0-9_]+@([a-z0-9_]+\.?)+",0,17}, + TestItem{"test1@post.pip.com, pera",r"[\w]+@([\w]+\.)+\w+",0,18}, + TestItem{"pippo@pera.com ",r"[a-z0-9_]+@([a-z0-9_]+\.?)+",0,14}, + TestItem{"adce aabe",r"(a(ab)+)|(a(dc)+)e",0,4}, + TestItem{"zadce aabe",r"(a(ab)+)|(a(dc)+)e",1,5}, + TestItem{"abbz accz addz.",r"c|(d)|e|(ab+)",0,3}, + TestItem{"this those these ciao",r"((t[hieo]+se?)\s*)+",0,17}, + TestItem{"this ciao",r"((t[hieo]+se?)\s*)+",0,5}, + TestItem{"this cpapaz adce aabe",r"(c(pa)+z)(\s[\a]+){2}",5,21}, + TestItem{"1234this cpapaz adce aabe",r"(c(pa)+z)(\s[\a]+){2}$",9,25}, + TestItem{"this cpapaz adce aabe third",r"(c(pa)+z)(\s[\a]+){2}",5,21}, + TestItem{"123cpapaz ole. pippo",r"(c(pa)+z)(\s+\a+[\.,]?)+",3,20}, + + TestItem{"this is a good sample.",r".*i(\w)+",0,4}, + TestItem{"soday,this,these,those. over",r".*,(th[eio]se?[,. ])+",0,23}, + TestItem{"soday,this,these,thesa.thesi over",r".*,(th[ei]se?[,. ])+(thes[ai][,. ])+",0,29}, + TestItem{"cpapaz ole. pippo,",r".*(c(pa)+z)(\s+\a+[\.,]?)+",0,18}, + TestItem{"cpapaz ole. pippo",r"(c(pa)+z)(\s+\a+[\.,]?)+",0,17}, + TestItem{"cpapaz ole. pippo, 852",r".*(c(pa)+z)(\s+\a+[\.,]?)+",0,18}, + TestItem{"123cpapaz ole. pippo",r".*(c(pa)+z)(\s+\a+[\.,]?)+",0,20}, + TestItem{"...cpapaz ole. pippo",r".*(c(pa)+z)(\s+\a+[\.,]?)+",0,20}, + + TestItem{"cpapaz ole. pippo,",r".*c.+ole.*pi",0,14}, + TestItem{"cpapaz ole. pipipo,",r".*c.+ole.*p([ip])+o",0,18}, + TestItem{"cpapaz ole. pipipo",r"^.*c.+ol?e.*p([ip])+o$",0,18}, + TestItem{"abbb",r"ab{2,3}?",0,3}, + TestItem{" pippo pera",r"\s(.*)pe(.*)",0,11}, + TestItem{" abb",r"\s(.*)",0,4}, + + TestItem{"/home/us_er/pippo/info-01.txt", r"(/?[-\w_]+)*\.txt$",0,29} + + // negative + TestItem{"zthis ciao",r"((t[hieo]+se?)\s*)+",-1,0}, + TestItem{"this is a good.",r"thes",-1,0}, + TestItem{"test1post.pip.com, pera",r"[\w]+@([\w]+\.)+\w+",-1,0}, + TestItem{"this cpapaz adce",r"(c(pa)+z)(\s[\a]+){2}",-1,0}, + TestItem{"this cpapaz adce aabe third",r"(c(pa)+z)(\s[\a]+){2}$",-1,0}, + TestItem{"1234this cpapaz adce aabe ter",r"(c(pa)+z)(\s[\a]+){2}$",-1,0}, + TestItem{"cpapaz ole. pipipo,",r"^.*c.+ol?e.*p([ip])+o$",-1,0}, + TestItem{"/home/us_er/pippo/info-01.jpeg", r"(/?[-\w_]+)*\.txt$",-1,0} + + // check unicode + TestItem{"this is a Ⅰ Ⅱ Ⅲ Ⅳ Ⅴ Ⅵ test",r".*a [Ⅰ-Ⅵ ]+",0,34}, + TestItem{"123Ⅰ Ⅱ Ⅲ Ⅳ Ⅴ Ⅵ test",r"[Ⅰ-Ⅴ\s]+",3,23}, + + // new edge cases + TestItem{"12345678", r"[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]",-1,0}, + TestItem{"12345678", r"[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]",0,8}, + TestItem{"123456789", r"^[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]$",0,9} + TestItem{"12345678", r"^\d{8}$",0,8}, + TestItem{"12345678", r"^\d{7}$",-1,0}, + TestItem{"12345678", r"^\d{9}$",-1,0}, + + TestItem{"eth", r"(oth)|(eth)",0,3}, + TestItem{"et", r"(oth)|(eth)",-1,0}, + TestItem{"et", r".*(oth)|(eth)",-1,0}, + TestItem{"peoth", r".*(ith)|(eth)",-1,0}, + + TestItem{"poth", r"(eth)|(oth)",1,4}, + TestItem{"poth", r"(oth)|(eth)",1,4}, + TestItem{"poth", r".(oth)|(eth)$",0,4}, + TestItem{"poth", r"^.(oth)|(eth)$",0,4}, + TestItem{"poth", r"^\w+$",0,4}, + + // test dot_char + TestItem{"8-11 l: qllllqllklhlvtl", r"^(\d+)-(\d+) ([a-z]): (.*)$",0,23}, + TestItem{"accccb deer", r"^a(.*)b d(.+)r",0,11}, + TestItem{"accccb deer", r"^a(.*)b d(.+)",0,11}, + TestItem{"accccb deer", r"^(.*)$",0,11}, + TestItem{"accccb deer", r"^a(.*)b d(.+)p",-1,0}, + TestItem{"##.#....#.##.####...#.##", r".{18}[.#]",0,19}, + TestItem{"#.#......##.#..#..##........##....###...##...######.......#.....#..#......#...#........###.#..#.", r'.*#[.#]{4}##[.#]{4}##[.#]{4}###',0,49}, + + // test bcksls chars + TestItem{"[ an s. s! ]( wi4ki:something )", r"\[.*\]\( *(\w*:*\w+) *\)",0,31}, + TestItem{"[ an s. s! ](wiki:something)", r"\[.*\]\( *(\w*:*\w+) *\)",0,28}, + TestItem{"p_p", r"\w+",0,3}, + TestItem{"p_é", r"\w+",0,2}, + + // Crazywulf tests (?:^|[()])(\d+)(*)(\d+)(?:$|[()]) + TestItem{"1*1", r"(\d+)([*])(\d+)",0,3}, + TestItem{"+1*1", r"^(\d+)([*])(\d+)",-1,0}, + TestItem{"*1*1", r"(?:^|[*])(\d+)([*])(\d+)",0,4}, + TestItem{"*1*1", r"(?:^|[*()])(\d+)([*])(\d+)",0,4}, + TestItem{")1*1", r"(?:^|[*()])(\d+)([*])(\d+)",0,4}, + TestItem{"(1*1", r"(?:^|[*()])(\d+)([*])(\d+)",0,4}, + TestItem{"*1*1(", r"(?:^|[*()])(\d+)([*])(\d+)(?:$|[*()])",0,5}, + TestItem{" 1*1(", r"(?:^|[*()])(\d+)([*])(\d+)(?:$|[*()])",-1,0}, + TestItem{"1*1 ", r"(?:^|[*()])(\d+)([*])(\d+)(?:$|[*()])",-1,0}, + + // particular groups + TestItem{"ababababac", r"ab(.*)(ac)",0,10}, + +] +) + +struct TestItemRe { + src string + q string + rep string + r string +} +const ( +match_test_suite_replace = [ + // replace tests + TestItemRe{ + "oggi pibao è andato a casa di pbababao ed ha trovato pibabababao", + r"(pi?(ba)+o)", + "CIAO", + "oggi CIAO è andato a casa di CIAO ed ha trovato CIAO" + }, + TestItemRe{ + "Today is a good day and tomorrow will be for sure.", + r"[Tt]o\w+", + "CIAO", + "CIAO is a good day and CIAO will be for sure." + }, + TestItemRe{ + "Today is a good day and tomorrow will be for sure.", + r"(a\w) ", + r"[\0] ", + "Tod[ay] is a good d[ay] and tomorrow will be for sure." + }, + TestItemRe{ + "Today is a good day and tomorrow will be for sure.", + r"(a\w) ", + r"[\0_\0] ", + "Tod[ay_ay] is a good d[ay_ay] and tomorrow will be for sure." + }, + TestItemRe{ + "Today is a good day and tomorrow will be for sure.", + r"(a\w) ", + r"[\0\1] ", + "Tod[ay] is a good d[ay] and tomorrow will be for sure." + }, +] + +match_test_suite_replace_simple = [ + // replace tests + TestItemRe{ + "oggi pibao è andato a casa di pbababao ed ha trovato pibabababao", + r"(pi?(ba)+o)", + "CIAO", + "oggi CIAO è andato a casa di CIAO ed ha trovato CIAO" + }, + TestItemRe{ + "Today is a good day and tomorrow will be for sure.", + r"[Tt]o\w+", + "CIAO", + "CIAO is a good day and CIAO will be for sure." + }, +] +) + +struct TestItemCGroup { + src string + q string + s int + e int + cg []int // [number of items (3*# item), id_group_0, start_0, end_0, id_group_1, start1, start2,... ] + cgn map[string]int +} +const ( +cgroups_test_suite = [ + TestItemCGroup{ + "http://www.ciao.mondo/hello/pippo12_/pera.html", + r"(?P<format>https?)|(?:ftps?)://(?P<token>[\w_]+[\.|/])+",0,42, + [7, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42], + {'format':int(0),'token':1} + }, + TestItemCGroup{ + "http://www.ciao.mondo/hello/pippo12_/pera.html", + r"(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+",0,46, + [8, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42, 1, 42, 46] + //[8, 0, 0, 4, 1, 7, 10, 1, 11, 15, 1, 16, 21, 1, 22, 27, 1, 28, 36, 1, 37, 41, 1, 42, 46], + {'format':int(0),'token':1} + }, + TestItemCGroup{ + "http://www.ciao.mondo/hello/pippo12_/pera.html", + r"(?P<format>https?)|(?P<format>ftps?)://([\w_]+\.)+",0,16, + [3, 0, 0, 4, 1, 7, 11, 1, 11, 16], + {'format':int(0)} + }, + TestItemCGroup{ + "acc +13 pippo", + r"(\w+)\s(.)([0-9]+) \w+",0,13, + [0, 3, 4, 5, 5, 7], + map[string]int{} + }, + TestItemCGroup{ + "acc +13", + r"(\w+)\s(.)([0-9]+)",0,7, + [0, 3, 4, 5, 5, 7], + map[string]int{} + }, + TestItemCGroup{ + "ababababac", + r"ab(.*)(ac)",0,10, + [2, 8, 8, 10], + map[string]int{} + }, +] +) + + +struct Test_find_all { + src string + q string + res []int // [0,4,5,6...] + res_str []string // ['find0','find1'...] +} +const ( +find_all_test_suite = [ + Test_find_all{ + "abcd 1234 efgh 1234 ghkl1234 ab34546df", + r"\d+", + [5, 9, 15, 19, 24, 28, 31, 36], + ['1234', '1234', '1234', '34546'] + }, + Test_find_all{ + "abcd 1234 efgh 1234 ghkl1234 ab34546df", + r"\a+", + [0, 4, 10, 14, 20, 24, 29, 31, 36, 38], + ['abcd', 'efgh', 'ghkl', 'ab', 'df'] + }, + Test_find_all{ + "oggi pippo è andato a casa di pluto ed ha trovato pippo", + r"p[iplut]+o", + [5, 10, 31, 36, 51, 56], + ['pippo', 'pluto', 'pippo'] + }, + Test_find_all{ + "oggi pibao è andato a casa di pbababao ed ha trovato pibabababao", + r"(pi?(ba)+o)", + [5, 10, 31, 39, 54, 65], + ['pibao', 'pbababao', 'pibabababao'] + }, + Test_find_all{ + "Today is a good day and tomorrow will be for sure.", + r"[Tt]o\w+", + [0, 5, 24, 32], + ['Today', 'tomorrow'] + }, + Test_find_all{ + "pera\nurl = https://github.com/dario/pig.html\npippo", + r"url *= *https?://[\w./]+", + [5, 44], + ['url = https://github.com/dario/pig.html'] + }, + Test_find_all{ + "pera\nurl = https://github.com/dario/pig.html\npippo", + r"url *= *https?://.*"+'\n', + [5, 45], + ['url = https://github.com/dario/pig.html\n'] + }, + Test_find_all{ + "#.#......##.#..#..##........##....###...##...######.......#.....#..#......#...#........###.#..#.", + r"#[.#]{4}##[.#]{4}##[.#]{4}###", + [29, 49], + ['#....###...##...####'] + }, + Test_find_all{ + "#.#......##.#..#..##........##....###...##...######.......#.....#..#......#...#........###.#..#.", + r".*#[.#]{4}##[.#]{4}##[.#]{4}###", + [0, 49], + ['#.#......##.#..#..##........##....###...##...####'] + }, + Test_find_all{ + "1234 Aa dddd Aaf 12334 Aa opopo Aaf", + r"Aa.+Aaf", + [5, 16, 23, 35], + ['Aa dddd Aaf', 'Aa opopo Aaf'] + }, + Test_find_all{ + "@for something @endfor @for something else @endfor altro testo @for body @endfor uno due @for senza dire più @endfor pippo", + r"@for.+@endfor", + [0, 22, 23, 50, 63, 80, 89, 117], + ['@for something @endfor', '@for something else @endfor', '@for body @endfor', '@for senza dire più @endfor'] + } + +] +) + +const ( + debug = true // true for debug println +) + +fn test_regex(){ + // check capturing groups + for c,to in cgroups_test_suite { + // debug print + if debug { + println("$c [${to.src}] [q${to.q}] (${to.s}, ${to.e})") + } + + mut re := regex.regex_opt(to.q) or { + eprintln('err: $err') + assert false + continue + } + + if to.cgn.len > 0 { + re.group_csave_flag = true + //re.group_csave = [-1].repeat(3*20+1) + if debug { println("continuous save")} + } else { + if debug { println("NO continuous save")} + } + + start, end := re.match_string(to.src) + + mut tmp_str := "" + if start >= 0 && end > start{ + tmp_str = to.src[start..end] + } + + if start != to.s || end != to.e { + //println("#$c [$to.src] q[$to.q] res[$tmp_str] $start, $end") + eprintln("ERROR!") + //C.printf("ERROR!! res:(%d, %d) refh:(%d, %d)\n",start, end, to.s, to.e) + assert false + continue + } + + // check cgroups + if to.cgn.len > 0 { + if re.group_csave.len == 0 || re.group_csave[0] != to.cg[0] { + eprintln("Capturing group len error! found: ${re.group_csave[0]} true ground: ${to.cg[0]}") + assert false + continue + } + + // check captured groups + mut ln := re.group_csave[0]*3 + for ln > 0 { + if re.group_csave[ln] != to.cg[ln] { + eprintln("Capturing group failed on $ln item!") + assert false + } + ln-- + } + + // check named captured groups + for k in to.cgn.keys() { + if to.cgn[k] != (re.group_map[k]-1) { // we have -1 because the map not found is 0, in groups we start from 0 and we store using +1 + eprintln("Named capturing group error! [$k]") + assert false + continue + } + } + } else { + // check normal captured groups + if re.groups.len != to.cg.len { + assert false + } + for ln:=0; ln < re.groups.len; ln++ { + if re.groups[ln] != to.cg[ln] { + eprintln("Capture group doesn't match:") + eprintln("true ground: [${to.cg}]") + eprintln("elaborated : [${re.groups}]") + assert false + } + } + } + } + + // check find_all + for c,to in find_all_test_suite { + // debug print + if debug { println("#$c [$to.src] q[$to.q] ($to.res, $to.res_str)") } + + mut re := regex.regex_opt(to.q) or { + eprintln('err: $err') + assert false + continue + } + + re.reset() + res := re.find_all(to.src) + if res != to.res { + eprintln('err: find_all !!') + if debug { println("#$c exp: $to.res calculated: $res") } + assert false + } + + res_str := re.find_all_str(to.src) + if res_str != to.res_str { + eprintln('err: find_all_str !!') + if debug { println("#$c exp: $to.res_str calculated: $res_str") } + assert false + } + } + + // check replace + for c,to in match_test_suite_replace{ + // debug print + if debug { println("#$c [$to.src] q[$to.q] $to.r") } + + mut re := regex.regex_opt(to.q) or { + eprintln('err: $err') + assert false + continue + } + + res := re.replace(to.src,to.rep) + if res != to.r { + eprintln("ERROR: replace.") + assert false + continue + } + } + + // check replace simple + for c,to in match_test_suite_replace_simple{ + // debug print + if debug { println("#$c [$to.src] q[$to.q] $to.r") } + + mut re := regex.regex_opt(to.q) or { + eprintln('err: $err') + assert false + continue + } + + res := re.replace_simple(to.src,to.rep) + if res != to.r { + eprintln("ERROR: replace.") + assert false + continue + } + } + + // check match and find + for c,to in match_test_suite { + // debug print + if debug { println("#$c [$to.src] q[$to.q] $to.s $to.e") } + + // test the find + if to.s > 0 { + mut re := regex.regex_opt(to.q) or { + eprintln('err: $err') + assert false + continue + } + // q_str := re.get_query() + // eprintln("Query: $q_str") + start,end := re.find(to.src) + + if start != to.s || end != to.e { + err_str := re.get_parse_error_string(start) + eprintln("ERROR : $err_str start: ${start} end: ${end}") + assert false + } else { + //tmp_str := text[start..end] + //println("found in [$start, $end] => [$tmp_str]") + assert true + } + continue + } + + // test the match + mut re := regex.new() + //re.debug = true + + re.compile_opt(to.q) or { + eprintln('err: $err') + assert false + continue + } + //println("#$c [$to.src] q[$to.q]") + start, end := re.match_string(to.src) + + mut tmp_str := "" + if start >= 0 && end > start{ + tmp_str = to.src[start..end] + } + + if start != to.s || end != to.e { + eprintln("#$c [$to.src] q[$to.q] res[$tmp_str] $start, $end") + eprintln("ERROR!") + //C.printf("ERROR!! res:(%d, %d) refh:(%d, %d)\n",start, end, to.s, to.e) + assert false + continue + } + + // rerun to test consistency + tmp_str1 := to.src.clone() + start1, end1 := re.match_string(tmp_str1) + if start1 != start || end1 != end { + eprintln("two run ERROR!!") + assert false + continue + } + + } + + if debug { println("DONE!") } +} + +// test regex_base function +fn test_regex_func(){ + query := r"\d\dabcd" + test_str := "78abcd" + mut re, re_err, err_pos := regex.regex_base(query) + if re_err == regex.compile_ok { + start, end := re.match_string(test_str) + assert (start == 0) && (end == 6) + } else { + eprintln("Error in query string in pos ${err_pos}") + eprintln("Error: ${re.get_parse_error_string(re_err)}") + assert false + } +} + +fn my_repl(re regex.RE, in_txt string, start int, end int) string { + s0 := re.get_group_by_id(in_txt,0)[0..1] + "X" + s1 := re.get_group_by_id(in_txt,1)[0..1] + "X" + s2 := re.get_group_by_id(in_txt,2)[0..1] + "X" + return "${s0}${s1}${s2}" +} + + +// test regex replace function +fn test_regex_func_replace(){ + filler := "E il primo dei tre regni dell'Oltretomba cristiano visitato da Dante nel corso del viaggio, con la guida di Virgilio." + txt := r'"content": "They dont necessarily flag "you will be buying these shares on margin!"", "channel_id"' + query := r'"(content":\s+")(.*)(, "channel_id")' + mut re := regex.regex_opt(query) or { panic(err) } + + mut txt1 := "" + mut txt2 := "" + + for _ in 0..3 { + rnd := int(10+rand.u32() % 20) + txt1 += txt + filler[0..rnd] + "\n" + txt2 += "cXTX,X" + filler[0..rnd] + "\n" + } + + result := re.replace_by_fn(txt1, my_repl) + if debug { + eprintln(result) + eprintln(txt2) + } + assert result == txt2 +}
\ No newline at end of file diff --git a/v_windows/v/vlib/regex/regex_util.v b/v_windows/v/vlib/regex/regex_util.v new file mode 100644 index 0000000..0bf1a81 --- /dev/null +++ b/v_windows/v/vlib/regex/regex_util.v @@ -0,0 +1,436 @@ +/* +regex 1.0 alpha + +Copyright (c) 2019-2021 Dario Deledda. All rights reserved. +Use of this source code is governed by an MIT license +that can be found in the LICENSE file. +*/ +module regex + +import strings + +/****************************************************************************** +* +* Inits +* +******************************************************************************/ +// regex create a regex object from the query string, retunr RE object and errors as re_err, err_pos +pub fn regex_base(pattern string) (RE, int, int) { + // init regex + mut re := RE{} + re.prog = []Token{len: pattern.len + 1} // max program length, can not be longer then the pattern + re.cc = []CharClass{len: pattern.len} // can not be more char class the the length of the pattern + re.group_csave_flag = false // enable continuos group saving + re.group_max_nested = 128 // set max 128 group nested + re.group_max = pattern.len >> 1 // we can't have more groups than the half of the pattern legth + + re.group_stack = []int{len: re.group_max, init: -1} + re.group_data = []int{len: re.group_max, init: -1} + + re_err, err_pos := re.impl_compile(pattern) + return re, re_err, err_pos +} + +/****************************************************************************** +* +* Utilities +* +******************************************************************************/ +// get_group_bounds_by_name get a group boundaries by its name +pub fn (re RE) get_group_bounds_by_name(group_name string) (int, int) { + if group_name in re.group_map { + tmp_index := re.group_map[group_name] - 1 + start := re.groups[tmp_index * 2] + end := re.groups[tmp_index * 2 + 1] + return start, end + } + return -1, -1 +} + +// get_group_by_name get a group boundaries by its name +pub fn (re RE) get_group_by_name(in_txt string, group_name string) string { + if group_name in re.group_map { + tmp_index := re.group_map[group_name] - 1 + start := re.groups[tmp_index * 2] + end := re.groups[tmp_index * 2 + 1] + if start >= 0 && end > start { + return in_txt[start..end] + } + } + return '' +} + +// get_group_by_id get a group string by its id +pub fn (re RE) get_group_by_id(in_txt string, group_id int) string { + if group_id < (re.groups.len >> 1) { + index := group_id << 1 + start := re.groups[index] + end := re.groups[index + 1] + if start >= 0 && end > start { + return in_txt[start..end] + } + } + return '' +} + +// get_group_by_id get a group boundaries by its id +pub fn (re RE) get_group_bounds_by_id(group_id int) (int, int) { + if group_id < re.group_count { + index := group_id << 1 + return re.groups[index], re.groups[index + 1] + } + return -1, -1 +} + +pub struct Re_group { +pub: + start int = -1 + end int = -1 +} + +// get_group_list return a list of Re_group for the found groups +pub fn (re RE) get_group_list() []Re_group { + mut res := []Re_group{len: re.groups.len >> 1} + mut gi := 0 + // println("len: ${re.groups.len} groups: ${re.groups}") + + for gi < re.groups.len { + if re.groups[gi] >= 0 { + txt_st := re.groups[gi] + txt_en := re.groups[gi + 1] + + // println("#${gi/2} start: ${re.groups[gi]} end: ${re.groups[gi + 1]} ") + if txt_st >= 0 && txt_en > txt_st { + tmp := Re_group{ + start: re.groups[gi] + end: re.groups[gi + 1] + } + // println(tmp) + res[gi >> 1] = tmp + } else { + res[gi >> 1] = Re_group{} + } + } + gi += 2 + } + return res +} + +/****************************************************************************** +* +* Matchers +* +******************************************************************************/ +// match_string Match the pattern with the in_txt string +[direct_array_access] +pub fn (mut re RE) match_string(in_txt string) (int, int) { + start, mut end := re.match_base(in_txt.str, in_txt.len + 1) + if end > in_txt.len { + end = in_txt.len + } + + if start >= 0 && end > start { + if (re.flag & f_ms) != 0 && start > 0 { + return no_match_found, 0 + } + if (re.flag & f_me) != 0 && end < in_txt.len { + if in_txt[end] in new_line_list { + return start, end + } + return no_match_found, 0 + } + return start, end + } + return start, end +} + +/****************************************************************************** +* +* Finders +* +******************************************************************************/ +/* +// find internal implementation HERE for reference do not remove!! +[direct_array_access] +fn (mut re RE) find_imp(in_txt string) (int,int) { + old_flag := re.flag + re.flag |= f_src // enable search mode + + start, mut end := re.match_base(in_txt.str, in_txt.len + 1) + //print("Find [$start,$end] '${in_txt[start..end]}'") + if end > in_txt.len { + end = in_txt.len + } + re.flag = old_flag + + if start >= 0 && end > start { + return start, end + } + return no_match_found, 0 +} +*/ + +// find try to find the first match in the input string +[direct_array_access] +pub fn (mut re RE) find(in_txt string) (int, int) { + // old_flag := re.flag + // re.flag |= f_src // enable search mode + + mut i := 0 + for i < in_txt.len { + mut s := -1 + mut e := -1 + unsafe { + // tmp_str := tos(in_txt.str + i, in_txt.len - i) + // println("Check: [$tmp_str]") + s, e = re.match_base(in_txt.str + i, in_txt.len - i + 1) + + if s >= 0 && e > s { + // println("find match in: ${i+s},${i+e} [${in_txt[i+s..i+e]}]") + // re.flag = old_flag + return i + s, i + e + } + i++ + } + } + // re.flag = old_flag + return -1, -1 +} + +// find try to find the first match in the input string strarting from start index +[direct_array_access] +pub fn (mut re RE) find_from(in_txt string, start int) (int, int) { + old_flag := re.flag + re.flag |= f_src // enable search mode + + mut i := start + if i < 0 { + return -1, -1 + } + for i < in_txt.len { + //--- speed references --- + + mut s := -1 + mut e := -1 + + unsafe { + tmp_str := tos(in_txt.str + i, in_txt.len - i) + s, e = re.match_string(tmp_str) + } + //------------------------ + // s,e = re.find_imp(in_txt[i..]) + //------------------------ + if s >= 0 && e > s { + // println("find match in: ${i+s},${i+e} [${in_txt[i+s..i+e]}]") + re.flag = old_flag + return i + s, i + e + } else { + i++ + } + } + re.flag = old_flag + return -1, -1 +} + +// find_all find all the non overlapping occurrences of the match pattern +[direct_array_access] +pub fn (mut re RE) find_all(in_txt string) []int { + // old_flag := re.flag + // re.flag |= f_src // enable search mode + + mut i := 0 + mut res := []int{} + + for i < in_txt.len { + mut s := -1 + mut e := -1 + unsafe { + // tmp_str := in_txt[i..] + // tmp_str := tos(in_txt.str + i, in_txt.len - i) + // println("Check: [$tmp_str]") + s, e = re.match_base(in_txt.str + i, in_txt.len + 1 - i) + + if s >= 0 && e > s { + res << i + s + res << i + e + i += e + continue + } + } + i++ + } + // re.flag = old_flag + return res +} + +// find_all_str find all the non overlapping occurrences of the match pattern, return a string list +[direct_array_access] +pub fn (mut re RE) find_all_str(in_txt string) []string { + // old_flag := re.flag + // re.flag |= f_src // enable search mode + + mut i := 0 + mut res := []string{} + + for i < in_txt.len { + mut s := -1 + mut e := -1 + unsafe { + // tmp_str := in_txt[i..] + // tmp_str := tos(in_txt.str + i, in_txt.len - i) + // println("Check: [$tmp_str]") + s, e = re.match_base(in_txt.str + i, in_txt.len + 1 - i) + + if s >= 0 && e > s { + tmp_str := tos(in_txt.str + i, in_txt.len - i) + // println("Found: $s:$e [${tmp_str[s..e]}]") + res << tmp_str[..e] + i += e + continue + } + } + i++ + } + // re.flag = old_flag + return res +} + +/****************************************************************************** +* +* Replacers +* +******************************************************************************/ +// replace_simple return a string where the matches are replaced with the replace string +pub fn (mut re RE) replace_simple(in_txt string, repl string) string { + pos := re.find_all(in_txt) + + if pos.len > 0 { + mut res := '' + mut i := 0 + + mut s1 := 0 + mut e1 := in_txt.len + + for i < pos.len { + e1 = pos[i] + res += in_txt[s1..e1] + repl + s1 = pos[i + 1] + i += 2 + } + + res += in_txt[s1..] + return res + } + return in_txt +} + +// type of function used for custom replace +// in_txt source text +// start index of the start of the match in in_txt +// end index of the end of the match in in_txt +// the match is in in_txt[start..end] +pub type FnReplace = fn (re RE, in_txt string, start int, end int) string + +// replace_by_fn return a string where the matches are replaced with the string from the repl_fn callback function +pub fn (mut re RE) replace_by_fn(in_txt string, repl_fn FnReplace) string { + mut i := 0 + mut res := strings.new_builder(in_txt.len) + mut last_end := 0 + + for i < in_txt.len { + // println("Find Start. $i [${in_txt[i..]}]") + s, e := re.find_from(in_txt, i) + // println("Find End.") + if s >= 0 && e > s { + // println("find match in: ${s},${e} [${in_txt[s..e]}]") + + if last_end < s { + res.write_string(in_txt[last_end..s]) + } + + for g_i in 0 .. re.group_count { + re.groups[g_i << 1] += i + re.groups[(g_i << 1) + 1] += i + } + + repl := repl_fn(re, in_txt, s, e) + // println("repl res: $repl") + res.write_string(repl) + // res.write_string("[[${in_txt[s..e]}]]") + + last_end = e + i = e + } else { + break + // i++ + } + // println(i) + } + if last_end >= 0 && last_end < in_txt.len { + res.write_string(in_txt[last_end..]) + } + return res.str() +} + +fn (re RE) parsed_replace_string(in_txt string, repl string) string { + str_lst := repl.split('\\') + mut res := str_lst[0] + mut i := 1 + for i < str_lst.len { + tmp := str_lst[i] + // println("tmp: ${tmp}") + if tmp.len > 0 && tmp[0] >= `0` && tmp[0] <= `9` { + group_id := int(tmp[0] - `0`) + group := re.get_group_by_id(in_txt, group_id) + // println("group: $group_id [$group]") + res += '$group${tmp[1..]}' + } else { + res += '\\' + tmp + } + i++ + } + return res +} + +// replace return a string where the matches are replaced with the repl_str string, +// this function support use groups in the replace string +pub fn (mut re RE) replace(in_txt string, repl_str string) string { + mut i := 0 + mut res := strings.new_builder(in_txt.len) + mut last_end := 0 + + for i < in_txt.len { + // println("Find Start. $i [${in_txt[i..]}]") + s, e := re.find_from(in_txt, i) + // println("Find End.") + if s >= 0 && e > s { + // println("find match in: ${s},${e} [${in_txt[s..e]}]") + + if last_end < s { + res.write_string(in_txt[last_end..s]) + } + + for g_i in 0 .. re.group_count { + re.groups[g_i << 1] += i + re.groups[(g_i << 1) + 1] += i + } + + // repl := repl_fn(re, in_txt, s, e) + repl := re.parsed_replace_string(in_txt, repl_str) + // println("repl res: $repl") + res.write_string(repl) + // res.write_string("[[${in_txt[s..e]}]]") + + last_end = e + i = e + } else { + break + // i++ + } + // println(i) + } + if last_end >= 0 && last_end < in_txt.len { + res.write_string(in_txt[last_end..]) + } + return res.str() +} |