diff options
Diffstat (limited to 'v_windows/v/vlib/x/json2/scanner.v')
-rw-r--r-- | v_windows/v/vlib/x/json2/scanner.v | 306 |
1 files changed, 306 insertions, 0 deletions
diff --git a/v_windows/v/vlib/x/json2/scanner.v b/v_windows/v/vlib/x/json2/scanner.v new file mode 100644 index 0000000..7956258 --- /dev/null +++ b/v_windows/v/vlib/x/json2/scanner.v @@ -0,0 +1,306 @@
// Copyright (c) 2019-2021 Alexander Medvednikov. All rights reserved.
// Use of this source code is governed by an MIT license
// that can be found in the LICENSE file.
module json2

import strconv

// Scanner holds the input bytes and the cursor state used while tokenizing.
struct Scanner {
mut:
	text []byte // input being scanned
	pos  int // index of the current byte in `text`
	line int // incremented by `move_pos` when it lands on a byte listed in `newlines`
	col  int // reset to 0 whenever `line` is incremented
}

// TokenKind identifies what a Token represents.
// The punctuation kinds (`comma` .. `rcbr`) carry explicit values equal to the
// ASCII code of the character they stand for (`,` `:` `[` `]` `{` `}`), which
// lets `scan` build the kind directly from the scanned byte.
enum TokenKind {
	none_
	error
	str_
	float
	int_
	null
	bool_
	eof
	comma = 44
	colon = 58
	lsbr = 91
	rsbr = 93
	lcbr = 123
	rcbr = 125
}

// Token is a single lexical unit produced by the scanner.
struct Token {
	lit  []byte // token bytes: string contents, number text, keyword text or error description; empty for punctuation and eof
	kind TokenKind
	line int // scanner `line` at the moment the token was created
	col  int // scanner `col` at the moment the token was created
}

const (
	// structural characters of JSON; each one becomes its own token.
	char_list = [`{`, `}`, `[`, `]`, `,`, `:`]
	// bytes treated as newlines when moving to a new position.
	// `\t` is listed here as well, so a tab is handled the same way as a
	// line break wherever this list is consulted (`move_pos`, `scan`).
	newlines = [`\r`, `\n`, `\t`]
	// control characters that must be escaped inside a JSON string.
	// double quotes and forward slashes are excluded intentionally since
	// they have their own separate checks in order to pass the
	// JSON test suite (https://github.com/nst/JSONTestSuite/).
	important_escapable_chars = [`\b`, `\f`, `\n`, `\r`, `\t`]
	// characters that may follow a backslash, aside from \u{4-hex digits}
	valid_unicode_escapes = [`b`, `f`, `n`, `r`, `t`, `\\`, `"`, `/`]
	// maps the character after a backslash to the byte it stands for
	// (eg. n => \n). The keys are the ASCII codes of b, f, n, r, t, \, " and /.
	unicode_transform_escapes = {
		98: `\b`
		102: `\f`
		110: `\n`
		114: `\r`
		116: `\t`
		92: `\\`
		34: `"`
		47: `/`
	}
	// signs accepted right after the `e`/`E` of a number's exponent.
	exp_signs = [byte(`-`), `+`]
)

// move proceeds to the next position by calling move_pos with both
// space skipping and newline handling enabled.
fn (mut s Scanner) move() {
	s.move_pos(true, true)
}

// move_pos_with_newlines is the same as move_pos but only enables newline checking.
// Calls move_pos(false, true): spaces are not skipped, newline bytes are.
fn (mut s Scanner) move_pos_with_newlines() {
	s.move_pos(false, true)
}

// move_pos advances the cursor by one byte and then, depending on the flags,
// continues past whitespace:
// - include_newlines: when the new byte is listed in `newlines`, `line` is
//   incremented, `col` is reset, the cursor moves on to the `\n` of a `\r\n`
//   pair, and move() is called for as long as the cursor sits on a byte
//   listed in `newlines`.
// - include_space: otherwise, when the new byte is a space, the cursor steps
//   past it and move() is called for as long as the cursor sits on a space.
// When the increment runs past the end of the input only `col` is incremented.
fn (mut s Scanner) move_pos(include_space bool, include_newlines bool) {
	s.pos++
	if s.pos >= s.text.len {
		s.col++
		return
	}
	cur := s.text[s.pos]
	if include_newlines && cur in json2.newlines {
		s.line++
		s.col = 0
		has_next := s.pos + 1 < s.text.len
		if cur == `\r` && has_next && s.text[s.pos + 1] == `\n` {
			s.pos++
		}
		for s.pos < s.text.len && s.text[s.pos] in json2.newlines {
			s.move()
		}
		return
	}
	if include_space && cur == ` ` {
		s.pos++
		s.col++
		for s.pos < s.text.len && s.text[s.pos] == ` ` {
			s.move()
		}
	}
}

// error builds an `.error` token whose `lit` holds the bytes of `description`.
fn (s Scanner) error(description string) Token {
	msg := description.bytes()
	return s.tokenize(msg, .error)
}

// tokenize wraps `lit` and `kind` in a Token stamped with the scanner's
// current `line` and `col`.
fn (s Scanner) tokenize(lit []byte, kind TokenKind) Token {
	return Token{
		kind: kind
		lit: lit
		line: s.line
		col: s.col
	}
}

// text_scan scans and returns a string token.
// The opening `"` is expected at the current position (that is where `scan`
// hands over). Backslash escapes are decoded into the token's `lit`. An error
// token is returned for a raw \b, \f, \n, \r or \t byte, a NUL byte, a
// malformed escape, or a string that is never closed.
// NOTE: each `\uXXXX` escape is converted on its own, so UTF-16 surrogate
// pairs are not combined into a single code point.
[manualfree]
fn (mut s Scanner) text_scan() Token {
	mut has_closed := false
	mut chrs := []byte{}
	for {
		s.pos++
		s.col++
		if s.pos >= s.text.len {
			break
		}
		ch := s.text[s.pos]
		// Every backslash is consumed together with the byte(s) it escapes in
		// the escape branch below, so a byte reached here is never part of an
		// escape sequence. Checking the previous byte for a backslash instead
		// would mistake the closing quote of `"\\"` for an escaped quote and
		// would let a raw control byte through after an escaped backslash.
		if ch == `"` {
			has_closed = true
			break
		} else if ch in json2.important_escapable_chars {
			return s.error('character must be escaped with a backslash')
		} else if (s.pos == s.text.len - 1 && ch == `\\`) || ch == byte(0) {
			return s.error('invalid backslash escape')
		} else if s.pos + 1 < s.text.len && ch == `\\` {
			peek := s.text[s.pos + 1]
			if peek in json2.valid_unicode_escapes {
				// simple two-byte escape: emit the byte it stands for and
				// step over the escaped character.
				chrs << json2.unicode_transform_escapes[int(peek)]
				s.pos++
				s.col++
				continue
			} else if peek == `u` {
				// s.pos + 5 is the index of the fourth hex digit.
				if s.pos + 5 < s.text.len {
					s.pos++
					s.col++
					mut codepoint := []byte{}
					codepoint_start := s.pos
					for s.pos < s.text.len && s.pos < codepoint_start + 4 {
						s.pos++
						s.col++
						if s.text[s.pos] == `"` {
							break
						} else if !s.text[s.pos].is_hex_digit() {
							x := s.text[s.pos].ascii_str()
							return s.error('`$x` is not a hex digit')
						}
						codepoint << s.text[s.pos]
					}
					if codepoint.len != 4 {
						return s.error('unicode escape must have 4 hex digits')
					}
					val := u32(strconv.parse_uint(codepoint.bytestr(), 16, 32) or { 0 })
					converted := utf32_to_str(val)
					converted_bytes := converted.bytes()
					chrs << converted_bytes
					unsafe {
						converted.free()
						converted_bytes.free()
						codepoint.free()
					}
					continue
				} else {
					return s.error('incomplete unicode escape')
				}
			} else if peek == `U` {
				return s.error('unicode endpoints must be in lowercase `u`')
			} else if peek == byte(229) {
				return s.error('unicode endpoint not allowed')
			} else {
				return s.error('invalid backslash escape')
			}
		}
		chrs << ch
	}
	// The string token is stamped with the position before moving past the
	// closing quote; the error token (if any) with the position after.
	tok := s.tokenize(chrs, .str_)
	s.move()
	if !has_closed {
		return s.error('missing double quotes in string closing')
	}
	return tok
}

// num_scan scans
// and returns an int/float token.
// The token's `lit` is the number text as written. The kind is `.float` only
// when a decimal point was seen, otherwise `.int_` (exponent forms without a
// decimal point included). An error token is returned for a `-` that is not
// followed by a digit, a leading zero, a decimal point without digits after
// it, or an exponent without digits.
fn (mut s Scanner) num_scan() Token {
	// analyze json number structure
	// -[digit][?[dot][digit]][?[E/e][?-/+][digit]]
	mut is_fl := false
	mut dot_index := -1
	mut digits := []byte{}
	if s.text[s.pos] == `-` {
		digits << `-`
		// A `-` at the very end of the input has no following byte to look at;
		// treat it like any other `-` that is not followed by a digit.
		if s.pos + 1 >= s.text.len || !s.text[s.pos + 1].is_digit() {
			return s.invalid_token()
		}
		s.move_pos_with_newlines()
	}
	if s.text[s.pos] == `0` && (s.pos + 1 < s.text.len && s.text[s.pos + 1].is_digit()) {
		return s.error('leading zeroes in a number are not allowed')
	}
	for s.pos < s.text.len && (s.text[s.pos].is_digit() || (!is_fl && s.text[s.pos] == `.`)) {
		digits << s.text[s.pos]
		if s.text[s.pos] == `.` {
			is_fl = true
			dot_index = digits.len - 1
		}
		s.move_pos_with_newlines()
	}
	// Nothing was collected after the decimal point (or nothing at all).
	// `dot_index` indexes `digits`, so it is compared against `digits.len`;
	// comparing it against the input length let a number such as `1.` slip
	// through when it made up the whole input.
	if digits.len == dot_index + 1 {
		return s.error('invalid float')
	}
	if s.pos < s.text.len && (s.text[s.pos] == `e` || s.text[s.pos] == `E`) {
		digits << s.text[s.pos]
		s.move_pos_with_newlines()
		if s.pos < s.text.len && s.text[s.pos] in json2.exp_signs {
			digits << s.text[s.pos]
			s.move_pos_with_newlines()
		}
		mut exp_digits_count := 0
		for s.pos < s.text.len && s.text[s.pos].is_digit() {
			digits << s.text[s.pos]
			exp_digits_count++
			s.move_pos_with_newlines()
		}
		if exp_digits_count == 0 {
			return s.error('invalid exponent')
		}
	}
	kind := if is_fl { TokenKind.float } else { TokenKind.int_ }
	return s.tokenize(digits, kind)
}

// invalid_token returns an error token with the invalid token message.
// Printable ASCII bytes (32..126) are shown as-is, anything else in its
// escaped form.
fn (s Scanner) invalid_token() Token {
	ch := s.text[s.pos]
	x := if ch >= 32 && ch <= 126 { ch.ascii_str() } else { ch.str_escaped() }
	return s.error('invalid token `$x`')
}

// scan returns a token based on the scanner's current position.
// Leading spaces and bytes listed in `newlines` are skipped first. Then the
// byte under the cursor decides what is produced: `eof` at the end of input,
// the keywords `true` / `null` / `false`, a punctuation token for a byte in
// `char_list`, a string (text_scan), a number (num_scan), or an error token
// for anything else.
[manualfree]
fn (mut s Scanner) scan() Token {
	// A single move() can stop on whitespace: after stepping onto a space,
	// move_pos steps past the following byte without inspecting it, so two
	// spaces followed by a newline leave the cursor on the newline. Keep
	// moving until the cursor is off whitespace; each move() advances `pos`,
	// so the loop terminates.
	for s.pos < s.text.len && (s.text[s.pos] == ` ` || s.text[s.pos] in json2.newlines) {
		s.move()
	}
	if s.pos >= s.text.len {
		return s.tokenize([]byte{}, .eof)
	} else if s.pos + 3 < s.text.len && (s.text[s.pos] == `t` || s.text[s.pos] == `n`) {
		ident := s.text[s.pos..s.pos + 4].bytestr()
		if ident == 'true' || ident == 'null' {
			mut kind := TokenKind.null
			if ident == 'true' {
				kind = .bool_
			}
			unsafe { ident.free() }
			val := s.text[s.pos..s.pos + 4]
			// the token is created before moving so it carries the
			// position the scanner had when the keyword was recognised.
			tok := s.tokenize(val, kind)
			s.move() // n / t
			s.move() // u / r
			s.move() // l / u
			s.move() // l / e
			return tok
		}
		unsafe { ident.free() }
		return s.invalid_token()
	} else if s.pos + 4 < s.text.len && s.text[s.pos] == `f` {
		ident := s.text[s.pos..s.pos + 5].bytestr()
		if ident == 'false' {
			unsafe { ident.free() }
			val := s.text[s.pos..s.pos + 5]
			tok := s.tokenize(val, .bool_)
			s.move() // f
			s.move() // a
			s.move() // l
			s.move() // s
			s.move() // e
			return tok
		}
		unsafe { ident.free() }
		return s.invalid_token()
	} else if s.text[s.pos] in json2.char_list {
		// punctuation kinds are numbered with the ASCII code of their
		// character, so the byte converts straight into its TokenKind.
		chr := s.text[s.pos]
		tok := s.tokenize([]byte{}, TokenKind(int(chr)))
		s.move()
		return tok
	} else if s.text[s.pos] == `"` {
		return s.text_scan()
	} else if s.text[s.pos].is_digit() || s.text[s.pos] == `-` {
		return s.num_scan()
	} else {
		return s.invalid_token()
	}
}