Adds most of the tools HEAD master

author: Indrajith K L 2022-12-03 17:00:20 +0530
committer: Indrajith K L 2022-12-03 17:00:20 +0530
commit: f5c4671bfbad96bf346bd7e9a21fc4317b4959df (patch)
tree: 2764fc62da58f2ba8da7ed341643fc359873142f /v_windows/v/vlib/net/html/parser.v
download: cli-tools-windows-f5c4671bfbad96bf346bd7e9a21fc4317b4959df.tar.gz
cli-tools-windows-f5c4671bfbad96bf346bd7e9a21fc4317b4959df.tar.bz2
cli-tools-windows-f5c4671bfbad96bf346bd7e9a21fc4317b4959df.zip
1 files changed, 260 insertions, 0 deletions
diff --git a/v_windows/v/vlib/net/html/parser.v b/v_windows/v/vlib/net/html/parser.v
new file mode 100644
index 0000000..5b9bbd1
--- /dev/null
+++ b/v_windows/v/vlib/net/html/parser.v
@@ -0,0 +1,260 @@
+module html
+
+import os
+import strings
+
+// LexicalAttributes holds the mutable scanner state that `Parser.split_parse`
+// carries from one input byte to the next.
+struct LexicalAttributes {
+mut:
+	// tag currently being assembled; `generate_tag` replaces it with a fresh `&Tag{}`
+	current_tag      &Tag
+	// true while scanning between a `<` and its matching `>`
+	open_tag         bool
+	// true while scanning the raw content of a tag whose name is in `code_tags`
+	open_code        bool
+	// 0 = not inside a quoted string, 1 = inside "...", 2 = inside '...'
+	open_string      int
+	// true while scanning the inside of a `<!-- ... -->` comment
+	open_comment     bool
+	// NOTE(review): not referenced anywhere in parser.v - possibly unused
+	is_attribute     bool
+	// name of the code tag whose content is being scanned; used to recognise its closing tag
+	opened_code_type string
+	// incremented once per input line by `parse_html`
+	line_count       int
+	// accumulates the bytes of the lexeme currently being scanned
+	lexeme_builder   strings.Builder = strings.new_builder(100)
+	// tag names whose content is kept verbatim instead of being scanned as markup
+	code_tags        map[string]bool = {
+		'script': true
+		'style':  true
+	}
+}
+
+// Parser is responsible for reading the HTML strings and converting them into a `DocumentObjectModel`.
+pub struct Parser {
+mut:
+	// DOM that `parse_html` / `get_dom` construct from `tags`
+	dom                DocumentObjectModel
+	// scanner state that persists across `split_parse` calls
+	lexical_attributes LexicalAttributes = LexicalAttributes{
+		current_tag: &Tag{}
+	}
+	// NOTE(review): not referenced elsewhere in parser.v - presumably a label for the input source
+	filename    string = 'direct-parse'
+	// set by `init` so that initialization runs only once
+	initialized bool
+	// flat list of tags in the order they were completed; passed to `dom.construct`
+	tags        []&Tag
+	// file that `print_debug` writes to; also handed to the DOM
+	debug_file  os.File
+}
+
+// add_code_tag registers `name` as a code tag, i.e. a tag whose content the parser keeps
+// verbatim instead of interpreting it as markup (`script` and `style` are registered by default).
+// For example, after `add_code_tag('template')` any `<` or `>` inside a `<template>` element
+// is stored as part of its content and cannot confuse the parser.
+// The name is lower-cased before it is stored, because the parser lower-cases every tag name
+// it reads; a name containing upper-case letters would otherwise never match.
+// An empty name is ignored.
+pub fn (mut parser Parser) add_code_tag(name string) {
+	if name.len == 0 {
+		return
+	}
+	parser.lexical_attributes.code_tags[name.to_lower()] = true
+}
+
+// builder_str returns the text accumulated so far in the lexeme builder
+// (everything from offset 0). The builder itself is not cleared; callers
+// reset it explicitly with `go_back_to(0)` when they are done with the lexeme.
+[inline]
+fn (parser Parser) builder_str() string {
+	return parser.lexical_attributes.lexeme_builder.after(0)
+}
+
+// print_debug appends `data` as one line to the parser's debug file.
+// It only does anything in builds where the `debug` flag is defined, and it
+// skips empty strings. A failed write panics with the underlying error.
+[if debug]
+fn (mut parser Parser) print_debug(data string) {
+	$if debug {
+		if data.len == 0 {
+			return
+		}
+		parser.debug_file.writeln(data) or { panic(err) }
+	}
+}
+
+// verify_end_comment reports whether the text accumulated in the lexeme builder ends with `--`,
+// i.e. whether a `>` read at this point terminates the current comment.
+// When `remove` is true and the marker is present, the two trailing dashes are dropped
+// from the builder.
+// A lexeme shorter than two bytes is simply not an end-of-comment marker; `ends_with`
+// is used instead of indexing so that such a lexeme cannot trigger an out-of-range panic.
+fn (mut parser Parser) verify_end_comment(remove bool) bool {
+	is_end_comment := parser.builder_str().ends_with('--')
+	if is_end_comment && remove {
+		parser.lexical_attributes.lexeme_builder.go_back(2)
+	}
+	return is_end_comment
+}
+
+// blank_string returns true when `data` consists only of tab (9) and space (32) bytes.
+// An empty string counts as blank.
+fn blank_string(data string) bool {
+	for b in data {
+		if b != 9 && b != 32 {
+			return false
+		}
+	}
+	return true
+}
+
+// init prepares the parser for its first use; every later call returns immediately.
+// It creates an empty DOM that shares the parser's debug file, resets the tag list and
+// the tag under construction, and registers `/!document` in the DOM's `close_tags` map.
+// The default code tags (`script`, `style`) come from the `LexicalAttributes` field
+// initializer, so nothing has to be registered here.
+fn (mut parser Parser) init() {
+	if parser.initialized {
+		return
+	}
+	parser.dom = DocumentObjectModel{
+		debug_file: parser.debug_file
+		root: &Tag{}
+	}
+	parser.tags = []&Tag{}
+	parser.dom.close_tags['/!document'] = true
+	parser.lexical_attributes.current_tag = &Tag{}
+	parser.initialized = true
+}
+
+// generate_tag finishes the tag under construction and starts a new, empty one.
+// Nothing happens while a tag is still open (between `<` and `>`).
+// The finished tag is appended to `parser.tags` only if it has a name or some content.
+fn (mut parser Parser) generate_tag() {
+	if parser.lexical_attributes.open_tag {
+		return
+	}
+	has_name := parser.lexical_attributes.current_tag.name.len > 0
+	has_content := parser.lexical_attributes.current_tag.content.len > 0
+	if has_name || has_content {
+		parser.tags << parser.lexical_attributes.current_tag
+	}
+	parser.lexical_attributes.current_tag = &Tag{}
+}
+
+// split_parse parses the HTML fragment `data` byte by byte and appends the tags it
+// completes to `parser.tags`. The scanner state lives in `parser.lexical_attributes`,
+// so a document may be fed in several consecutive calls (as `parse_html` does, line by line).
+// For every byte exactly one of the following states applies, checked in this order:
+// 1. inside the content of a code tag (`open_code`),
+// 2. inside a comment (`open_comment`),
+// 3. inside a quoted string within a tag (`open_string > 0`),
+// 4. inside a tag, between `<` and `>` (`open_tag`),
+// 5. at a `<` that opens a new tag,
+// 6. plain text between tags.
+pub fn (mut parser Parser) split_parse(data string) {
+	parser.init()
+	for chr in data {
+		// returns true if byte is a " or '
+		is_quote := chr == `"` || chr == `'`
+		// same encoding as `open_string`: 1 for ", 2 for ', 0 for any other byte
+		string_code := match chr {
+			`"` { 1 } // "
+			`'` { 2 } // '
+			else { 0 }
+		}
+		if parser.lexical_attributes.open_code { // here will verify all needed to know if open_code finishes and string in code
+			// code content is kept verbatim; quotes are tracked so that a `>` inside a string literal is ignored
+			parser.lexical_attributes.lexeme_builder.write_b(chr)
+			if parser.lexical_attributes.open_string > 0
+				&& parser.lexical_attributes.open_string == string_code {
+				parser.lexical_attributes.open_string = 0
+			} else if is_quote {
+				parser.lexical_attributes.open_string = string_code
+			} else if chr == `>` { // only execute verification if is a > // here will verify < to know if code tag is finished
+				name_close_tag := '</$parser.lexical_attributes.opened_code_type>'
+				if parser.builder_str().to_lower().ends_with(name_close_tag) {
+					parser.lexical_attributes.open_code = false
+					// need to modify lexeme_builder to add script text as a content in next loop (not gave error in dom)
+					parser.lexical_attributes.lexeme_builder.go_back(name_close_tag.len)
+					parser.lexical_attributes.current_tag.closed = true
+					parser.lexical_attributes.current_tag.close_type = .new_tag
+				}
+			}
+		} else if parser.lexical_attributes.open_comment {
+			// a comment ends at the first `>` that directly follows `--`; its text is discarded
+			if chr == `>` && parser.verify_end_comment(false) { // close tag '>'
+				// parser.print_debug(parser.builder_str() + " >> " + parser.lexical_attributes.line_count.str())
+				parser.lexical_attributes.lexeme_builder.go_back_to(0)
+				parser.lexical_attributes.open_comment = false
+				parser.lexical_attributes.open_tag = false
+			} else {
+				parser.lexical_attributes.lexeme_builder.write_b(chr)
+			}
+		} else if parser.lexical_attributes.open_string > 0 {
+			if parser.lexical_attributes.open_string == string_code {
+				// matching closing quote: the quoted lexeme (quotes included) is complete
+				parser.lexical_attributes.open_string = 0
+				parser.lexical_attributes.lexeme_builder.write_b(chr)
+				temp_lexeme := parser.builder_str()
+				if parser.lexical_attributes.current_tag.last_attribute != '' {
+					// value of the attribute that preceded `=`; the surrounding quotes are stripped
+					lattr := parser.lexical_attributes.current_tag.last_attribute
+					nval := temp_lexeme.substr(1, temp_lexeme.len - 1)
+					// parser.print_debug(lattr + " = " + temp_lexeme)
+					parser.lexical_attributes.current_tag.attributes[lattr] = nval
+					parser.lexical_attributes.current_tag.last_attribute = ''
+				} else {
+					// quoted lexeme without a preceding `name=`: stored as an attribute with an empty value
+					parser.lexical_attributes.current_tag.attributes[temp_lexeme.to_lower()] = '' // parser.print_debug(temp_lexeme)
+				}
+				parser.lexical_attributes.lexeme_builder.go_back_to(0)
+			} else {
+				parser.lexical_attributes.lexeme_builder.write_b(chr)
+			}
+		} else if parser.lexical_attributes.open_tag {
+			if parser.lexical_attributes.lexeme_builder.len == 0 && is_quote {
+				// a quote at the start of a lexeme opens a quoted string
+				parser.lexical_attributes.open_string = string_code
+				parser.lexical_attributes.lexeme_builder.write_b(chr)
+			} else if chr == `>` { // close tag >
+				complete_lexeme := parser.builder_str().to_lower()
+				// a lexeme ending in `/` (as in `<br/>`) marks the tag as closed
+				parser.lexical_attributes.current_tag.closed = (complete_lexeme.len > 0
+					&& complete_lexeme[complete_lexeme.len - 1] == `/`) // if equals to /
+				// a lexeme starting with `/` is a closing tag; it is recorded in the DOM's `close_tags`
+				if complete_lexeme.len > 0 && complete_lexeme[0] == `/` {
+					parser.dom.close_tags[complete_lexeme] = true
+				}
+				/*
+				else if complete_lexeme.len > 0 && complete_lexeme[complete_lexeme.len - 1] == 47 { // if end tag like "/>"
+					parser.lexical_attributes.current_tag.closed = true
+				}
+				*/
+				// the first lexeme of a tag is its name, any later one is a value-less attribute
+				if parser.lexical_attributes.current_tag.name == '' {
+					parser.lexical_attributes.current_tag.name = complete_lexeme
+				} else if complete_lexeme != '/' {
+					parser.lexical_attributes.current_tag.attributes[complete_lexeme] = ''
+				}
+				parser.lexical_attributes.open_tag = false
+				parser.lexical_attributes.lexeme_builder.go_back_to(0) // if tag name is code
+				if parser.lexical_attributes.current_tag.name in parser.lexical_attributes.code_tags {
+					parser.lexical_attributes.open_code = true
+					parser.lexical_attributes.opened_code_type = parser.lexical_attributes.current_tag.name
+				}
+				// parser.print_debug(parser.lexical_attributes.current_tag.name)
+			} else if chr !in [byte(9), ` `, `=`, `\n`] { // Tab, space, = and \n
+				parser.lexical_attributes.lexeme_builder.write_b(chr)
+			} else if chr != 10 {
+				// tab, space or `=` ends the current lexeme (a line feed, byte 10, is skipped)
+				complete_lexeme := parser.builder_str().to_lower()
+				if parser.lexical_attributes.current_tag.name == '' {
+					parser.lexical_attributes.current_tag.name = complete_lexeme
+				} else {
+					parser.lexical_attributes.current_tag.attributes[complete_lexeme] = ''
+					parser.lexical_attributes.current_tag.last_attribute = ''
+					if chr == `=` { // if was a =
+						// remember the attribute name so the next quoted string becomes its value
+						parser.lexical_attributes.current_tag.last_attribute = complete_lexeme
+					}
+				}
+				parser.lexical_attributes.lexeme_builder.go_back_to(0)
+			}
+			// `<!--` has just been read: switch to comment mode
+			if parser.builder_str() == '!--' {
+				parser.lexical_attributes.open_comment = true
+			}
+		} else if chr == `<` { // open tag '<'
+			// the text collected since the previous tag is attached before the new tag starts
+			temp_string := parser.builder_str()
+			if parser.lexical_attributes.lexeme_builder.len >= 1 {
+				// non-blank text that follows a closing tag (name starts with `/`, byte 47) becomes a separate `text` tag
+				if parser.lexical_attributes.current_tag.name.len > 1
+					&& parser.lexical_attributes.current_tag.name[0] == 47
+					&& !blank_string(temp_string) {
+					parser.tags << &Tag{
+						name: 'text'
+						content: temp_string
+					}
+				} else {
+					parser.lexical_attributes.current_tag.content = temp_string // verify later who has this content
+				}
+			}
+			// parser.print_debug(parser.lexical_attributes.current_tag.str())
+			parser.lexical_attributes.lexeme_builder.go_back_to(0)
+			parser.generate_tag()
+			parser.lexical_attributes.open_tag = true
+		} else {
+			// plain text between tags
+			parser.lexical_attributes.lexeme_builder.write_b(chr)
+		}
+	}
+}
+
+// parse_html parses the given HTML string.
+// The input is handed to `split_parse` one line at a time (counting the lines as it goes);
+// afterwards the pending tag is flushed and the DOM is constructed from the collected tags.
+pub fn (mut parser Parser) parse_html(data string) {
+	parser.init()
+	for html_line in data.split_into_lines() {
+		parser.lexical_attributes.line_count++
+		parser.split_parse(html_line)
+	}
+	parser.generate_tag()
+	parser.dom.debug_file = parser.debug_file
+	parser.dom.construct(parser.tags)
+}
+
+// finalize finishes the parsing stage by handing the tag that is still under
+// construction, if any, to `generate_tag`, which moves it into the parser's tag list.
+[inline]
+pub fn (mut parser Parser) finalize() {
+	parser.generate_tag()
+}
+
+// get_dom returns the parser's current DOM representation.
+// If the DOM has not been constructed yet, the pending tag is flushed and the DOM
+// is built from the tags collected so far before it is returned.
+pub fn (mut parser Parser) get_dom() DocumentObjectModel {
+	if parser.dom.constructed {
+		return parser.dom
+	}
+	parser.generate_tag()
+	parser.dom.construct(parser.tags)
+	return parser.dom
+}
author	Indrajith K L	2022-12-03 17:00:20 +0530
committer	Indrajith K L	2022-12-03 17:00:20 +0530
commit	f5c4671bfbad96bf346bd7e9a21fc4317b4959df (patch)
tree	2764fc62da58f2ba8da7ed341643fc359873142f /v_windows/v/vlib/net/html/parser.v
download	cli-tools-windows-f5c4671bfbad96bf346bd7e9a21fc4317b4959df.tar.gz cli-tools-windows-f5c4671bfbad96bf346bd7e9a21fc4317b4959df.tar.bz2 cli-tools-windows-f5c4671bfbad96bf346bd7e9a21fc4317b4959df.zip