diff options
| author | Indrajith K L | 2022-12-03 17:00:20 +0530 | 
|---|---|---|
| committer | Indrajith K L | 2022-12-03 17:00:20 +0530 | 
| commit | f5c4671bfbad96bf346bd7e9a21fc4317b4959df (patch) | |
| tree | 2764fc62da58f2ba8da7ed341643fc359873142f /v_windows/v/old/vlib/net/html | |
| download | cli-tools-windows-f5c4671bfbad96bf346bd7e9a21fc4317b4959df.tar.gz cli-tools-windows-f5c4671bfbad96bf346bd7e9a21fc4317b4959df.tar.bz2 cli-tools-windows-f5c4671bfbad96bf346bd7e9a21fc4317b4959df.zip | |
Diffstat (limited to 'v_windows/v/old/vlib/net/html')
| -rw-r--r-- | v_windows/v/old/vlib/net/html/README.md | 16 | ||||
| -rw-r--r-- | v_windows/v/old/vlib/net/html/data_structures.v | 91 | ||||
| -rw-r--r-- | v_windows/v/old/vlib/net/html/dom.v | 189 | ||||
| -rw-r--r-- | v_windows/v/old/vlib/net/html/dom_test.v | 56 | ||||
| -rw-r--r-- | v_windows/v/old/vlib/net/html/html.v | 18 | ||||
| -rw-r--r-- | v_windows/v/old/vlib/net/html/html_test.v | 15 | ||||
| -rw-r--r-- | v_windows/v/old/vlib/net/html/parser.v | 260 | ||||
| -rw-r--r-- | v_windows/v/old/vlib/net/html/parser_test.v | 41 | ||||
| -rw-r--r-- | v_windows/v/old/vlib/net/html/tag.v | 68 | 
9 files changed, 754 insertions, 0 deletions
| diff --git a/v_windows/v/old/vlib/net/html/README.md b/v_windows/v/old/vlib/net/html/README.md new file mode 100644 index 0000000..a92a6e6 --- /dev/null +++ b/v_windows/v/old/vlib/net/html/README.md @@ -0,0 +1,16 @@ +net/html is an HTML parser written in pure V. + +## Usage +```v oksyntax +import net.html + +fn main() { +	doc := html.parse('<html><body><h1 class="title">Hello world!</h1></body></html>') +	tag := doc.get_tag('h1')[0] // <h1>Hello world!</h1> +	println(tag.name) // h1 +	println(tag.content) // Hello world! +	println(tag.attributes) // {'class':'title'} +	println(tag.str()) // <h1 class="title">Hello world!</h1> +} +``` +More examples can be found in [`parser_test.v`](parser_test.v) and [`html_test.v`](html_test.v) diff --git a/v_windows/v/old/vlib/net/html/data_structures.v b/v_windows/v/old/vlib/net/html/data_structures.v new file mode 100644 index 0000000..688b756 --- /dev/null +++ b/v_windows/v/old/vlib/net/html/data_structures.v @@ -0,0 +1,91 @@ +module html + +const ( +	null_element = int(0x80000000) +) + +struct Stack { +mut: +	elements []int +	size     int +} + +[inline] +fn is_null(data int) bool { +	return data == html.null_element +} + +[inline] +fn (stack Stack) is_empty() bool { +	return stack.size <= 0 +} + +fn (stack Stack) peek() int { +	return if !stack.is_empty() { stack.elements[stack.size - 1] } else { html.null_element } +} + +fn (mut stack Stack) pop() int { +	mut to_return := html.null_element +	if !stack.is_empty() { +		to_return = stack.elements[stack.size - 1] +		stack.size-- +	} +	return to_return +} + +fn (mut stack Stack) push(item int) { +	if stack.elements.len > stack.size { +		stack.elements[stack.size] = item +	} else { +		stack.elements << item +	} +	stack.size++ +} + +struct BTree { +mut: +	all_tags     []Tag +	node_pointer int +	childrens    [][]int +	parents      []int +} + +fn (mut btree BTree) add_children(tag Tag) int { +	btree.all_tags << tag +	if btree.all_tags.len > 1 { +		for btree.childrens.len <= btree.node_pointer { 
+			mut temp_array := btree.childrens +			temp_array << []int{} +			btree.childrens = temp_array +		} +		btree.childrens[btree.node_pointer] << btree.all_tags.len - 1 +		for btree.parents.len < btree.all_tags.len { +			mut temp_array := btree.parents +			temp_array << 0 +			btree.parents = temp_array +		} +		btree.parents[btree.all_tags.len - 1] = btree.node_pointer +	} +	return btree.all_tags.len - 1 +} + +[inline] +fn (btree BTree) get_children() []int { +	return btree.childrens[btree.node_pointer] +} + +[inline] +fn (btree BTree) get_parent() int { +	return btree.parents[btree.node_pointer] +} + +[inline] +fn (btree BTree) get_stored() Tag { +	return btree.all_tags[btree.node_pointer] +} + +fn (mut btree BTree) move_pointer(to int) { +	if to < btree.all_tags.len { +		btree.node_pointer = to +	} +} diff --git a/v_windows/v/old/vlib/net/html/dom.v b/v_windows/v/old/vlib/net/html/dom.v new file mode 100644 index 0000000..f56e9c2 --- /dev/null +++ b/v_windows/v/old/vlib/net/html/dom.v @@ -0,0 +1,189 @@ +module html + +import os + +// The W3C Document Object Model (DOM) is a platform and language-neutral +// interface that allows programs and scripts to dynamically access and +// update the content, structure, and style of a document. 
+// +// https://www.w3.org/TR/WD-DOM/introduction.html +pub struct DocumentObjectModel { +mut: +	root           &Tag +	constructed    bool +	btree          BTree +	all_tags       []&Tag +	all_attributes map[string][]&Tag +	close_tags     map[string]bool // add a counter to see count how many times is closed and parse correctly +	attributes     map[string][]string +	tag_attributes map[string][][]&Tag +	tag_type       map[string][]&Tag +	debug_file     os.File +} + +[if debug] +fn (mut dom DocumentObjectModel) print_debug(data string) { +	$if debug { +		if data.len > 0 { +			dom.debug_file.writeln(data) or { panic(err) } +		} +	} +} + +[inline] +fn is_close_tag(tag &Tag) bool { +	return tag.name.len > 0 && tag.name[0] == `/` +} + +fn (mut dom DocumentObjectModel) where_is(item_name string, attribute_name string) int { +	if attribute_name !in dom.attributes { +		dom.attributes[attribute_name] = []string{} +	} +	mut string_array := dom.attributes[attribute_name] +	mut counter := 0 +	for value in string_array { +		if value == item_name { +			return counter +		} +		counter++ +	} +	string_array << item_name +	dom.attributes[attribute_name] = string_array +	return string_array.len - 1 +} + +fn (mut dom DocumentObjectModel) add_tag_attribute(tag &Tag) { +	for attribute_name, _ in tag.attributes { +		attribute_value := tag.attributes[attribute_name] +		location := dom.where_is(attribute_value, attribute_name) +		if attribute_name !in dom.tag_attributes { +			dom.tag_attributes[attribute_name] = [] +		} +		for { +			mut temp_array := dom.tag_attributes[attribute_name] +			temp_array << []&Tag{} +			dom.tag_attributes[attribute_name] = temp_array +			if location < dom.tag_attributes[attribute_name].len + 1 { +				break +			} +		} +		mut temp_array := dom.tag_attributes[attribute_name][location] +		temp_array << tag +		dom.tag_attributes[attribute_name][location] = temp_array +	} +} + +fn (mut dom DocumentObjectModel) add_tag_by_type(tag &Tag) { +	tag_name := tag.name +	if 
!(tag_name in dom.tag_type) { +		dom.tag_type[tag_name] = [tag] +	} else { +		mut temp_array := dom.tag_type[tag_name] +		temp_array << tag +		dom.tag_type[tag_name] = temp_array +	} +} + +fn (mut dom DocumentObjectModel) add_tag_by_attribute(tag &Tag) { +	for attribute_name in tag.attributes.keys() { +		if attribute_name !in dom.all_attributes { +			dom.all_attributes[attribute_name] = [tag] +		} else { +			mut temp_array := dom.all_attributes[attribute_name] +			temp_array << tag +			dom.all_attributes[attribute_name] = temp_array +		} +	} +} + +fn (mut dom DocumentObjectModel) construct(tag_list []&Tag) { +	dom.constructed = true +	mut temp_map := map[string]int{} +	mut temp_int := null_element +	mut temp_string := '' +	mut stack := Stack{} +	dom.btree = BTree{} +	dom.root = tag_list[0] +	dom.all_tags = [tag_list[0]] +	temp_map['0'] = dom.btree.add_children(tag_list[0]) +	stack.push(0) +	root_index := 0 +	for index := 1; index < tag_list.len; index++ { +		mut tag := tag_list[index] +		dom.print_debug(tag.str()) +		if is_close_tag(tag) { +			temp_int = stack.peek() +			temp_string = tag.name[1..] 
+			for !is_null(temp_int) && temp_string != tag_list[temp_int].name +				&& !tag_list[temp_int].closed { +				dom.print_debug(temp_string + ' >> ' + tag_list[temp_int].name + ' ' + +					(temp_string == tag_list[temp_int].name).str()) +				stack.pop() +				temp_int = stack.peek() +			} +			temp_int = stack.peek() +			temp_int = if !is_null(temp_int) { stack.pop() } else { root_index } +			if is_null(temp_int) { +				stack.push(root_index) +			} +			dom.print_debug('Removed ' + temp_string + ' -- ' + tag_list[temp_int].name) +		} else if tag.name.len > 0 { +			dom.add_tag_attribute(tag) // error here +			dom.add_tag_by_attribute(tag) +			dom.add_tag_by_type(tag) +			dom.all_tags << tag +			temp_int = stack.peek() +			if !is_null(temp_int) { +				dom.btree.move_pointer(temp_map[temp_int.str()]) +				temp_map[index.str()] = dom.btree.add_children(tag) +				mut temp_tag := tag_list[temp_int] +				position_in_parent := temp_tag.add_child(tag) // tag_list[temp_int] = temp_tag +				tag.add_parent(temp_tag, position_in_parent) +				/* +				dom.print_debug("Added ${tag.name} as child of '" + tag_list[temp_int].name + +					"' which now has ${dom.btree.get_children().len} childrens") +				*/ +				dom.print_debug("Added $tag.name as child of '" + temp_tag.name + +					"' which now has $temp_tag.children.len childrens") +			} else { // dom.new_root(tag) +				stack.push(root_index) +			} +			temp_string = '/' + tag.name +			if temp_string in dom.close_tags && !tag.closed { // if tag ends with /> +				dom.print_debug('Pushed ' + temp_string) +				stack.push(index) +			} +		} +	} // println(tag_list[root_index]) for debug purposes +	dom.root = tag_list[0] +} + +// get_tag_by_attribute_value retrieves all the tags in the document that has the given attribute name and value. 
+pub fn (mut dom DocumentObjectModel) get_tag_by_attribute_value(name string, value string) []&Tag { +	location := dom.where_is(value, name) +	return if dom.tag_attributes[name].len > location { +		dom.tag_attributes[name][location] +	} else { +		[]&Tag{} +	} +} + +// get_tag retrieves all the tags in the document that has the given tag name. +pub fn (dom DocumentObjectModel) get_tag(name string) []&Tag { +	return if name in dom.tag_type { dom.tag_type[name] } else { []&Tag{} } +} + +// get_tag_by_attribute retrieves all the tags in the document that has the given attribute name. +pub fn (dom DocumentObjectModel) get_tag_by_attribute(name string) []&Tag { +	return if name in dom.all_attributes { dom.all_attributes[name] } else { []&Tag{} } +} + +// get_root returns the root of the document. +pub fn (dom DocumentObjectModel) get_root() &Tag { +	return dom.root +} + +// get_tags returns all of the tags stored in the document. +pub fn (dom DocumentObjectModel) get_tags() []&Tag { +	return dom.all_tags +} diff --git a/v_windows/v/old/vlib/net/html/dom_test.v b/v_windows/v/old/vlib/net/html/dom_test.v new file mode 100644 index 0000000..d4fd292 --- /dev/null +++ b/v_windows/v/old/vlib/net/html/dom_test.v @@ -0,0 +1,56 @@ +module html + +import strings + +fn generate_temp_html() string { +	mut temp_html := strings.new_builder(200) +	temp_html.write_string('<!doctype html><html><head><title>Giant String</title></head><body>') +	for counter := 0; counter < 4; counter++ { +		temp_html.write_string("<div id='name_$counter' ") +		temp_html.write_string("class='several-$counter'>Look at $counter</div>") +	} +	temp_html.write_string('</body></html>') +	return temp_html.str() +} + +fn test_search_by_tag_type() { +	dom := parse(generate_temp_html()) +	assert dom.get_tag('div').len == 4 +	assert dom.get_tag('head').len == 1 +	assert dom.get_tag('body').len == 1 +} + +fn test_search_by_attribute_value() { +	mut dom := parse(generate_temp_html()) +	// println(temp_html) +	
print('Amount ') +	println(dom.get_tag_by_attribute_value('id', 'name_0')) +	assert dom.get_tag_by_attribute_value('id', 'name_0').len == 1 +} + +fn test_access_parent() { +	mut dom := parse(generate_temp_html()) +	div_tags := dom.get_tag('div') +	parent := div_tags[0].parent +	assert parent != 0 +	for div_tag in div_tags { +		assert div_tag.parent == parent +	} +} + +fn test_search_by_attributes() { +	dom := parse(generate_temp_html()) +	assert dom.get_tag_by_attribute('id').len == 4 +} + +fn test_tags_used() { +	dom := parse(generate_temp_html()) +	assert dom.get_tags().len == 9 +} + +fn test_access_tag_fields() { +	dom := parse(generate_temp_html()) +	id_tags := dom.get_tag_by_attribute('id') +	assert id_tags[0].name == 'div' +	assert id_tags[1].attributes['class'] == 'several-1' +} diff --git a/v_windows/v/old/vlib/net/html/html.v b/v_windows/v/old/vlib/net/html/html.v new file mode 100644 index 0000000..293b643 --- /dev/null +++ b/v_windows/v/old/vlib/net/html/html.v @@ -0,0 +1,18 @@ +module html + +import os + +// parse parses and returns the DOM from the given text. +pub fn parse(text string) DocumentObjectModel { +	mut parser := Parser{} +	parser.parse_html(text) +	return parser.get_dom() +} + +// parse_file parses and returns the DOM from the contents of a file. +pub fn parse_file(filename string) DocumentObjectModel { +	content := os.read_file(filename) or { return DocumentObjectModel{ +		root: &Tag{} +	} } +	return parse(content) +} diff --git a/v_windows/v/old/vlib/net/html/html_test.v b/v_windows/v/old/vlib/net/html/html_test.v new file mode 100644 index 0000000..51271cd --- /dev/null +++ b/v_windows/v/old/vlib/net/html/html_test.v @@ -0,0 +1,15 @@ +module html + +fn test_parse() { +	doc := parse('<html><body><h1 class="title">Hello world!</h1></body></html>') +	tags := doc.get_tag('h1') +	assert tags.len == 1 +	h1_tag := tags[0] // <h1>Hello world!</h1> +	assert h1_tag.name == 'h1' +	assert h1_tag.content == 'Hello world!' 
+	assert h1_tag.attributes.len == 2 +	// TODO: do not remove. Attributes must not have an empty attr. +	// assert h1_tag.attributes.len == 1 +	assert h1_tag.str() == '<h1 class="title" >Hello world!</h1>' +	// assert h1_tag.str() == '<h1 class="title">Hello world!</h1>' +} diff --git a/v_windows/v/old/vlib/net/html/parser.v b/v_windows/v/old/vlib/net/html/parser.v new file mode 100644 index 0000000..b9ad2a1 --- /dev/null +++ b/v_windows/v/old/vlib/net/html/parser.v @@ -0,0 +1,260 @@ +module html + +import os +import strings + +struct LexicalAttributes { +mut: +	current_tag      &Tag +	open_tag         bool +	open_code        bool +	open_string      int +	open_comment     bool +	is_attribute     bool +	opened_code_type string +	line_count       int +	lexeme_builder   strings.Builder = strings.new_builder(100) +	code_tags        map[string]bool = map{ +		'script': true +		'style':  true +	} +} + +// Parser is responsible for reading the HTML strings and converting them into a `DocumentObjectModel`. +pub struct Parser { +mut: +	dom                DocumentObjectModel +	lexical_attributes LexicalAttributes = LexicalAttributes{ +		current_tag: &Tag{} +	} +	filename    string = 'direct-parse' +	initialized bool +	tags        []&Tag +	debug_file  os.File +} + +// add_code_tag registers a tag name whose content the parser must not interpret as markup. +// For example, if you have an HTML or XML document with a custom tag, like `<script>`, +// calling `add_code_tag('script')` makes the parser skip over the content of every `script` tag, +// so the content is still kept, but its `>` or `<` characters will not confuse the parser. 
+pub fn (mut parser Parser) add_code_tag(name string) { +	if name.len <= 0 { +		return +	} +	parser.lexical_attributes.code_tags[name] = true +} + +[inline] +fn (parser Parser) builder_str() string { +	return parser.lexical_attributes.lexeme_builder.after(0) +} + +[if debug] +fn (mut parser Parser) print_debug(data string) { +	$if debug { +		if data.len > 0 { +			parser.debug_file.writeln(data) or { panic(err) } +		} +	} +} + +fn (mut parser Parser) verify_end_comment(remove bool) bool { +	lexeme := parser.builder_str() +	last := lexeme[lexeme.len - 1] +	penultimate := lexeme[lexeme.len - 2] +	is_end_comment := last == `-` && penultimate == `-` +	if is_end_comment && remove { +		parser.lexical_attributes.lexeme_builder.go_back(2) +	} +	return is_end_comment +} + +fn blank_string(data string) bool { +	mut count := 0 +	for chr in data { +		if chr == 9 || chr == 32 { +			count++ +		} +	} +	return count == data.len +} + +// init initializes the parser. +fn (mut parser Parser) init() { +	if parser.initialized { +		return +	} +	parser.dom = DocumentObjectModel{ +		debug_file: parser.debug_file +		root: &Tag{} +	} +	parser.add_code_tag('') +	parser.tags = []&Tag{} +	parser.dom.close_tags['/!document'] = true +	parser.lexical_attributes.current_tag = &Tag{} +	parser.initialized = true +} + +fn (mut parser Parser) generate_tag() { +	if parser.lexical_attributes.open_tag { +		return +	} +	if parser.lexical_attributes.current_tag.name.len > 0 +		|| parser.lexical_attributes.current_tag.content.len > 0 { +		parser.tags << parser.lexical_attributes.current_tag +	} +	parser.lexical_attributes.current_tag = &Tag{} +} + +// split_parse parses the HTML fragment +pub fn (mut parser Parser) split_parse(data string) { +	parser.init() +	for chr in data { +		// returns true if byte is a " or ' +		is_quote := chr == `"` || chr == `\'` +		string_code := match chr { +			`"` { 1 } // " +			`\'` { 2 } // ' +			else { 0 } +		} +		if parser.lexical_attributes.open_code { // here will verify 
all needed to know if open_code finishes and string in code +			parser.lexical_attributes.lexeme_builder.write_b(chr) +			if parser.lexical_attributes.open_string > 0 +				&& parser.lexical_attributes.open_string == string_code { +				parser.lexical_attributes.open_string = 0 +			} else if is_quote { +				parser.lexical_attributes.open_string = string_code +			} else if chr == `>` { // only execute verification if is a > // here will verify < to know if code tag is finished +				name_close_tag := '</$parser.lexical_attributes.opened_code_type>' +				if parser.builder_str().to_lower().ends_with(name_close_tag) { +					parser.lexical_attributes.open_code = false +					// need to modify lexeme_builder to add script text as a content in next loop (not gave error in dom) +					parser.lexical_attributes.lexeme_builder.go_back(name_close_tag.len) +					parser.lexical_attributes.current_tag.closed = true +					parser.lexical_attributes.current_tag.close_type = .new_tag +				} +			} +		} else if parser.lexical_attributes.open_comment { +			if chr == `>` && parser.verify_end_comment(false) { // close tag '>' +				// parser.print_debug(parser.builder_str() + " >> " + parser.lexical_attributes.line_count.str()) +				parser.lexical_attributes.lexeme_builder.go_back_to(0) +				parser.lexical_attributes.open_comment = false +				parser.lexical_attributes.open_tag = false +			} else { +				parser.lexical_attributes.lexeme_builder.write_b(chr) +			} +		} else if parser.lexical_attributes.open_string > 0 { +			if parser.lexical_attributes.open_string == string_code { +				parser.lexical_attributes.open_string = 0 +				parser.lexical_attributes.lexeme_builder.write_b(chr) +				temp_lexeme := parser.builder_str() +				if parser.lexical_attributes.current_tag.last_attribute != '' { +					lattr := parser.lexical_attributes.current_tag.last_attribute +					nval := temp_lexeme.substr(1, temp_lexeme.len - 1) +					// parser.print_debug(lattr + " = " + temp_lexeme) +					
parser.lexical_attributes.current_tag.attributes[lattr] = nval +					parser.lexical_attributes.current_tag.last_attribute = '' +				} else { +					parser.lexical_attributes.current_tag.attributes[temp_lexeme.to_lower()] = '' // parser.print_debug(temp_lexeme) +				} +				parser.lexical_attributes.lexeme_builder.go_back_to(0) +			} else { +				parser.lexical_attributes.lexeme_builder.write_b(chr) +			} +		} else if parser.lexical_attributes.open_tag { +			if parser.lexical_attributes.lexeme_builder.len == 0 && is_quote { +				parser.lexical_attributes.open_string = string_code +				parser.lexical_attributes.lexeme_builder.write_b(chr) +			} else if chr == `>` { // close tag > +				complete_lexeme := parser.builder_str().to_lower() +				parser.lexical_attributes.current_tag.closed = (complete_lexeme.len > 0 +					&& complete_lexeme[complete_lexeme.len - 1] == `/`) // if equals to / +				if complete_lexeme.len > 0 && complete_lexeme[0] == `/` { +					parser.dom.close_tags[complete_lexeme] = true +				} +				/* +				else if complete_lexeme.len > 0 && complete_lexeme[complete_lexeme.len - 1] == 47 { // if end tag like "/>" +					parser.lexical_attributes.current_tag.closed = true +				} +				*/ +				if parser.lexical_attributes.current_tag.name == '' { +					parser.lexical_attributes.current_tag.name = complete_lexeme +				} else if complete_lexeme != '/' { +					parser.lexical_attributes.current_tag.attributes[complete_lexeme] = '' +				} +				parser.lexical_attributes.open_tag = false +				parser.lexical_attributes.lexeme_builder.go_back_to(0) // if tag name is code +				if parser.lexical_attributes.current_tag.name in parser.lexical_attributes.code_tags { +					parser.lexical_attributes.open_code = true +					parser.lexical_attributes.opened_code_type = parser.lexical_attributes.current_tag.name +				} +				// parser.print_debug(parser.lexical_attributes.current_tag.name) +			} else if chr !in [byte(9), ` `, `=`, `\n`] { // Tab, space, = and \n +				
parser.lexical_attributes.lexeme_builder.write_b(chr) +			} else if chr != 10 { +				complete_lexeme := parser.builder_str().to_lower() +				if parser.lexical_attributes.current_tag.name == '' { +					parser.lexical_attributes.current_tag.name = complete_lexeme +				} else { +					parser.lexical_attributes.current_tag.attributes[complete_lexeme] = '' +					parser.lexical_attributes.current_tag.last_attribute = '' +					if chr == `=` { // if was a = +						parser.lexical_attributes.current_tag.last_attribute = complete_lexeme +					} +				} +				parser.lexical_attributes.lexeme_builder.go_back_to(0) +			} +			if parser.builder_str() == '!--' { +				parser.lexical_attributes.open_comment = true +			} +		} else if chr == `<` { // open tag '<' +			temp_string := parser.builder_str() +			if parser.lexical_attributes.lexeme_builder.len >= 1 { +				if parser.lexical_attributes.current_tag.name.len > 1 +					&& parser.lexical_attributes.current_tag.name[0] == 47 +					&& !blank_string(temp_string) { +					parser.tags << &Tag{ +						name: 'text' +						content: temp_string +					} +				} else { +					parser.lexical_attributes.current_tag.content = temp_string // verify later who has this content +				} +			} +			// parser.print_debug(parser.lexical_attributes.current_tag.str()) +			parser.lexical_attributes.lexeme_builder.go_back_to(0) +			parser.generate_tag() +			parser.lexical_attributes.open_tag = true +		} else { +			parser.lexical_attributes.lexeme_builder.write_b(chr) +		} +	} +} + +// parse_html parses the given HTML string +pub fn (mut parser Parser) parse_html(data string) { +	parser.init() +	mut lines := data.split_into_lines() +	for line in lines { +		parser.lexical_attributes.line_count++ +		parser.split_parse(line) +	} +	parser.generate_tag() +	parser.dom.debug_file = parser.debug_file +	parser.dom.construct(parser.tags) +} + +// finalize finishes the parsing stage . 
+[inline] +pub fn (mut parser Parser) finalize() { +	parser.generate_tag() +} + +// get_dom returns the parser's current DOM representation. +pub fn (mut parser Parser) get_dom() DocumentObjectModel { +	if !parser.dom.constructed { +		parser.generate_tag() +		parser.dom.construct(parser.tags) +	} +	return parser.dom +} diff --git a/v_windows/v/old/vlib/net/html/parser_test.v b/v_windows/v/old/vlib/net/html/parser_test.v new file mode 100644 index 0000000..274a47c --- /dev/null +++ b/v_windows/v/old/vlib/net/html/parser_test.v @@ -0,0 +1,41 @@ +module html + +import strings + +fn test_split_parse() { +	mut parser := Parser{} +	parser.init() +	parser.split_parse('<!doctype htm') +	parser.split_parse('l public') +	parser.split_parse('><html><he') +	parser.split_parse('ad><t') +	parser.split_parse('itle> Hum... ') +	parser.split_parse('A Tit') +	parser.split_parse('\nle</ti\ntle>') +	parser.split_parse('</\nhead><body>\t\t\t<h3>') +	parser.split_parse('Nice Test!</h3>') +	parser.split_parse('</bo\n\n\ndy></html>') +	parser.finalize() +	assert parser.tags.len == 11 +	assert parser.tags[3].content == ' Hum... 
A Tit\nle' +} + +fn test_giant_string() { +	mut temp_html := strings.new_builder(200) +	mut parser := Parser{} +	temp_html.write_string('<!doctype html><html><head><title>Giant String</title></head><body>') +	for counter := 0; counter < 2000; counter++ { +		temp_html.write_string("<div id='name_$counter' class='several-$counter'>Look at $counter</div>") +	} +	temp_html.write_string('</body></html>') +	parser.parse_html(temp_html.str()) +	assert parser.tags.len == 4009 +} + +fn test_script_tag() { +	mut parser := Parser{} +	script_content := "\nvar googletag = googletag || {};\ngoogletag.cmd = googletag.cmd || [];if(3 > 5) {console.log('Birl');}\n" +	temp_html := '<html><body><script>$script_content</script></body></html>' +	parser.parse_html(temp_html) +	assert parser.tags[2].content.len == script_content.replace('\n', '').len +} diff --git a/v_windows/v/old/vlib/net/html/tag.v b/v_windows/v/old/vlib/net/html/tag.v new file mode 100644 index 0000000..62260c0 --- /dev/null +++ b/v_windows/v/old/vlib/net/html/tag.v @@ -0,0 +1,68 @@ +module html + +import strings + +enum CloseTagType { +	in_name +	new_tag +} + +// Tag holds the information of an HTML tag. +[heap] +pub struct Tag { +pub mut: +	name               string +	content            string +	children           []&Tag +	attributes         map[string]string // attributes will be like map[name]value +	last_attribute     string +	parent             &Tag = 0 +	position_in_parent int +	closed             bool +	close_type         CloseTagType = .in_name +} + +fn (mut tag Tag) add_parent(t &Tag, position int) { +	tag.position_in_parent = position +	tag.parent = t +} + +fn (mut tag Tag) add_child(t &Tag) int { +	tag.children << t +	return tag.children.len +} + +// text returns the text contents of the tag. 
+pub fn (tag Tag) text() string { +	if tag.name.len >= 2 && tag.name[..2] == 'br' { +		return '\n' +	} +	mut text_str := strings.new_builder(200) +	text_str.write_string(tag.content.replace('\n', '')) +	for child in tag.children { +		text_str.write_string(child.text()) +	} +	return text_str.str() +} + +pub fn (tag &Tag) str() string { +	mut html_str := strings.new_builder(200) +	html_str.write_string('<$tag.name') +	for key, value in tag.attributes { +		html_str.write_string(' $key') +		if value.len > 0 { +			html_str.write_string('="$value"') +		} +	} +	html_str.write_string(if tag.closed && tag.close_type == .in_name { '/>' } else { '>' }) +	html_str.write_string(tag.content) +	if tag.children.len > 0 { +		for child in tag.children { +			html_str.write_string(child.str()) +		} +	} +	if !tag.closed || tag.close_type == .new_tag { +		html_str.write_string('</$tag.name>') +	} +	return html_str.str() +} | 
