From f5c4671bfbad96bf346bd7e9a21fc4317b4959df Mon Sep 17 00:00:00 2001 From: Indrajith K L Date: Sat, 3 Dec 2022 17:00:20 +0530 Subject: Adds most of the tools --- v_windows/v/vlib/net/html/README.md | 16 ++ v_windows/v/vlib/net/html/data_structures.v | 91 ++++++++++ v_windows/v/vlib/net/html/dom.v | 189 ++++++++++++++++++++ v_windows/v/vlib/net/html/dom_test.v | 56 ++++++ v_windows/v/vlib/net/html/html.v | 18 ++ v_windows/v/vlib/net/html/html_test.v | 15 ++ v_windows/v/vlib/net/html/parser.v | 260 ++++++++++++++++++++++++++++ v_windows/v/vlib/net/html/parser_test.v | 41 +++++ v_windows/v/vlib/net/html/tag.v | 68 ++++++++ 9 files changed, 754 insertions(+) create mode 100644 v_windows/v/vlib/net/html/README.md create mode 100644 v_windows/v/vlib/net/html/data_structures.v create mode 100644 v_windows/v/vlib/net/html/dom.v create mode 100644 v_windows/v/vlib/net/html/dom_test.v create mode 100644 v_windows/v/vlib/net/html/html.v create mode 100644 v_windows/v/vlib/net/html/html_test.v create mode 100644 v_windows/v/vlib/net/html/parser.v create mode 100644 v_windows/v/vlib/net/html/parser_test.v create mode 100644 v_windows/v/vlib/net/html/tag.v (limited to 'v_windows/v/vlib/net/html') diff --git a/v_windows/v/vlib/net/html/README.md b/v_windows/v/vlib/net/html/README.md new file mode 100644 index 0000000..a92a6e6 --- /dev/null +++ b/v_windows/v/vlib/net/html/README.md @@ -0,0 +1,16 @@ +net/http is an HTML written in pure V. + +## Usage +```v oksyntax +import net.html + +fn main() { + doc := html.parse('

Hello world!

') + tag := doc.get_tag('h1')[0] //

Hello world!

+ println(tag.name) // h1 + println(tag.content) // Hello world! + println(tag.attributes) // {'class':'title'} + println(tag.str()) //

Hello world!

+} +``` +More examples found on [`parser_test.v`](parser_test.v) and [`html_test.v`](html_test.v) diff --git a/v_windows/v/vlib/net/html/data_structures.v b/v_windows/v/vlib/net/html/data_structures.v new file mode 100644 index 0000000..688b756 --- /dev/null +++ b/v_windows/v/vlib/net/html/data_structures.v @@ -0,0 +1,91 @@ +module html + +const ( + null_element = int(0x80000000) +) + +struct Stack { +mut: + elements []int + size int +} + +[inline] +fn is_null(data int) bool { + return data == html.null_element +} + +[inline] +fn (stack Stack) is_empty() bool { + return stack.size <= 0 +} + +fn (stack Stack) peek() int { + return if !stack.is_empty() { stack.elements[stack.size - 1] } else { html.null_element } +} + +fn (mut stack Stack) pop() int { + mut to_return := html.null_element + if !stack.is_empty() { + to_return = stack.elements[stack.size - 1] + stack.size-- + } + return to_return +} + +fn (mut stack Stack) push(item int) { + if stack.elements.len > stack.size { + stack.elements[stack.size] = item + } else { + stack.elements << item + } + stack.size++ +} + +struct BTree { +mut: + all_tags []Tag + node_pointer int + childrens [][]int + parents []int +} + +fn (mut btree BTree) add_children(tag Tag) int { + btree.all_tags << tag + if btree.all_tags.len > 1 { + for btree.childrens.len <= btree.node_pointer { + mut temp_array := btree.childrens + temp_array << []int{} + btree.childrens = temp_array + } + btree.childrens[btree.node_pointer] << btree.all_tags.len - 1 + for btree.parents.len < btree.all_tags.len { + mut temp_array := btree.parents + temp_array << 0 + btree.parents = temp_array + } + btree.parents[btree.all_tags.len - 1] = btree.node_pointer + } + return btree.all_tags.len - 1 +} + +[inline] +fn (btree BTree) get_children() []int { + return btree.childrens[btree.node_pointer] +} + +[inline] +fn (btree BTree) get_parent() int { + return btree.parents[btree.node_pointer] +} + +[inline] +fn (btree BTree) get_stored() Tag { + return btree.all_tags[btree.node_pointer] +} + +fn (mut btree BTree) move_pointer(to int) { + if to < btree.all_tags.len { + btree.node_pointer = to + } +} diff --git a/v_windows/v/vlib/net/html/dom.v b/v_windows/v/vlib/net/html/dom.v new file mode 100644 index 0000000..b145ddc --- /dev/null +++ b/v_windows/v/vlib/net/html/dom.v @@ -0,0 +1,189 @@ +module html + +import os + +// The W3C Document Object Model (DOM) is a platform and language-neutral +// interface that allows programs and scripts to dynamically access and +// update the content, structure, and style of a document. +// +// https://www.w3.org/TR/WD-DOM/introduction.html +pub struct DocumentObjectModel { +mut: + root &Tag + constructed bool + btree BTree + all_tags []&Tag + all_attributes map[string][]&Tag + close_tags map[string]bool // add a counter to see count how many times is closed and parse correctly + attributes map[string][]string + tag_attributes map[string][][]&Tag + tag_type map[string][]&Tag + debug_file os.File +} + +[if debug] +fn (mut dom DocumentObjectModel) print_debug(data string) { + $if debug { + if data.len > 0 { + dom.debug_file.writeln(data) or { panic(err) } + } + } +} + +[inline] +fn is_close_tag(tag &Tag) bool { + return tag.name.len > 0 && tag.name[0] == `/` +} + +fn (mut dom DocumentObjectModel) where_is(item_name string, attribute_name string) int { + if attribute_name !in dom.attributes { + dom.attributes[attribute_name] = []string{} + } + mut string_array := dom.attributes[attribute_name] + mut counter := 0 + for value in string_array { + if value == item_name { + return counter + } + counter++ + } + string_array << item_name + dom.attributes[attribute_name] = string_array + return string_array.len - 1 +} + +fn (mut dom DocumentObjectModel) add_tag_attribute(tag &Tag) { + for attribute_name, _ in tag.attributes { + attribute_value := tag.attributes[attribute_name] + location := dom.where_is(attribute_value, attribute_name) + if attribute_name !in dom.tag_attributes { + dom.tag_attributes[attribute_name] = [] + } + for { + mut temp_array := dom.tag_attributes[attribute_name] + temp_array << []&Tag{} + dom.tag_attributes[attribute_name] = temp_array + if location < dom.tag_attributes[attribute_name].len + 1 { + break + } + } + mut temp_array := dom.tag_attributes[attribute_name][location] + temp_array << tag + dom.tag_attributes[attribute_name][location] = temp_array + } +} + +fn (mut dom DocumentObjectModel) add_tag_by_type(tag &Tag) { + tag_name := tag.name + if tag_name !in dom.tag_type { + dom.tag_type[tag_name] = [tag] + } else { + mut temp_array := dom.tag_type[tag_name] + temp_array << tag + dom.tag_type[tag_name] = temp_array + } +} + +fn (mut dom DocumentObjectModel) add_tag_by_attribute(tag &Tag) { + for attribute_name in tag.attributes.keys() { + if attribute_name !in dom.all_attributes { + dom.all_attributes[attribute_name] = [tag] + } else { + mut temp_array := dom.all_attributes[attribute_name] + temp_array << tag + dom.all_attributes[attribute_name] = temp_array + } + } +} + +fn (mut dom DocumentObjectModel) construct(tag_list []&Tag) { + dom.constructed = true + mut temp_map := map[string]int{} + mut temp_int := null_element + mut temp_string := '' + mut stack := Stack{} + dom.btree = BTree{} + dom.root = tag_list[0] + dom.all_tags = [tag_list[0]] + temp_map['0'] = dom.btree.add_children(tag_list[0]) + stack.push(0) + root_index := 0 + for index := 1; index < tag_list.len; index++ { + mut tag := tag_list[index] + dom.print_debug(tag.str()) + if is_close_tag(tag) { + temp_int = stack.peek() + temp_string = tag.name[1..] + for !is_null(temp_int) && temp_string != tag_list[temp_int].name + && !tag_list[temp_int].closed { + dom.print_debug(temp_string + ' >> ' + tag_list[temp_int].name + ' ' + + (temp_string == tag_list[temp_int].name).str()) + stack.pop() + temp_int = stack.peek() + } + temp_int = stack.peek() + temp_int = if !is_null(temp_int) { stack.pop() } else { root_index } + if is_null(temp_int) { + stack.push(root_index) + } + dom.print_debug('Removed ' + temp_string + ' -- ' + tag_list[temp_int].name) + } else if tag.name.len > 0 { + dom.add_tag_attribute(tag) // error here + dom.add_tag_by_attribute(tag) + dom.add_tag_by_type(tag) + dom.all_tags << tag + temp_int = stack.peek() + if !is_null(temp_int) { + dom.btree.move_pointer(temp_map[temp_int.str()]) + temp_map[index.str()] = dom.btree.add_children(tag) + mut temp_tag := tag_list[temp_int] + position_in_parent := temp_tag.add_child(tag) // tag_list[temp_int] = temp_tag + tag.add_parent(temp_tag, position_in_parent) + /* + dom.print_debug("Added ${tag.name} as child of '" + tag_list[temp_int].name + + "' which now has ${dom.btree.get_children().len} childrens") + */ + dom.print_debug("Added $tag.name as child of '" + temp_tag.name + + "' which now has $temp_tag.children.len childrens") + } else { // dom.new_root(tag) + stack.push(root_index) + } + temp_string = '/' + tag.name + if temp_string in dom.close_tags && !tag.closed { // if tag ends with /> + dom.print_debug('Pushed ' + temp_string) + stack.push(index) + } + } + } // println(tag_list[root_index]) for debug purposes + dom.root = tag_list[0] +} + +// get_tag_by_attribute_value retrieves all the tags in the document that has the given attribute name and value. +pub fn (mut dom DocumentObjectModel) get_tag_by_attribute_value(name string, value string) []&Tag { + location := dom.where_is(value, name) + return if dom.tag_attributes[name].len > location { + dom.tag_attributes[name][location] + } else { + []&Tag{} + } +} + +// get_tag retrieves all the tags in the document that has the given tag name. +pub fn (dom DocumentObjectModel) get_tag(name string) []&Tag { + return if name in dom.tag_type { dom.tag_type[name] } else { []&Tag{} } +} + +// get_tag_by_attribute retrieves all the tags in the document that has the given attribute name. +pub fn (dom DocumentObjectModel) get_tag_by_attribute(name string) []&Tag { + return if name in dom.all_attributes { dom.all_attributes[name] } else { []&Tag{} } +} + +// get_root returns the root of the document. +pub fn (dom DocumentObjectModel) get_root() &Tag { + return dom.root +} + +// get_tags returns all of the tags stored in the document. +pub fn (dom DocumentObjectModel) get_tags() []&Tag { + return dom.all_tags +} diff --git a/v_windows/v/vlib/net/html/dom_test.v b/v_windows/v/vlib/net/html/dom_test.v new file mode 100644 index 0000000..d4fd292 --- /dev/null +++ b/v_windows/v/vlib/net/html/dom_test.v @@ -0,0 +1,56 @@ +module html + +import strings + +fn generate_temp_html() string { + mut temp_html := strings.new_builder(200) + temp_html.write_string('Giant String') + for counter := 0; counter < 4; counter++ { + temp_html.write_string("

Look at $counter

") + } + temp_html.write_string('') + return temp_html.str() +} + +fn test_search_by_tag_type() { + dom := parse(generate_temp_html()) + assert dom.get_tag('div').len == 4 + assert dom.get_tag('head').len == 1 + assert dom.get_tag('body').len == 1 +} + +fn test_search_by_attribute_value() { + mut dom := parse(generate_temp_html()) + // println(temp_html) + print('Amount ') + println(dom.get_tag_by_attribute_value('id', 'name_0')) + assert dom.get_tag_by_attribute_value('id', 'name_0').len == 1 +} + +fn test_access_parent() { + mut dom := parse(generate_temp_html()) + div_tags := dom.get_tag('div') + parent := div_tags[0].parent + assert parent != 0 + for div_tag in div_tags { + assert div_tag.parent == parent + } +} + +fn test_search_by_attributes() { + dom := parse(generate_temp_html()) + assert dom.get_tag_by_attribute('id').len == 4 +} + +fn test_tags_used() { + dom := parse(generate_temp_html()) + assert dom.get_tags().len == 9 +} + +fn test_access_tag_fields() { + dom := parse(generate_temp_html()) + id_tags := dom.get_tag_by_attribute('id') + assert id_tags[0].name == 'div' + assert id_tags[1].attributes['class'] == 'several-1' +} diff --git a/v_windows/v/vlib/net/html/html.v b/v_windows/v/vlib/net/html/html.v new file mode 100644 index 0000000..293b643 --- /dev/null +++ b/v_windows/v/vlib/net/html/html.v @@ -0,0 +1,18 @@ +module html + +import os + +// parse parses and returns the DOM from the given text. +pub fn parse(text string) DocumentObjectModel { + mut parser := Parser{} + parser.parse_html(text) + return parser.get_dom() +} + +// parse_file parses and returns the DOM from the contents of a file. +pub fn parse_file(filename string) DocumentObjectModel { + content := os.read_file(filename) or { return DocumentObjectModel{ + root: &Tag{} + } } + return parse(content) +} diff --git a/v_windows/v/vlib/net/html/html_test.v b/v_windows/v/vlib/net/html/html_test.v new file mode 100644 index 0000000..51271cd --- /dev/null +++ b/v_windows/v/vlib/net/html/html_test.v @@ -0,0 +1,15 @@ +module html + +fn test_parse() { + doc := parse('

Hello world!

') + tags := doc.get_tag('h1') + assert tags.len == 1 + h1_tag := tags[0] //

Hello world!

+ assert h1_tag.name == 'h1' + assert h1_tag.content == 'Hello world!' + assert h1_tag.attributes.len == 2 + // TODO: do not remove. Attributes must not have an empty attr. + // assert h1_tag.attributes.len == 1 + assert h1_tag.str() == '

Hello world!

' + // assert h1_tag.str() == '

Hello world!

' +} diff --git a/v_windows/v/vlib/net/html/parser.v b/v_windows/v/vlib/net/html/parser.v new file mode 100644 index 0000000..5b9bbd1 --- /dev/null +++ b/v_windows/v/vlib/net/html/parser.v @@ -0,0 +1,260 @@ +module html + +import os +import strings + +struct LexicalAttributes { +mut: + current_tag &Tag + open_tag bool + open_code bool + open_string int + open_comment bool + is_attribute bool + opened_code_type string + line_count int + lexeme_builder strings.Builder = strings.new_builder(100) + code_tags map[string]bool = { + 'script': true + 'style': true + } +} + +// Parser is responsible for reading the HTML strings and converting them into a `DocumentObjectModel`. +pub struct Parser { +mut: + dom DocumentObjectModel + lexical_attributes LexicalAttributes = LexicalAttributes{ + current_tag: &Tag{} + } + filename string = 'direct-parse' + initialized bool + tags []&Tag + debug_file os.File +} + +// This function is used to add a tag for the parser ignore it's content. +// For example, if you have an html or XML with a custom tag, like `' + parser.parse_html(temp_html) + assert parser.tags[2].content.len == script_content.replace('\n', '').len +} diff --git a/v_windows/v/vlib/net/html/tag.v b/v_windows/v/vlib/net/html/tag.v new file mode 100644 index 0000000..62260c0 --- /dev/null +++ b/v_windows/v/vlib/net/html/tag.v @@ -0,0 +1,68 @@ +module html + +import strings + +enum CloseTagType { + in_name + new_tag +} + +// Tag holds the information of an HTML tag. +[heap] +pub struct Tag { +pub mut: + name string + content string + children []&Tag + attributes map[string]string // attributes will be like map[name]value + last_attribute string + parent &Tag = 0 + position_in_parent int + closed bool + close_type CloseTagType = .in_name +} + +fn (mut tag Tag) add_parent(t &Tag, position int) { + tag.position_in_parent = position + tag.parent = t +} + +fn (mut tag Tag) add_child(t &Tag) int { + tag.children << t + return tag.children.len +} + +// text returns the text contents of the tag. +pub fn (tag Tag) text() string { + if tag.name.len >= 2 && tag.name[..2] == 'br' { + return '\n' + } + mut text_str := strings.new_builder(200) + text_str.write_string(tag.content.replace('\n', '')) + for child in tag.children { + text_str.write_string(child.text()) + } + return text_str.str() +} + +pub fn (tag &Tag) str() string { + mut html_str := strings.new_builder(200) + html_str.write_string('<$tag.name') + for key, value in tag.attributes { + html_str.write_string(' $key') + if value.len > 0 { + html_str.write_string('="$value"') + } + } + html_str.write_string(if tag.closed && tag.close_type == .in_name { '/>' } else { '>' }) + html_str.write_string(tag.content) + if tag.children.len > 0 { + for child in tag.children { + html_str.write_string(child.str()) + } + } + if !tag.closed || tag.close_type == .new_tag { + html_str.write_string('') + } + return html_str.str() +} -- cgit v1.2.3