From f5c4671bfbad96bf346bd7e9a21fc4317b4959df Mon Sep 17 00:00:00 2001
From: Indrajith K L
Date: Sat, 3 Dec 2022 17:00:20 +0530
Subject: Adds most of the tools

---
 v_windows/v/examples/web_crawler/README.md     | 22 ++++++++++++++++++++++
 v_windows/v/examples/web_crawler/web_crawler.v | 31 +++++++++++++++++++++++++++++++
 2 files changed, 53 insertions(+)
 create mode 100644 v_windows/v/examples/web_crawler/README.md
 create mode 100644 v_windows/v/examples/web_crawler/web_crawler.v

(limited to 'v_windows/v/examples/web_crawler')

diff --git a/v_windows/v/examples/web_crawler/README.md b/v_windows/v/examples/web_crawler/README.md
new file mode 100644
index 0000000..c8a741f
--- /dev/null
+++ b/v_windows/v/examples/web_crawler/README.md
@@ -0,0 +1,22 @@
+# web_crawler
+web_crawler is a very simple web crawler.
+This web crawler fetches news from tuicool.com,
+(a Chinese site similar to hacker-news.firebaseio.com).
+
+# Compile and Run
+
+Use this to generate an executable, and then launch the web crawler:
+```bash
+v web_crawler.v
+./web_crawler
+```
+
+And this to compile and launch the web crawler directly:
+```bash
+v run web_crawler.v
+```
+
+This project shows how to use http.fetch() to get http.Response,
+and then html.parse() to parse the returned HTML.
+
+It's easy, isn't it?
diff --git a/v_windows/v/examples/web_crawler/web_crawler.v b/v_windows/v/examples/web_crawler/web_crawler.v
new file mode 100644
index 0000000..00d4dfa
--- /dev/null
+++ b/v_windows/v/examples/web_crawler/web_crawler.v
@@ -0,0 +1,31 @@
+import net.http
+import net.html
+
+// main fetches https://tuicool.com, parses the response as HTML, and prints the href and
+// title of the first child of each tag whose class is 'list_article_item'.
+fn main() {
+	// http.fetch() sends an HTTP request to the URL with the given method and configurations.
+	config := http.FetchConfig{
+		user_agent: 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0'
+	}
+	resp := http.fetch(http.FetchConfig{ ...config, url: 'https://tuicool.com' }) or {
+		// Report the failure on stderr with the underlying error and exit with a non-zero status.
+		eprintln('failed to fetch data from the server: $err')
+		exit(1)
+	}
+	// html.parse() parses and returns the DOM from the given text.
+	mut doc := html.parse(resp.text)
+	// html.DocumentObjectModel.get_tag_by_attribute_value() retrieves all the tags in the document that have the given attribute name and value.
+	tags := doc.get_tag_by_attribute_value('class', 'list_article_item')
+	for tag in tags {
+		// Skip tags without children: indexing children[0] on an empty array would abort the program.
+		if tag.children.len == 0 {
+			continue
+		}
+		href := tag.children[0].attributes['href'] or { panic('key not found') }
+		title := tag.children[0].attributes['title'] or { panic('key not found') }
+		println('href: $href')
+		println('title: $title')
+		println('')
+	}
+}
-- 
cgit v1.2.3