diff --git a/blog/Writing-my-own-web-crawler-in-go/index.html b/blog/Writing-my-own-web-crawler-in-go/index.html new file mode 100644 index 0000000..779333b --- /dev/null +++ b/blog/Writing-my-own-web-crawler-in-go/index.html @@ -0,0 +1,332 @@ + + +
I got bored. It happens to everyone (especially software developers), so like every software developer I've started a new side project: a web crawler. This isn't for any actual use case; I kinda just want to learn how to use Go and (hopefully) sharpen my SQL skills in the process. At this point in the writing process I've only just started, so let me show you what I've currently got working.
To start, I'm just walking through the hrefs of all <a> tags on a site and printing them out. On my site that looks like this:
++/ +mailto:me@zacharyscheiman.com +https://github.com/squibid +https://codeberg.org/squibid +https://git.squi.bid/squibid/wiz +https://git.squi.bid/squibid +/blog/rss.xml +/blog/New-Keyboard! +/blog/Serializing-data-in-C +/blog/Why-"suckless"-software-is-important +/blog/What-is-a-squibid +/blog/librex-and-dots +/?all_blog +https://lunarflame.dev +https://eggbert.xyz/ ++
+ Here's the code which fetched this list: +
++package main + +import ( + "fmt" + "io" + "net/http" + "golang.org/x/net/html" +) + +func deal_html(site string, reader io.Reader) { + doc, err := html.Parse(reader) + if err != nil { + fmt.Println("Error parsing HTML:", err) + return + } + + var walk func(n *html.Node) + walk = func(n *html.Node) { + if n.Type == html.ElementNode && n.Data == "a" { + for _, attr := range n.Attr { + if attr.Key == "href" { + fmt.Println(attr.Val) + } + } + } + for c := n.FirstChild; c != nil; c = c.NextSibling { + walk(c) + } + } + walk(doc) +} + +func main() { + site := "https://squi.bid/" + fmt.Println("fetching " + site) + resp, err := http.Get(site) + if err != nil { + fmt.Println("Error getting the website") + return + } + + defer resp.Body.Close() + deal_html(site, resp.Body) +} ++
After taking a short look at this output, we clearly need to handle a few different "url formats", like the mailto: and / links. For right now I'm going to respect people's privacy and not index their email addresses.
++if len(attr.Val) > 1 && attr.Val[:2] == "//" { + fmt.Println("https:" + attr.Val) +} else if attr.Val[:1] == "/" { + if site[len(site) - 1:] == "/" { + fmt.Println(site + attr.Val[1:]) + } else { + fmt.Println(site + attr.Val) + } +} else if len(attr.Val) > 4 && attr.Val[:4] == "http" { + fmt.Println(attr.Val) +} ++
Now that we've gotten the actual links from the website, it's time to store them and grab the links from their sites too. Because this is just a toy project, I've decided not to store all the info I would if this were a real project. Instead I'll only store the link to the site and a boolean representing whether I've fetched its contents yet. So, let's go impl...
Step #1 is to find a sql library. I just went with Go's built-in database/sql package, then visited golang.org/s/sqldrivers and decided on github.com/mattn/go-sqlite3, because sqlite is a name I'm familiar with and I really don't want to go through the hassle of evaluating different dbs for a toy project. [1]
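One detail that isn't visible in any of the snippets below (so treat this as a sketch of my setup rather than a quote from the repo): database/sql only knows about drivers that register themselves, so the sqlite driver gets imported purely for its side effect, and the db handle lives in a package level variable so the helper functions can reach it:

import (
	"database/sql"

	_ "github.com/mattn/go-sqlite3" // registers itself under the driver name "sqlite3"
)

// package level handle used by the db helpers below
var db *sql.DB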
Now that we've chosen our db, I'll set up our table like I mentioned earlier, with one string and one boolean:
++db, err = sql.Open("sqlite3", "./sites.db") +if err != nil { + log.Fatal(err) +} +defer db.Close() +db.Exec(` + create table if not exists + urls (url text not null primary key, indexed boolean not null); + `) ++
Now we need to start adding entries to the db. To make sure I wouldn't end up shooting myself in the foot, I decided to go with a small tiny function to make it a teensy tiny bit safer:
++func db_insert_url(url string, seen bool) { + db.Exec(`insert into urls values (?, ?)`, url, seen) +} ++
It could use some error handling, but if you look at section 3 part 4 of the software engineer's manual, it reads "side projects aren't stable, because if it's stable it's not a side project".
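For the record, the more responsible version would look something like this (a hypothetical variant, not what's in the repo): actually look at the error from Exec, and use "insert or ignore" so urls we've already seen don't trip over the primary key:

// hypothetical, more careful variant of db_insert_url
func db_insert_url(url string, seen bool) error {
	_, err := db.Exec(`insert or ignore into urls values (?, ?)`, url, seen)
	if err != nil {
		log.Println("failed to insert", url, err)
	}
	return err
}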
Now that we've gotten all the easy stuff out of the way, it's time to work on making this run forever, or close to it at least. For now I'm going to keep this project in an inefficient state: no worker pools or anything fancy like that. To get started we first need to make a decision: depth first or breadth first searching? In case you're not sure what I mean by that, let me give you an example:
Let's say we have a site, example-a.com, which contains the following links:

example-a.com/blog
example-b.com
example-c.com
With a breadth first search we would first go to either example-b or example-c, whereas with a depth first search we would go to example-a.com/blog. For my use case I want to find as many sites as possible, so I'll be targeting links with other base urls first.
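In code, "other base urls" really just means "a different host", which net/url can check for us. Here's a hypothetical helper just to illustrate the idea (the loop below settles for a cruder strings.Contains check instead):

import "net/url"

// prefersNewSite reports whether candidate points at a different host than
// the page we're currently on. hypothetical helper, only here to illustrate
// the breadth-first idea, not part of the crawler.
func prefersNewSite(current, candidate string) bool {
	cur, err := url.Parse(current)
	if err != nil {
		return false
	}
	cand, err := url.Parse(candidate)
	if err != nil {
		return false
	}
	return cand.Host != "" && cand.Host != cur.Host
}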
Now that we know how we want to pick the next url to fetch, let's implement the loop which handles this.
++for i := 0;; i++ { + if i > 0 { + rows, err := db.Query(`select url from urls where indexed is false`) + if err != nil { + return + } + + for rows.Next() { + var test string + rows.Scan(&test) + site = test + /* we can't just check if the site is the same because then when we're + * checking squi.bid/example it won't register squi.bid as the same + * domain, although maybe that's what we want. + */ + if !strings.Contains(test, site) { + break + } + } + rows.Close() + } + + fmt.Println("fetching " + site) + resp, err := http.Get(site) + if err != nil { + fmt.Println("Error getting", site) + os.Exit(1) + } + + deal_html(site, resp.Body) + + resp.Body.Close() +} ++
If you read through my code you might've seen the comment about how our check doesn't actually prevent accessing the same site. The solution I'm currently thinking of is to add a column to the db which keeps the highest point of the site; for example, squi.bid/example/1/2/3/4 would have a highest point of squi.bid. But this isn't something I'm too concerned about right now, so for now we'll leave it as is and deal with another issue you might've spotted.
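If I do end up adding that column, the "highest point" is really just the scheme and host of the url, which the standard library can pull apart for us. A rough sketch (hypothetical helper, not in the crawler yet):

import "net/url"

// highestPoint returns the scheme + host of a link, e.g.
// "https://squi.bid/example/1/2/3/4" -> "https://squi.bid".
// hypothetical helper, not part of the crawler (yet).
func highestPoint(link string) (string, error) {
	u, err := url.Parse(link)
	if err != nil {
		return "", err
	}
	return u.Scheme + "://" + u.Host, nil
}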
We never modify the db: after fetching a site successfully, at no point do we actually record that we fetched it. So whenever we go to fetch a new site, the program queries the db and finds the same rows as before. Thankfully this is a simple fix; it just takes adding this line right after where we index a new site:
++db.Exec(`update urls set indexed = true where url == ?`, site) ++
Remember when I referenced section 3 part 4 of the software engineer's manual? Well, I regret it:
++fetching https://squi.bid/ +fetching https://github.com/EggbertFluffle/signup?ref_cta=Sign+up&ref_loc=header+logged+out&ref_page=%2F%3Cuser-name%3E&source=header +panic: runtime error: slice bounds out of range [:1] with length 0 + +goroutine 1 [running]: +main.deal_html.func1(0xc000315b90) + /home/squibid/documents/coding/go/scraper/main.go:40 +0x2fe +main.deal_html.func1(0x0?) + /home/squibid/documents/coding/go/scraper/main.go:55 +0x83 +main.deal_html.func1(0xc0002dd110?) + /home/squibid/documents/coding/go/scraper/main.go:55 +0x83 +main.deal_html.func1(0x0?) + /home/squibid/documents/coding/go/scraper/main.go:55 +0x83 +main.deal_html.func1(0xc00032e000?) + /home/squibid/documents/coding/go/scraper/main.go:55 +0x83 +main.deal_html.func1(0x0?) + /home/squibid/documents/coding/go/scraper/main.go:55 +0x83 +main.deal_html.func1(0x8c36e0?) + /home/squibid/documents/coding/go/scraper/main.go:55 +0x83 +main.deal_html.func1(0x7fd180fd9db8?) + /home/squibid/documents/coding/go/scraper/main.go:55 +0x83 +main.deal_html({0xc00002a280, 0x7c}, {0x7fd180fd9db8?, 0xc0004bd200?}) + /home/squibid/documents/coding/go/scraper/main.go:58 +0x11e +main.main() + /home/squibid/documents/coding/go/scraper/main.go:105 +0x12d +exit status 2 ++
Turns out we need some stability if we want to actually use the code. Like most bugs this one is a simple fix: guard the url handler with a length check so we don't do anything stupid on empty strings.
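Concretely, that just means bailing out before any slicing happens; roughly this shape (paraphrasing the change rather than quoting the repo):

// still inside the loop over n.Attr, right where the hrefs get handled
if len(attr.Val) == 0 {
	continue // empty href: nothing to slice, skip it
}
if len(attr.Val) > 1 && attr.Val[:2] == "//" {
	fmt.Println("https:" + attr.Val)
} else if attr.Val[:1] == "/" {
	if site[len(site) - 1:] == "/" {
		fmt.Println(site + attr.Val[1:])
	} else {
		fmt.Println(site + attr.Val)
	}
} else if len(attr.Val) > 4 && attr.Val[:4] == "http" {
	fmt.Println(attr.Val)
}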
And now it works! Well, with one small exception; here's a clean run showing my lil web crawler doing its thing... and failing pretty fast.
++fetching https://squi.bid/ +fetching https://eggbert.xyz/ +fetching https://www.linkedin.com/in/harrison-diambrosio-505443229/ +fetching https://github.com/EggbertFluffle/ +fetching https://support.github.com?tags=dotcom-footer +fetching https://docs.github.com/ +fetching https://services.github.com +fetching https://github.com/github +fetching https://github.com/github/search?q=topic%3Aactions+org%3Agithub+fork%3Atrue&type=repositories +fetching https://github.com/github/search?q=topic%3Aactions+org%3Agithub+fork%3Atrue&type=repositories/git-guides +fetching https://github.com/github/search?q=topic%3Aactions+org%3Agithub+fork%3Atrue&type=repositories/git-guides/git-guides +fetching https://github.com/github/search?q=topic%3Aactions+org%3Agithub+fork%3Atrue&type=repositories/git-guides/git-guides/git-guides +fetching https://github.com/github/search?q=topic%3Aactions+org%3Agithub+fork%3Atrue&type=repositories/git-guides/git-guides/git-guides/git-guides +fetching https://github.com/github/search?q=topic%3Aactions+org%3Agithub+fork%3Atrue&type=repositories/git-guides/git-guides/git-guides/git-guides/git-guides +fetching https://github.com/github/search?q=topic%3Aactions+org%3Agithub+fork%3Atrue&type=repositories/git-guides/git-guides/git-guides/git-guides/git-guides/git-guides +fetching https://github.com/github/search?q=topic%3Aactions+org%3Agithub+fork%3Atrue&type=repositories/git-guides/git-guides/git-guides/git-guides/git-guides/git-guides/git-guides +... ++
I'm sure you can use your imagination to figure out how long that was going to go on for. Switching which site we search would partially fix this bug, but ultimately we won't make it far if we keep gluing relative links onto ever-growing urls like that. For now that's fine though, and I have a semi-working web crawler. All code can be found here: git.squi.bid/squibid/web-crawler. Thank you for reading, I'll probably write a followup when I find some time; proper link resolution (sketched below) is the obvious first thing to tackle.
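By proper link resolution I mean letting the standard library join an href against the page it was found on, instead of concatenating strings. A hypothetical sketch, not something the crawler does yet:

import "net/url"

// resolve joins an href against the page it appeared on, the way a browser
// would. hypothetical helper, not part of the crawler (yet).
func resolve(page, href string) (string, error) {
	base, err := url.Parse(page)
	if err != nil {
		return "", err
	}
	ref, err := url.Parse(href)
	if err != nil {
		return "", err
	}
	return base.ResolveReference(ref).String(), nil
}

With that, a stray /git-guides href on a github search page resolves to https://github.com/git-guides instead of getting appended to the search url over and over.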
[1] Yes, I'm just including more links to make my site a good starting point, why do you ask?