From 53635476374e0071410415b4a34623a21315dabb Mon Sep 17 00:00:00 2001
From: Squibid
Date: Sat, 23 Aug 2025 20:28:44 -0400
Subject: [PATCH] new blogpost

---
 .../index.html | 332 ++++++++++++++++++
 blog/rss.xml   | 330 +++++++++++++++++
 2 files changed, 662 insertions(+)
 create mode 100644 blog/Writing-my-own-web-crawler-in-go/index.html

diff --git a/blog/Writing-my-own-web-crawler-in-go/index.html b/blog/Writing-my-own-web-crawler-in-go/index.html
new file mode 100644
index 0000000..779333b
--- /dev/null
+++ b/blog/Writing-my-own-web-crawler-in-go/index.html
@@ -0,0 +1,332 @@
+
+	'Writing my own web crawler in go'
+
+

Writing my own web crawler in go

+

+ I got bored; it happens to everyone (especially software developers).
+ So, like every software developer, I've started a new side project: a web
+ crawler. This isn't for any actual use case, I mostly just want to learn
+ how to use Go and (hopefully) sharpen my SQL skills in the process. At
+ the time of writing I've only just started, so let me show you what I've
+ gotten working so far.

+

To start, I'm just searching through the hrefs of all <a> tags on a
+ site and printing them out. On my site that looks like this:

+
+/
+mailto:me@zacharyscheiman.com
+https://github.com/squibid
+https://codeberg.org/squibid
+https://git.squi.bid/squibid/wiz
+https://git.squi.bid/squibid
+/blog/rss.xml
+/blog/New-Keyboard!
+/blog/Serializing-data-in-C
+/blog/Why-"suckless"-software-is-important
+/blog/What-is-a-squibid
+/blog/librex-and-dots
+/?all_blog
+https://lunarflame.dev
+https://eggbert.xyz/
+		
+

+ Here's the code which fetched this list: +

+
+package main
+
+import (
+  "fmt"
+  "io"
+  "net/http"
+  "golang.org/x/net/html"
+)
+
+func deal_html(site string, reader io.Reader) {
+  doc, err := html.Parse(reader)
+  if err != nil {
+    fmt.Println("Error parsing HTML:", err)
+    return
+  }
+
+  var walk func(n *html.Node)
+  walk = func(n *html.Node) {
+    if n.Type == html.ElementNode && n.Data == "a" {
+      for _, attr := range n.Attr {
+        if attr.Key == "href" {
+          fmt.Println(attr.Val)
+        }
+      }
+    }
+    for c := n.FirstChild; c != nil; c = c.NextSibling {
+      walk(c)
+    }
+  }
+  walk(doc)
+}
+
+func main() {
+  site := "https://squi.bid/"
+  fmt.Println("fetching " + site)
+  resp, err := http.Get(site)
+  if err != nil {
+    fmt.Println("Error getting the website")
+    return
+  }
+
+  defer resp.Body.Close()
+  deal_html(site, resp.Body)
+}
+
+

After taking a short look at this output, it's clear we need to handle
+ a few different "URL formats", like the mailto: and / links. For now
+ I'm going to respect people's privacy and not index their email
+ addresses.

+
+if len(attr.Val) > 1 && attr.Val[:2] == "//" {
+  fmt.Println("https:" + attr.Val)
+} else if attr.Val[:1] == "/" {
+  if site[len(site) - 1:] == "/" {
+    fmt.Println(site + attr.Val[1:])
+  } else {
+    fmt.Println(site + attr.Val)
+  }
+} else if len(attr.Val) > 4 && attr.Val[:4] == "http" {
+  fmt.Println(attr.Val)
+}
+		
+
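+ As an aside: Go's net/url package can do most of this resolution for us.
+ Here's a rough sketch of how that might look, using url.Parse and
+ ResolveReference (this is not what my crawler currently does, and
+ resolve_href is just a hypothetical helper name):
+
+package main
+
+import (
+  "fmt"
+  "net/url"
+)
+
+// resolve_href turns an href (relative, scheme-relative, or absolute) into
+// an absolute URL by resolving it against the page it was found on.
+// NOTE: hypothetical helper, not part of the crawler's actual code.
+func resolve_href(site string, href string) (string, bool) {
+  base, err := url.Parse(site)
+  if err != nil {
+    return "", false
+  }
+  ref, err := url.Parse(href)
+  if err != nil {
+    return "", false
+  }
+  abs := base.ResolveReference(ref)
+  // skip mailto:, javascript:, etc. so we only keep crawlable links
+  if abs.Scheme != "http" && abs.Scheme != "https" {
+    return "", false
+  }
+  return abs.String(), true
+}
+
+func main() {
+  hrefs := []string{"/", "//git.squi.bid/squibid", "mailto:me@zacharyscheiman.com", "https://eggbert.xyz/"}
+  for _, href := range hrefs {
+    if u, ok := resolve_href("https://squi.bid/", href); ok {
+      fmt.Println(u)
+    }
+  }
+}
+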

Now that we've gotten the actual links from the website, it's time to
+ store them and grab the links from their sites too. Because this is
+ already just a toy project, I've decided not to store all the info I
+ would if this were a real project. Instead I'll only store the link to
+ the site and a boolean representing whether I've fetched its contents
+ yet. So, let's go impl...

+

Step #1 is to find an SQL library. I just went with Go's built-in
+ database/sql package, then visited
+ golang.org/s/sqldrivers
+ and decided on
+ github.com/mattn/go-sqlite3
+ because SQLite is a name I'm familiar with, and I really don't want to go
+ through the hassle of evaluating different databases for a toy project.
+ [1]

+
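+ One small gotcha worth noting with database/sql: the driver has to be
+ registered via a blank import, and then you refer to it by name when
+ opening the database. The import side looks like this:
+
+import (
+  "database/sql"
+
+  // the blank import registers the "sqlite3" driver with database/sql
+  _ "github.com/mattn/go-sqlite3"
+)
+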

Now that we've chosen our database, I'll set up the table like I
+ mentioned earlier, with one string and one boolean:

+
+db, err = sql.Open("sqlite3", "./sites.db")
+if err != nil {
+  log.Fatal(err)
+}
+defer db.Close()
+db.Exec(`
+    create table if not exists
+    urls (url text not null primary key, indexed boolean not null);
+    `)
+
+

Now we need to start adding entries to the db. To make sure I wouldn't
+ end up shooting myself in the foot, I went with a tiny wrapper function
+ to make things a teensy bit safer:

+
+func db_insert_url(url string, seen bool) {
+  db.Exec(`insert into urls values (?, ?)`, url, seen)
+}
+
+

It could use some error handling, but if you look at section 3, part 4
+ of the software engineer's manual, it reads "side projects aren't stable
+ because if it's stable it's not a side project".

+
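+ Still, if I ever do bolt on that error handling, a slightly safer sketch
+ might look like the following. Since url is the primary key, re-inserting
+ a link we've already seen would otherwise fail with a constraint error,
+ and SQLite's insert or ignore quietly skips it (this is a sketch, not the
+ code in the repo):
+
+func db_insert_url(url string, seen bool) error {
+  // "insert or ignore" skips URLs that are already in the table instead
+  // of failing on the primary-key constraint
+  _, err := db.Exec(`insert or ignore into urls values (?, ?)`, url, seen)
+  if err != nil {
+    log.Println("failed to insert", url, ":", err)
+  }
+  return err
+}
+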

Now that we've gotten all the easy stuff out of the way, it's time to
+ make this run forever, or close to it at least. For now I'm going to keep
+ the project in an inefficient state: no worker pools or anything fancy
+ like that. To get started we first need to make a decision: depth-first
+ or breadth-first searching? In case you're not sure what I mean by that,
+ let me give you an example:

+

+ Let's say we have site example-a.com which contains the following links: +

+  • example-a.com/blog
+  • example-b.com
+  • example-c.com

With a breadth-first search we would first go to either example-b or
+ example-c, whereas with a depth-first search we would go to
+ example-a.com/blog. For my use case I want to find as many sites as
+ possible, so I'll be targeting sites with other base URLs.

+
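+ Another way to picture the difference, as a standalone toy sketch with a
+ hypothetical pick_next helper: the only thing that really changes between
+ the two strategies is whether the frontier of un-crawled links behaves
+ like a queue or a stack. This has nothing to do with the db-backed loop
+ below, it's just the idea in miniature.
+
+// pick_next is a toy illustration (assumes a non-empty frontier):
+// breadth-first takes the oldest discovered link (FIFO queue),
+// depth-first takes the newest one (LIFO stack).
+func pick_next(frontier []string, breadthFirst bool) (string, []string) {
+  if breadthFirst {
+    return frontier[0], frontier[1:]
+  }
+  last := len(frontier) - 1
+  return frontier[last], frontier[:last]
+}
+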

Now that we know how we want to decide on the next URL to fetch, let's
+ implement the loop that handles it.

+
+for i := 0;; i++ {
+  if i > 0 {
+    rows, err := db.Query(`select url from urls where indexed is false`)
+    if err != nil {
+      return
+    }
+
+    for rows.Next() {
+      var test string
+      rows.Scan(&test)
+      site = test
+      /* we can't just check if the site is the same because then when we're
+       * checking squi.bid/example it won't register squi.bid as the same
+       * domain, although maybe that's what we want.
+       */
+      if !strings.Contains(test, site) {
+        break
+      }
+    }
+    rows.Close()
+  }
+
+  fmt.Println("fetching " + site)
+  resp, err := http.Get(site)
+  if err != nil {
+    fmt.Println("Error getting", site)
+    os.Exit(1)
+  }
+
+  deal_html(site, resp.Body)
+
+  resp.Body.Close()
+}
+
+

If you read through my code you might've seen the comment about how our
+ check doesn't actually prevent revisiting the same site. The solution I'm
+ currently considering is to add a column to the db which stores the
+ highest point in the site; for example, squi.bid/example/1/2/3/4 would
+ have a highest point of squi.bid. It isn't something I'm too concerned
+ about right now though, so we'll leave it as is.

+
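+ For what it's worth, if I do go that route the "highest point" could
+ probably just be computed with net/url instead of stored by hand; a
+ hypothetical helper might look like:
+
+// base_of returns the scheme and host of a URL, e.g.
+// "https://squi.bid/example/1/2/3/4" -> "https://squi.bid"
+// (hypothetical helper, not in the repo)
+func base_of(raw string) (string, error) {
+  u, err := url.Parse(raw)
+  if err != nil {
+    return "", err
+  }
+  return u.Scheme + "://" + u.Host, nil
+}
+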

There's another issue you might've spotted: we never modify the db.
+ After successfully fetching a site, at no point do we actually mark it as
+ fetched, so whenever we look for a new site to fetch the program will
+ query the db and find the same route as before. Thankfully this is a
+ simple fix which just takes adding this line right after the point where
+ we index a new site:

+
+db.Exec(`update urls set indexed = true where url == ?`, site)
+
+

Remember when I referenced section 3, part 4 of the software engineer's
+ manual? Well, I regret it:

+
+fetching https://squi.bid/
+fetching https://github.com/EggbertFluffle/signup?ref_cta=Sign+up&ref_loc=header+logged+out&ref_page=%2F%3Cuser-name%3E&source=header
+panic: runtime error: slice bounds out of range [:1] with length 0
+
+goroutine 1 [running]:
+main.deal_html.func1(0xc000315b90)
+        /home/squibid/documents/coding/go/scraper/main.go:40 +0x2fe
+main.deal_html.func1(0x0?)
+        /home/squibid/documents/coding/go/scraper/main.go:55 +0x83
+main.deal_html.func1(0xc0002dd110?)
+        /home/squibid/documents/coding/go/scraper/main.go:55 +0x83
+main.deal_html.func1(0x0?)
+        /home/squibid/documents/coding/go/scraper/main.go:55 +0x83
+main.deal_html.func1(0xc00032e000?)
+        /home/squibid/documents/coding/go/scraper/main.go:55 +0x83
+main.deal_html.func1(0x0?)
+        /home/squibid/documents/coding/go/scraper/main.go:55 +0x83
+main.deal_html.func1(0x8c36e0?)
+        /home/squibid/documents/coding/go/scraper/main.go:55 +0x83
+main.deal_html.func1(0x7fd180fd9db8?)
+        /home/squibid/documents/coding/go/scraper/main.go:55 +0x83
+main.deal_html({0xc00002a280, 0x7c}, {0x7fd180fd9db8?, 0xc0004bd200?})
+        /home/squibid/documents/coding/go/scraper/main.go:58 +0x11e
+main.main()
+        /home/squibid/documents/coding/go/scraper/main.go:105 +0x12d
+exit status 2
+
+

Turns out we need some stability if we want to actually use the code.
+ Like most bugs, this one is a simple fix: guard the URL handler with a
+ call to len so we don't try to slice empty strings.

+

And now it works! Well, with a small exception. Here's a clean run
+ showing my lil web crawler doing its thing... and failing pretty fast.

+
+fetching https://squi.bid/
+fetching https://eggbert.xyz/
+fetching https://www.linkedin.com/in/harrison-diambrosio-505443229/
+fetching https://github.com/EggbertFluffle/
+fetching https://support.github.com?tags=dotcom-footer
+fetching https://docs.github.com/
+fetching https://services.github.com
+fetching https://github.com/github
+fetching https://github.com/github/search?q=topic%3Aactions+org%3Agithub+fork%3Atrue&type=repositories
+fetching https://github.com/github/search?q=topic%3Aactions+org%3Agithub+fork%3Atrue&type=repositories/git-guides
+fetching https://github.com/github/search?q=topic%3Aactions+org%3Agithub+fork%3Atrue&type=repositories/git-guides/git-guides
+fetching https://github.com/github/search?q=topic%3Aactions+org%3Agithub+fork%3Atrue&type=repositories/git-guides/git-guides/git-guides
+fetching https://github.com/github/search?q=topic%3Aactions+org%3Agithub+fork%3Atrue&type=repositories/git-guides/git-guides/git-guides/git-guides
+fetching https://github.com/github/search?q=topic%3Aactions+org%3Agithub+fork%3Atrue&type=repositories/git-guides/git-guides/git-guides/git-guides/git-guides
+fetching https://github.com/github/search?q=topic%3Aactions+org%3Agithub+fork%3Atrue&type=repositories/git-guides/git-guides/git-guides/git-guides/git-guides/git-guides
+fetching https://github.com/github/search?q=topic%3Aactions+org%3Agithub+fork%3Atrue&type=repositories/git-guides/git-guides/git-guides/git-guides/git-guides/git-guides/git-guides
+...
+
+

I'm sure you can use your imagination to figure out how long that would
+ have kept going. This bug would be partially mitigated by switching which
+ site we're searching, but ultimately we wouldn't make it far if we keep
+ falling for links that just keep nesting deeper. For now that's fine, and
+ I have a semi-working web crawler. All of the code can be found here:
+ git.squi.bid/squibid/web-crawler.
+ Thank you for reading; I'll probably write a followup when I find some
+ time.

+

[1] Yes, I'm just including more links to make my site a good starting
+ point, why do you ask?

+
+

diff --git a/blog/rss.xml b/blog/rss.xml
index 606d470..0e182c0 100644
--- a/blog/rss.xml
+++ b/blog/rss.xml
@@ -11,6 +11,336 @@
+
+      'Writing my own web crawler in go'
+      https://squi.bid/blog/Writing-my-own-web-crawler-in-go/index.html
+      https://squi.bid/blog/Writing-my-own-web-crawler-in-go/index.html
+      Sat, 23 Aug 2025 20:25:26 -0400
+

+ + 'New Keyboard!' https://squi.bid/blog/New-Keyboard!/index.html