web-crawler/main.go

package main

import (
	"database/sql"
	"fmt"
	"io"
	"log"
	"net/http"
	"strings"

	_ "github.com/mattn/go-sqlite3" // sqlite driver, registered via its init(); requires cgo
	"golang.org/x/net/html"
)

var db *sql.DB

/* insert a url into the table, recording whether it has been indexed yet;
 * "insert or ignore" keeps re-discovered urls from tripping the primary key */
func db_insert_url(url string, indexed bool) {
	db.Exec(`insert or ignore into urls values (?, ?)`, url, indexed)
}
/* parse the page body and record every link we can resolve */
func deal_html(site string, reader io.Reader) {
	doc, err := html.Parse(reader)
	if err != nil {
		fmt.Println("Error parsing HTML:", err)
		return
	}
	var walk func(n *html.Node)
	walk = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "a" {
			for _, attr := range n.Attr {
				if attr.Key == "href" && len(attr.Val) > 1 {
					/* declared per-attribute so a stale url from a previous
					 * anchor is never re-inserted when no branch matches */
					var insert_url string
					if attr.Val[:2] == "//" {
						/* protocol-relative link */
						insert_url = "https:" + attr.Val
					} else if attr.Val[:1] == "/" {
						/* root-relative link: join with the site, avoiding a double slash */
						if site[len(site)-1:] == "/" {
							insert_url = site + attr.Val[1:]
						} else {
							insert_url = site + attr.Val
						}
					} else if len(attr.Val) > 4 && attr.Val[:4] == "http" {
						/* already absolute */
						insert_url = attr.Val
					}
					if insert_url != "" {
						db_insert_url(insert_url, site == insert_url)
					}
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(doc)
}
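
/* Examples of the resolution above, with site = "https://squi.bid/"
 * (illustrative values, not output from a real crawl):
 *   href="//cdn.example.com/a.css" -> "https://cdn.example.com/a.css"
 *   href="/posts"                  -> "https://squi.bid/posts"
 *   href="https://other.tld/"      -> "https://other.tld/" (kept as-is)
 *   href="mailto:me@squi.bid"      -> skipped (no branch matches)
 */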
func main() {
	/* err is declared separately (not with :=) so the assignment below
	 * doesn't shadow the package-level db */
	var err error
	site := "https://squi.bid/"
	db, err = sql.Open("sqlite3", "./sites.db")
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()
	db.Exec(`
		create table if not exists
		urls (url text not null primary key, indexed boolean not null);
	`)
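	/* the table then accumulates rows like (hypothetical values):
	 *   url                     | indexed
	 *   https://squi.bid/       | 1
	 *   https://squi.bid/posts  | 0
	 */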
	/* start fetching the sites */
	for i := 0; ; i++ {
		if i > 0 {
			rows, err := db.Query(`select url from urls where indexed is false`)
			if err != nil {
				return
			}
			/* compare against the url we started this pass with; comparing
			 * against the value just assigned was always true, so the break
			 * below never fired */
			prev := site
			for rows.Next() {
				var test string
				rows.Scan(&test)
				site = test
				/* we can't just check if the site is the same, because when
				 * we're checking squi.bid/example it wouldn't register
				 * squi.bid as the same domain, although maybe that's what we
				 * want; strings.Contains treats any url that embeds the old
				 * one (e.g. squi.bid/example vs squi.bid) as the same site
				 * and keeps scanning until it finds one that isn't
				 */
				if !strings.Contains(test, prev) {
					break
				}
			}
			rows.Close()
		}
		fmt.Println("fetching " + site)
		resp, err := http.Get(site)
		if err != nil {
			fmt.Println("Error getting", site)
			/* mark it anyway so a dead url isn't retried forever */
			db.Exec(`update urls set indexed = true where url = ?`, site)
			continue
		}
		deal_html(site, resp.Body)
		db.Exec(`update urls set indexed = true where url = ?`, site)
		resp.Body.Close()
	}
}
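
/* To try it (a sketch; the module path and Go version are up to you):
 *   go mod init crawler
 *   go get github.com/mattn/go-sqlite3 golang.org/x/net/html
 *   go run main.go
 * then inspect the results with the sqlite3 cli:
 *   sqlite3 sites.db 'select * from urls limit 10;'
 */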