web-crawler/main.go

package main

import (
	"database/sql"
	"fmt"
	"io"
	"log"
	"net/http"
	"strings"

	_ "github.com/mattn/go-sqlite3" // sqlite driver, registered via its init(); requires cgo
	"golang.org/x/net/html"
)

var db *sql.DB

/* insert a url into the table, recording whether it has been indexed yet;
 * "insert or ignore" keeps re-discovered urls from tripping the primary key */
func db_insert_url(url string, indexed bool) {
	db.Exec(`insert or ignore into urls values (?, ?)`, url, indexed)
}
/* parse the page body and record every link we can resolve */
func deal_html(site string, reader io.Reader) {
	doc, err := html.Parse(reader)
	if err != nil {
		fmt.Println("Error parsing HTML:", err)
		return
	}
	var walk func(n *html.Node)
	walk = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "a" {
			for _, attr := range n.Attr {
				if attr.Key == "href" && len(attr.Val) > 1 {
					/* declared per-attribute so a stale url from a previous
					 * anchor is never re-inserted when no branch matches */
					var insert_url string
					if attr.Val[:2] == "//" {
						/* protocol-relative link */
						insert_url = "https:" + attr.Val
					} else if attr.Val[:1] == "/" {
						/* root-relative link: join with the site, avoiding a double slash */
						if site[len(site)-1:] == "/" {
							insert_url = site + attr.Val[1:]
						} else {
							insert_url = site + attr.Val
						}
					} else if len(attr.Val) > 4 && attr.Val[:4] == "http" {
						/* already absolute */
						insert_url = attr.Val
					}
					if insert_url != "" {
						db_insert_url(insert_url, site == insert_url)
					}
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(doc)
}
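
/* Examples of the resolution above, with site = "https://squi.bid/"
 * (illustrative values, not output from a real crawl):
 *   href="//cdn.example.com/a.css" -> "https://cdn.example.com/a.css"
 *   href="/posts"                  -> "https://squi.bid/posts"
 *   href="https://other.tld/"      -> "https://other.tld/" (kept as-is)
 *   href="mailto:me@squi.bid"      -> skipped (no branch matches)
 */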
func main() {
	/* err is declared separately (not with :=) so the assignment below
	 * doesn't shadow the package-level db */
	var err error
	site := "https://squi.bid/"
	db, err = sql.Open("sqlite3", "./sites.db")
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()
	db.Exec(`
		create table if not exists
		urls (url text not null primary key, indexed boolean not null);
	`)
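	/* the table then accumulates rows like (hypothetical values):
	 *   url                     | indexed
	 *   https://squi.bid/       | 1
	 *   https://squi.bid/posts  | 0
	 */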
	/* start fetching the sites */
	for i := 0; ; i++ {
		if i > 0 {
			rows, err := db.Query(`select url from urls where indexed is false`)
			if err != nil {
				return
			}
			/* compare against the url we started this pass with; comparing
			 * against the value just assigned was always true, so the break
			 * below never fired */
			prev := site
			for rows.Next() {
				var test string
				rows.Scan(&test)
				site = test
				/* we can't just check if the site is the same, because when
				 * we're checking squi.bid/example it wouldn't register
				 * squi.bid as the same domain, although maybe that's what we
				 * want; strings.Contains treats any url that embeds the old
				 * one (e.g. squi.bid/example vs squi.bid) as the same site
				 * and keeps scanning until it finds one that isn't
				 */
				if !strings.Contains(test, prev) {
					break
				}
			}
			rows.Close()
		}
		fmt.Println("fetching " + site)
		resp, err := http.Get(site)
		if err != nil {
			fmt.Println("Error getting", site)
			/* mark it anyway so a dead url isn't retried forever */
			db.Exec(`update urls set indexed = true where url = ?`, site)
			continue
		}
		deal_html(site, resp.Body)
		db.Exec(`update urls set indexed = true where url = ?`, site)
		resp.Body.Close()
	}
}
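
/* To try it (a sketch; the module path and Go version are up to you):
 *   go mod init crawler
 *   go get github.com/mattn/go-sqlite3 golang.org/x/net/html
 *   go run main.go
 * then inspect the results with the sqlite3 cli:
 *   sqlite3 sites.db 'select * from urls limit 10;'
 */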