package main

import (
	"database/sql"
	"fmt"
	"io"
	"log"
	"net/http"
	"strings"

	_ "github.com/mattn/go-sqlite3"
	"golang.org/x/net/html"
)

// db is the shared handle to the sqlite crawl database.
var db *sql.DB

// insertURL records url in the urls table; seen marks it as already
// indexed. url is the table's primary key, so duplicates are ignored
// rather than raising a constraint error on every re-discovered link.
func insertURL(url string, seen bool) {
	if _, err := db.Exec(`insert or ignore into urls values (?, ?)`, url, seen); err != nil {
		log.Printf("inserting %q: %v", url, err)
	}
}

// dealHTML parses the HTML read from reader and records every <a href>
// it can resolve to an absolute URL, resolving site-relative links
// against site.
func dealHTML(site string, reader io.Reader) {
	doc, err := html.Parse(reader)
	if err != nil {
		fmt.Println("Error parsing HTML:", err)
		return
	}

	var walk func(n *html.Node)
	walk = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "a" {
			for _, attr := range n.Attr {
				if attr.Key != "href" || len(attr.Val) <= 1 {
					continue
				}
				var u string
				switch {
				case strings.HasPrefix(attr.Val, "//"):
					// Protocol-relative link; assume https.
					u = "https:" + attr.Val
				case strings.HasPrefix(attr.Val, "/"):
					// Site-absolute path: join onto the site without
					// doubling the slash.
					u = strings.TrimSuffix(site, "/") + attr.Val
				case len(attr.Val) > 4 && strings.HasPrefix(attr.Val, "http"):
					u = attr.Val
				default:
					// Relative paths, mailto:, javascript:, etc. The old
					// code fell through here and re-inserted whatever URL
					// the previous anchor resolved to (stale variable);
					// skip these instead.
					continue
				}
				insertURL(u, site == u)
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(doc)
}

// nextSite picks the next URL to fetch from the unindexed rows,
// preferring the first one that is not under the same domain as prev
// (otherwise the last unindexed row wins). ok is false when nothing is
// left to crawl or the query failed, signalling the caller to stop.
func nextSite(prev string) (next string, ok bool) {
	rows, err := db.Query(`select url from urls where indexed is false`)
	if err != nil {
		log.Printf("querying unindexed urls: %v", err)
		return "", false
	}
	defer rows.Close()

	for rows.Next() {
		var u string
		if err := rows.Scan(&u); err != nil {
			log.Printf("scanning url: %v", err)
			continue
		}
		next, ok = u, true
		/* we can't just check if the site is the same because then when we're
		 * checking squi.bid/example it won't register squi.bid as the same
		 * domain, although maybe that's what we want. */
		// NOTE(review): the original compared against prev only AFTER
		// overwriting it, making this condition always false; comparing
		// against the previous site restores the intended early break.
		if !strings.Contains(u, prev) {
			break
		}
	}
	if err := rows.Err(); err != nil {
		log.Printf("reading unindexed urls: %v", err)
	}
	return next, ok
}

func main() {
	var err error
	site := "https://squi.bid/"

	db, err = sql.Open("sqlite3", "./sites.db")
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	if _, err := db.Exec(`
	create table if not exists urls
	(url text not null primary key, indexed boolean not null);
	`); err != nil {
		log.Fatal(err)
	}

	/* start fetching the sites */
	for i := 0; ; i++ {
		if i > 0 {
			next, ok := nextSite(site)
			if !ok {
				// No unindexed URLs remain: the crawl is complete.
				// (The original looped forever refetching the same site.)
				return
			}
			site = next
		}

		fmt.Println("fetching " + site)
		resp, err := http.Get(site)
		if err != nil {
			fmt.Println("Error getting", site)
			continue
		}
		dealHTML(site, resp.Body)
		resp.Body.Close()

		if _, err := db.Exec(`update urls set indexed = true where url = ?`, site); err != nil {
			log.Printf("marking %q indexed: %v", site, err)
		}
	}
}