From 77d8181ef94411d944d08be2afd01d512449904a Mon Sep 17 00:00:00 2001
From: Squibid
Date: Sat, 23 Aug 2025 20:24:23 -0400
Subject: [PATCH] inital commit

---
 .gitignore |   1 +
 go.mod     |   6 ++
 go.sum     |   4 ++
 main.go    | 190 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 201 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 go.mod
 create mode 100644 go.sum
 create mode 100644 main.go

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..9be7342
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+*sites.db*
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..cc1d5b8
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,6 @@
+module squibid/scraper
+
+go 1.24.5
+
+require golang.org/x/net v0.43.0
+require github.com/mattn/go-sqlite3 v1.14.32 // indirect
diff --git a/go.sum b/go.sum
new file mode 100644
index 0000000..3dc33f3
--- /dev/null
+++ b/go.sum
@@ -0,0 +1,4 @@
+github.com/mattn/go-sqlite3 v1.14.32 h1:JD12Ag3oLy1zQA+BNn74xRgaBbdhbNIDYvQUEuuErjs=
+github.com/mattn/go-sqlite3 v1.14.32/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y=
+golang.org/x/net v0.43.0 h1:lat02VYK2j4aLzMzecihNvTlJNQUq316m2Mr9rnM6YE=
+golang.org/x/net v0.43.0/go.mod h1:vhO1fvI4dGsIjh73sWfUVjj3N7CA9WkKJNQm2svM6Jg=
diff --git a/main.go b/main.go
new file mode 100644
index 0000000..e2f8d31
--- /dev/null
+++ b/main.go
@@ -0,0 +1,190 @@
+/* scraper is a small web crawler: it fetches a page, stores every link it
+ * finds in a sqlite database and then keeps fetching the stored urls that
+ * have not been indexed yet.
+ */
+package main
+
+import (
+	"database/sql"
+	"errors"
+	"fmt"
+	"io"
+	"log"
+	"net/http"
+	"net/url"
+	"strings"
+	"time"
+
+	_ "github.com/mattn/go-sqlite3"
+	"golang.org/x/net/html"
+)
+
+/* db is the handle to the sqlite database that holds the crawl state */
+var db *sql.DB
+
+/* http_client is shared by every fetch. The timeout keeps a single
+ * unresponsive host from stalling the crawl forever.
+ */
+var http_client = &http.Client{Timeout: 30 * time.Second}
+
+/* db_insert_url stores url together with its indexed state (seen). A url
+ * that is already stored is left untouched, so seeing a link again never
+ * resets its indexed flag.
+ */
+func db_insert_url(url string, seen bool) {
+	_, err := db.Exec(`insert or ignore into urls values (?, ?)`, url, seen)
+	if err != nil {
+		fmt.Println("Error inserting", url+":", err)
+	}
+}
+
+/* resolve_link turns an href found on the page at base into an absolute
+ * url without its fragment. It reports false for links the crawler cannot
+ * fetch: empty or unparsable hrefs and anything that is not http(s), such
+ * as mailto: or javascript: links.
+ */
+func resolve_link(base *url.URL, href string) (string, bool) {
+	href = strings.TrimSpace(href)
+	if href == "" {
+		return "", false
+	}
+
+	ref, err := url.Parse(href)
+	if err != nil {
+		return "", false
+	}
+
+	/* handles "//host/path", "/path", "path" and full urls alike */
+	abs := base.ResolveReference(ref)
+	if (abs.Scheme != "http" && abs.Scheme != "https") || abs.Host == "" {
+		return "", false
+	}
+
+	/* a fragment only names a spot inside a page, not another page */
+	abs.Fragment = ""
+	abs.RawFragment = ""
+	return abs.String(), true
+}
+
+/* deal_html parses the html read from reader and stores every link found in
+ * an <a href> attribute. site is the url the document was fetched from;
+ * relative links are resolved against it. A link back to site itself is
+ * stored as already indexed.
+ */
+func deal_html(site string, reader io.Reader) {
+	base, err := url.Parse(site)
+	if err != nil {
+		fmt.Println("Error parsing URL:", err)
+		return
+	}
+
+	doc, err := html.Parse(reader)
+	if err != nil {
+		fmt.Println("Error parsing HTML:", err)
+		return
+	}
+
+	var walk func(n *html.Node)
+	walk = func(n *html.Node) {
+		if n.Type == html.ElementNode && n.Data == "a" {
+			for _, attr := range n.Attr {
+				if attr.Key != "href" {
+					continue
+				}
+				if link, ok := resolve_link(base, attr.Val); ok {
+					db_insert_url(link, link == site)
+				}
+			}
+		}
+		for c := n.FirstChild; c != nil; c = c.NextSibling {
+			walk(c)
+		}
+	}
+	walk(doc)
+}
+
+/* next_unindexed returns a stored url that has not been fetched yet. The
+ * error is sql.ErrNoRows once every stored url has been indexed.
+ */
+func next_unindexed() (string, error) {
+	var site string
+	row := db.QueryRow(`select url from urls where indexed is false limit 1`)
+	err := row.Scan(&site)
+	return site, err
+}
+
+/* mark_indexed records that site has been dealt with. The insert covers
+ * sites that are not stored yet (the seed, unless some page links to it);
+ * the update covers the ones that are.
+ */
+func mark_indexed(site string) error {
+	db_insert_url(site, true)
+	_, err := db.Exec(`update urls set indexed = true where url == ?`, site)
+	return err
+}
+
+/* fetch downloads site and stores the links found on it */
+func fetch(site string) error {
+	resp, err := http_client.Get(site)
+	if err != nil {
+		return err
+	}
+	defer resp.Body.Close()
+
+	deal_html(site, resp.Body)
+	return nil
+}
+
+/* crawl fetches seed and then every url that turns up along the way until
+ * no unindexed url is left. It only returns an error when the database
+ * fails; sites that cannot be fetched are reported and skipped.
+ */
+func crawl(seed string) error {
+	var err error
+
+	db, err = sql.Open("sqlite3", "./sites.db")
+	if err != nil {
+		return fmt.Errorf("opening database: %w", err)
+	}
+	defer db.Close()
+
+	_, err = db.Exec(`
+		create table if not exists
+		urls (url text not null primary key, indexed boolean not null);
+	`)
+	if err != nil {
+		return fmt.Errorf("creating urls table: %w", err)
+	}
+
+	/* start fetching the sites */
+	site := seed
+	for {
+		fmt.Println("fetching " + site)
+		if err := fetch(site); err != nil {
+			fmt.Println("Error getting", site+":", err)
+		}
+
+		/* mark the site even when the fetch failed, otherwise the same
+		 * broken url would be picked again on every iteration and the crawl
+		 * would never move on.
+		 */
+		if err := mark_indexed(site); err != nil {
+			return fmt.Errorf("marking %q as indexed: %w", site, err)
+		}
+
+		site, err = next_unindexed()
+		if errors.Is(err, sql.ErrNoRows) {
+			return nil
+		}
+		if err != nil {
+			return fmt.Errorf("selecting next url: %w", err)
+		}
+	}
+}
+
+/* main starts the crawl at the seed site and exits non-zero on failure */
+func main() {
+	if err := crawl("https://squi.bid/"); err != nil {
+		log.Fatal(err)
+	}
+}