当前位置: 首页 > news >正文

【网站内容安全检测】之1:获取网站所有链接sitemap数据

不多BB,直接上代码:
main.go

package mainimport ("bufio""crypto/tls""fmt""io""net/http""net/url""os""strings""sync""time"_ "net/http/pprof""log""github.com/PuerkitoBio/goquery""github.com/schollz/progressbar/v3"
)type WebCrawler struct {startURLs     []stringbaseDomains   map[string]boolvisitedURLs   sync.MapurlsToVisit   chan stringsemaphore     chan struct{}timeout       time.DurationverifySSL     boolclient        *http.ClientprogressBar   *progressbar.ProgressBarwg            sync.WaitGroup
}func NewWebCrawler(startURLs []string, maxConnections int, timeout int, verifySSL bool) *WebCrawler {baseDomains := make(map[string]bool)for _, u := range startURLs {parsed, _ := url.Parse(u)baseDomains[parsed.Host] = true}return &WebCrawler{startURLs:   startURLs,baseDomains: baseDomains,urlsToVisit: make(chan string, 1000),semaphore:   make(chan struct{}, maxConnections),timeout:     time.Duration(timeout) * time.Second,verifySSL:   verifySSL,}
}func (c *WebCrawler) initClient() {tr := &http.Transport{TLSClientConfig: &tls.Config{InsecureSkipVerify: !c.verifySSL},}c.client = &http.Client{Timeout:   c.timeout,Transport: tr,}
}func (c *WebCrawler) normalizeURL(rawURL string, baseURL string) (string, error) {base, err := url.Parse(baseURL)if err != nil || base == nil {return "", fmt.Errorf("invalid base URL: %v", err)}u, err := url.Parse(rawURL)if err != nil || u == nil {return "", fmt.Errorf("invalid URL: %v", err)}return base.ResolveReference(u).String(), nil
}func (c *WebCrawler) isValidURL(rawURL string) bool {parsed, err := url.Parse(rawURL)if err != nil {return false}if parsed.Scheme != "http" && parsed.Scheme != "https" {return false}if !c.baseDomains[parsed.Host] {return false}extensions := []string{".jpg", ".jpeg", ".png", ".gif", ".pdf", ".zip"}for _, ext := range extensions {if strings.HasSuffix(strings.ToLower(parsed.Path), ext) {return false}}return true
}func (c *WebCrawler) fetchURL(url string) (string, error) {c.semaphore <- struct{}{}defer func() { <-c.semaphore }()req, err := http.NewRequest("GET", url, nil)if err != nil {return "", fmt.Errorf("request creation failed: %v", err)}req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")resp, err := c.client.Do(req)if err != nil {if strings.Contains(err.Error(), "no such host") {return "", fmt.Errorf("DNS lookup failed for %s", url)}return "", fmt.Errorf("request failed: %v", err)}defer resp.Body.Close()if resp.StatusCode != 200 {return "", fmt.Errorf("non-200 status: %d", resp.StatusCode)}if !strings.Contains(resp.Header.Get("Content-Type"), "text/html") {return "", fmt.Errorf("non-HTML content type: %s", resp.Header.Get("Content-Type"))}body, err := io.ReadAll(resp.Body)if err != nil {return "", fmt.Errorf("error reading response: %v", err)}return string(body), nil
}func (c *WebCrawler) parseLinks(html string, baseURL string) []string {doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))if err != nil {log.Printf("Error parsing HTML: %v", err)return nil}var links []stringdoc.Find("a[href]").Each(func(i int, s *goquery.Selection) {href, exists := s.Attr("href")if !exists || strings.HasPrefix(href, "javascript:") || href == "#" {return}normalized, err := c.normalizeURL(href, baseURL)if err != nil {log.Printf("Error normalizing URL %s: %v", href, err)return}if c.isValidURL(normalized) {links = append(links, normalized)}})doc.Find("[src]").Each(func(i int, s *goquery.Selection) {src, exists := s.Attr("src")if !exists || strings.HasPrefix(src, "data:") {return}normalized, err := c.normalizeURL(src, baseURL)if err != nil {log.Printf("Error normalizing URL %s: %v", src, err)return}if c.isValidURL(normalized) {links = append(links, normalized)}})return links
}func (c *WebCrawler) processURL(url string) {defer c.wg.Done()if _, exists := c.visitedURLs.Load(url); exists {return}c.visitedURLs.Store(url, true)html, err := c.fetchURL(url)if err != nil {fmt.Printf("Error fetching %s: %v\n", url, err)return}newLinks := c.parseLinks(html, url)for _, link := range newLinks {if _, exists := c.visitedURLs.Load(link); !exists {c.urlsToVisit <- link}}if c.progressBar != nil {c.progressBar.Add(1)}
}func (c *WebCrawler) crawl() {c.initClient()c.progressBar = progressbar.Default(-1, "爬取进度")defer c.progressBar.Close()for _, url := range c.startURLs {c.wg.Add(1)go c.processURL(url)}go func() {for newURL := range c.urlsToVisit {if _, exists := c.visitedURLs.Load(newURL); !exists {c.wg.Add(1)go c.processURL(newURL)}}}()c.wg.Wait()
}func (c *WebCrawler) saveResults(filename string) {file, err := os.Create(filename)if err != nil {fmt.Printf("Error creating file: %v\n", err)return}defer file.Close()c.visitedURLs.Range(func(key, _ interface{}) bool {file.WriteString(key.(string) + "\n")return true})
}func (c *WebCrawler) run() {startTime := time.Now()c.crawl()elapsed := time.Since(startTime)fmt.Printf("\n爬取完成!\n")// 修复语法错误:添加缺少的括号和逗号visitedCount := 0c.visitedURLs.Range(func(key, _ interface{}) bool {visitedCount++return true})fmt.Printf("共爬取 %d 个URL\n", visitedCount)fmt.Printf("用时: %.2f 秒\n", elapsed.Seconds())outputFile := "multi_domain_links.txt"c.saveResults(outputFile)fmt.Printf("结果已保存到 %s\n", outputFile)
}func main() {go func() {log.Println(http.ListenAndServe("localhost:6060", nil))}()if len(os.Args) < 2 {fmt.Println("用法: go run web_crawler.go <URL文件路径> [verify_ssl]")fmt.Println("例如: go run web_crawler.go urls.txt")fmt.Println("或: go run web_crawler.go urls.txt true")return}urlFile := os.Args[1]file, err := os.Open(urlFile)if err != nil {fmt.Printf("错误:文件 %s 不存在\n", urlFile)return}defer file.Close()var startURLs []stringscanner := bufio.NewScanner(file)for scanner.Scan() {if url := strings.TrimSpace(scanner.Text()); url != "" {startURLs = append(startURLs, url)}}if len(startURLs) == 0 {fmt.Println("错误:URL文件为空")return}verifySSL := falseif len(os.Args) > 2 {verifySSL = os.Args[2] == "true"}crawler := NewWebCrawler(startURLs, 50, 20, verifySSL)// 添加开始运行提示fmt.Printf("开始爬取网站,起始URL数量: %d,是否验证SSL: %v\n", len(startURLs), verifySSL)crawler.run()
}

go.mod

module webcrawler

go 1.24.4

require (
	github.com/PuerkitoBio/goquery v1.10.3 // indirect
	github.com/andybalholm/cascadia v1.3.3 // indirect
	github.com/mattn/go-sqlite3 v1.14.28 // indirect
	github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db // indirect
	github.com/rivo/uniseg v0.4.7 // indirect
	github.com/schollz/progressbar/v3 v3.18.0 // indirect
	golang.org/x/net v0.39.0 // indirect
	golang.org/x/sys v0.32.0 // indirect
	golang.org/x/term v0.31.0 // indirect
)

domains.txt

https://www.网址.com
https://www.网址2.com

运行命令

go run . .\domains.txt

结束后会自动将结果生成到当前目录中

http://www.lqws.cn/news/518347.html

相关文章:

  • Web3D技术协议的AI革命:生成式模型如何改写交互标准?
  • 操作系统之内存管理(王道)
  • LeeCode349. 两个数的交集
  • 基于大模型的甲状腺结节预测及综合诊疗技术方案大纲
  • 防火墙快速管理软件,66K超小巧
  • Java 日志框架选型:SLF4J + Logback vs. Log4j2 的深度解析
  • iClone 中创建的面部动画导入 Daz 3D
  • Spring AOP 中有多个切面时执行顺序是怎样的?
  • Android14音频子系统-Audio HAL分析
  • 南北差异之——跨端理解能力
  • sql格式化自动识别SQL语法结构
  • gsql: command not found
  • OpenLayers 上传Shapefile文件
  • 基于 Python 的批量文件重命名软件设计与实现
  • 智哪儿专访 | Matter中国提速:开放标准如何破局智能家居“生态孤岛”?
  • 舵机在智能家居里的应用
  • 第k个数字
  • 归并排序算法
  • 企业内部安全组网技术解析:安全通道选型、零信任架构与数据合规加密防护
  • 计算机网络-----详解HTTP协议
  • 基于springboot+vue的智慧农业专家远程指导系统
  • 苹果签名应用掉签频繁原因排查,以及如何避免
  • Mysql使用窗口函数查询
  • 左神算法之有序二维矩阵中的目标值查找
  • vscode管理go多个版本
  • 英飞凌高性能BMS解决方案助力汽车电动化
  • 【世纪龙科技】新能源汽车VR虚拟体验展示馆-解锁认知新维度
  • 灰度发布怎么保证数据库一致的
  • AES加密:为你的PDF文档加上一道钢铁防线
  • Kubernetes、Docker Swarm 与 Nomad 容器编排方案深度对比与选型指导