[Original] Crawler learning: scraping the GitHub Security Advisories page

Posted on: 2023-4-25 11:10
A web-scraping learning exercise: advisory content is pulled from https://github.com/advisories with XPath.
Once the information is fetched, the title and description can be handed to ChatGPT to name the vulnerability and translate the description; when an advisory has no CVSS score, its Low/Moderate/High/Critical severity label can be given to ChatGPT to produce one. A minimal sketch of the scraping step is shown below.
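To illustrate the approach, here is a minimal sketch that fetches a single advisory page and extracts its title with the same XPath the full script uses (the GHSA URL is the test advisory from the script's commented-out section; GitHub's page layout may change and break the path):

from lxml import etree
import requests

# Test advisory page; any GHSA advisory page has the same layout
url = "https://github.com/advisories/GHSA-f5v5-ccqc-6w36"
html = etree.HTML(requests.get(url).content)

# The advisory title sits in the page's <h2>
title = html.xpath("//main/div/div[1]/h2/text()")
print(title[0].strip() if title else "title not found")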
Crawler results (screenshot):

ChatGPT results (screenshot). The output is sometimes not ideal, and the scoring part is untested because the free API key I claimed has expired.
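Since the scoring step is untested, one defensive measure worth adding (my suggestion, not part of the original script) is to validate the CVSS v3.1 base vector in the model's reply with a regex before trusting it:

import re

# CVSS v3.1 base-vector pattern (metric values per the FIRST specification)
CVSS31_VECTOR = re.compile(
    r"AV:[NALP]/AC:[LH]/PR:[NLH]/UI:[NR]/S:[UC]/C:[HLN]/I:[HLN]/A:[HLN]"
)

def extract_vector(gpt_reply):
    # Return the first well-formed CVSS 3.1 vector in the reply, or ""
    match = CVSS31_VECTOR.search(gpt_reply)
    return match.group(0) if match else ""

# Example with a reply in the format the prompt asks for
print(extract_vector("CVSS_3.1:6.5\nAV:N/AC:L/PR:N/UI:R/S:U/C:H/I:N/A:N"))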
Code
from lxml import etree
import requests
import openai
import re
from bs4 import BeautifulSoup
import time
import csv

'''
# Old approach: collect advisory links with BeautifulSoup, then open every
# advisory page to check whether it was withdrawn -- much slower.
def github_URL_list():
    url = "https://github.com/advisories"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    url_list = []
    for link in soup.find_all("a", {"class": "Link--primary"}):
        if "/advisories/" in link.get("href"):
            url = "https://github.com" + link.get("href")
            if advisories_true(url):
                url_list.append(url)
    print(url_list)
    return url_list

def advisories_true(advisories_url):
    page = requests.get(advisories_url)
    html = etree.HTML(page.content)
    # xpath() returns a list, so check its first element
    label = html.xpath("//main/div/div[2]/span/text()")
    if not label or label[0].strip() != "Withdrawn":
        return True
    else:
        return False
'''

# Fetch advisories from https://github.com/advisories, skipping withdrawn ones
def github_url_list_true():
    url_list = []
    for i in range(1, 3):
        url = "https://github.com/advisories?page=" + str(i)
        page = requests.get(url)
        html = etree.HTML(page.content)
        sections = html.xpath("/html/body/div[1]/div/main/div/div[2]/div[2]/div[1]/div[position()>1]")
        time.sleep(3)
        for section in sections:
            section_obj = etree.HTML(etree.tostring(section).decode())
            withdraw_x = section_obj.xpath("//div/span[@title='Label: withdrawn']/text()")
            withdraw = withdraw_x[0] if withdraw_x else ""
            if withdraw.strip() == "withdrawn":
                continue
            href_x = section_obj.xpath("//div/a/@href")
            advisories_url = "https://github.com" + (href_x[0] if href_x else "")
            url_list.append(advisories_url)
    return url_list


# Get the star count of the repository linked from the advisory page
def git_cve_star(advisories_url):
    advisories_page = requests.get(advisories_url)
    advisories_html = etree.HTML(advisories_page.content)
    code_url_x = advisories_html.xpath("/html/body/div[1]/div/main/div/div[2]/div[2]/div[5]/div/a/@href")
    github_code_url = code_url_x[0] if code_url_x else ""
    # Only follow the link when it points at a GitHub repository
    if github_code_url and re.search("https://github.com/", github_code_url):
        source_page = requests.get(github_code_url)
        soup = BeautifulSoup(source_page.text, "html.parser")
        element = soup.find('span', {'id': 'repo-stars-counter-star'})
        value = element.text if element else 'none'
    else:
        value = 'none'
    return value


# Extract the details from a single advisory page
def github_cve_detail(advisories_url):
    page = requests.get(advisories_url)
    html = etree.HTML(page.content)
    sections = html.xpath("/html/body/div[1]")
    for section in sections:
        section_obj = etree.HTML(etree.tostring(section).decode())
        result = {}

        cveid_x = section_obj.xpath("//main/div/div[2]/div[2]/div[3]/div/text()")
        CVEid = cveid_x[0] if cveid_x else ""
        if CVEid.strip() != "No known CVE":
            result['CVEid'] = CVEid.strip()
        else:
            result['CVEid'] = ""

        title_x = section_obj.xpath("//main/div/div[1]/h2/text()")
        result['title'] = title_x[0].strip() if title_x else ""

        # Collect the CWE links; the panel literally says "No CWEs" when none are assigned
        cwe_container_x = section_obj.xpath("//main/div/div[2]/div[2]/div[2]/div")
        cwe_list = []
        if cwe_container_x:
            container_text = "".join(cwe_container_x[0].xpath(".//text()")).strip()
            if container_text != "No CWEs":
                for CWEtext in cwe_container_x[0].xpath(".//a[@data-hovercard-type='cwe']/text()"):
                    cwe_list.append(CWEtext.strip())
        result['CWE'] = cwe_list if cwe_list else ""

        label_x = section_obj.xpath("//span[contains(@class, 'Label Label--')]/text()")
        cvss_score_Label = label_x[0] if label_x else ""
        result['cvss_score_Label'] = cvss_score_Label.replace('severity', '').strip()

        score_x = section_obj.xpath("//main/div/div[2]/div[2]/div[1]/div[1]/div/div/span[@class='tooltipped tooltipped-n tooltipped-no-delay tooltipped-multiline']/text()")
        cvss_score = score_x[0] if score_x else ""
        match = re.search(r"\d+\.\d+|\d", cvss_score)
        if match:
            result['cvss_score'] = match.group(0)
        else:
            print("cvss_score not found")
            result['cvss_score'] = ""

        vector_x = section_obj.xpath("//main/div/div[2]/div[2]/div[1]/div[3]/text()")
        cvss_3_1 = vector_x[0] if vector_x else ""
        match = re.search(r"CVSS:3\.1/(.*)", cvss_3_1)
        if match:
            result['cvss_3_1'] = match.group(1)
        else:
            print("cvss_3_1 vector not found")
            result['cvss_3_1'] = ""

        time_x = section_obj.xpath("//main/div/div[1]/div/span[3]/relative-time[1]/text()")
        result['time'] = time_x[0] if time_x else ""
        affected_x = section_obj.xpath("//main/div/div[2]/div[1]/div[1]/div/div/div[2]/div/text()")
        result['Affected'] = affected_x[0] if affected_x else ""
        patched_x = section_obj.xpath("//main/div/div[2]/div[1]/div[1]/div/div/div[3]/div/text()")
        result['Patched'] = patched_x[0] if patched_x else ""
        desc_x = section_obj.xpath("//main/div/div[2]/div[1]/div[2]/div[2]/div/p/text()")
        result['Description'] = desc_x[0] if desc_x else ""
        result['url'] = advisories_url
        result['star'] = git_cve_star(advisories_url)
        print(result)
        # chatgpt_cve(result)
        return result


def chatgpt_cve(result):
    openai.api_key = ''                    # fill in your OpenAI API key
    proxies = {'http': '', 'https': ''}    # fill in a proxy if you need one
    openai.proxy = proxies
    messages = []
    # A system message could give the assistant a persona:
    # messages.append({'role': 'system', 'content': 'You are an information security engineer.'})
    # Prompt (Chinese): name the vulnerability as "<product> <type> vulnerability"
    # and translate the description; reply in the format "名称:...\n描述:..."
    messages.append({'role': 'user',
                     'content': '根据标题和描述对漏洞命名并翻译描述,命名格式为"产品名xx漏洞",xx为漏洞类型,漏洞二字结尾;输出格式为"名称:\n描述:"。'
                                + '标题: ' + result['title'] + ' 描述: ' + result['Description']})
    response = openai.ChatCompletion.create(
        model='gpt-3.5-turbo',
        messages=messages,
    )
    # Append the assistant's answer to the conversation
    messages.append({
        'role': response['choices'][0]['message']['role'],
        'content': response['choices'][0]['message']['content'],
    })
    print(messages[-1])

    match_name = re.search(r"名称:(.*?)\n描述", messages[-1]['content'])
    if match_name:
        name = match_name.group(1)
        print(name)
        if result['CVEid'] != "":
            result['name'] = name + "(" + result['CVEid'] + ")"
        else:
            result['name'] = name
    else:
        print("name not found")

    match_description = re.search(r"描述:(.*)", messages[-1]['content'])
    if match_description:
        result['Description'] = match_description.group(1)
    else:
        print("description not found")

    result['GPT_cvss3_1'] = ""
    # Only ask ChatGPT for a score when the page itself had no CVSS score
    if result['cvss_score'] != "":
        return
    # Prompt (Chinese): derive a CVSS 3.1 base score and vector from the
    # title, description and severity label
    messages.append({'role': 'user',
                     'content': '根据标题、描述、严重程度对漏洞评出cvss3.1的基本分数及其cvss3向量,输出格式为"CVSS_3.1:1.2\nAV:N/AC:L/PR:N/UI:R/S:U/C:H/I:N/A:N"。'
                                + '标题: ' + result['title'] + ' 描述: ' + result['Description']
                                + ' 严重程度: ' + result['cvss_score_Label']})
    response = openai.ChatCompletion.create(
        model='gpt-3.5-turbo',
        messages=messages,
    )
    messages.append({
        'role': response['choices'][0]['message']['role'],
        'content': response['choices'][0]['message']['content'],
    })
    result['GPT_cvss3_1'] = messages[-1]['content']
    print(messages)


def save_csv(results):
    with open('data.csv', 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = list(results[0].keys())
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for result in results:
            writer.writerow(result)


if __name__ == "__main__":
    url_list = github_url_list_true()
    results = []
    for Url_i in url_list:
        print("----------------------------")
        print(Url_i)
        time.sleep(5)
        detail = github_cve_detail(Url_i)
        if detail:
            results.append(detail)
    if results:
        save_csv(results)
    # print(results)

    '''
    url = "https://github.com/advisories/GHSA-f5v5-ccqc-6w36"
    # print(github_cve_detail(url))
    git_cve_star(url)
    # chatgpt_cve()
    # print(github_url_list_true())
    '''
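One practical hardening step (my addition, not in the original script): GitHub may throttle rapid anonymous requests, so routing all fetches through a shared requests.Session with a User-Agent header and automatic retries on throttling errors makes the scraper sturdier:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_session():
    # Session with a browser-like User-Agent and retries on 429/5xx responses
    session = requests.Session()
    session.headers.update({"User-Agent": "Mozilla/5.0 (advisory-crawler)"})
    retry = Retry(total=3, backoff_factor=2,
                  status_forcelist=[429, 500, 502, 503])
    session.mount("https://", HTTPAdapter(max_retries=retry))
    return session

# Usage: swap the bare requests.get(...) calls above for session.get(...)
session = make_session()
page = session.get("https://github.com/advisories?page=1")
print(page.status_code)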