[Original] Learning Web Scraping: Crawling the GitHub Security Advisories Page

Posted: 2023-4-25 11:10
This is a web-scraping exercise: the script uses XPath to pull advisory content from https://github.com/advisories. A minimal sketch of the idea is shown below.
(screenshot)
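Before the full script, here is a minimal sketch of the list-page scrape. It assumes GitHub's markup at the time of writing (the anchors with class "Link--primary" that the old BeautifulSoup version below also relies on), which GitHub may change at any time:

from lxml import etree
import requests

# Fetch the first advisories list page and print each advisory link.
# The class-based xpath is an assumption tied to GitHub's current markup.
page = requests.get("https://github.com/advisories")
html = etree.HTML(page.content)
for href in html.xpath("//a[contains(@class, 'Link--primary')]/@href"):
    if "/advisories/" in href:
        print("https://github.com" + href)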
Once the data is fetched, the title and description can be handed to ChatGPT to name the vulnerability and translate the description; when an advisory has no CVSS score, its Low/Moderate/High/Critical severity label can be given to ChatGPT to estimate one.
Scraper output:
(screenshot)
ChatGPT output. It is sometimes not ideal, and the scoring part is undebugged because the free API key I got has expired...
(screenshot)
(screenshot)
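Since the scoring step is undebugged, here is a minimal, equally untested sketch of that idea, using the same legacy openai.ChatCompletion API as the full script; the English prompt wording and the parsing regex are illustrative assumptions, not the script's actual prompt:

import re
import openai

def gpt_cvss_estimate(title, description, severity_label):
    # Ask gpt-3.5-turbo to map a Low/Moderate/High/Critical label plus the
    # description to a CVSS 3.1 base score. Prompt and regex are assumptions.
    openai.api_key = ''  # fill in your key
    response = openai.ChatCompletion.create(
        model='gpt-3.5-turbo',
        messages=[{'role': 'user', 'content':
            'Estimate a CVSS 3.1 base score for this vulnerability. '
            'Reply in the form "CVSS_3.1:<score>".\n'
            'Title: ' + title + '\nDescription: ' + description +
            '\nSeverity: ' + severity_label}],
    )
    content = response['choices'][0]['message']['content']
    match = re.search(r"CVSS_3\.1:\s*(\d+(?:\.\d+)?)", content)
    return match.group(1) if match else ""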
Code:

from lxml import etree
import requests
import openai
import re
from bs4 import BeautifulSoup
import time
import csv
 
'''
# Old approach: collect advisory links with BeautifulSoup, then open each
# advisory page to check whether it was withdrawn; much slower
def github_URL_list():
    url = "https://github.com/advisories"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    url_list = []
 
    for link in soup.find_all("a", {"class": "Link--primary"}):
        if "/advisories/" in link.get("href"):
            url = "https://github.com" + link.get("href")
            if advisories_true(url) :
                url_list.append(url)
    print(url_list)
    return url_list
 
def advisories_true(advisories_url):
    page = requests.get(advisories_url)
    html = etree.HTML(page.content)
    not_withdraw = html.xpath("//main/div/div[2]/span/text()")
    # xpath() returns a list, so test membership instead of comparing to a string
    if "Withdrawn" not in [t.strip() for t in not_withdraw]:
        return True
    else:
        return False
'''
 
# Fetch advisory links from https://github.com/advisories, skipping withdrawn advisories
def github_url_list_true():
    url_list = []
    for i in range(1, 3):  # list pages 1-2; widen the range to scrape more
 
        url = "https://github.com/advisories?page=" + str(i)
 
        page = requests.get(url)
        html = etree.HTML(page.content)
        # Each advisory row on the list page (the first child div is the header row)
        sections = html.xpath("/html/body/div[1]/div/main/div/div[2]/div[2]/div[1]/div[position()>1]")
 
        time.sleep(3)
 
        for section in sections:
            # Re-serialize and re-parse each row so the relative // xpaths
            # below only match inside this row
            section_obj = etree.HTML(etree.tostring(section).decode())

            withdraw = section_obj.xpath("//div/span[@title='Label: withdrawn']/text()")[0] if section_obj.xpath("//div/span[@title='Label: withdrawn']/text()") else ""

            if withdraw.strip() == "withdrawn":
                continue
 
            advisories_url = section_obj.xpath("//div/a/@href")[0] if section_obj.xpath("//div/a/@href") else ""
            advisories_url = "https://github.com" + advisories_url
            url_list.append(advisories_url)
    return url_list
 
 
 
 
# Fetch the star count of the affected project's repository
def git_cve_star(advisories_url):
    advisories_url_page = requests.get(advisories_url)
    advisories_url_html = etree.HTML(advisories_url_page.content)
    github_code_url = advisories_url_html.xpath("/html/body/div[1]/div/main/div/div[2]/div[2]/div[5]/div/a/@href")[0] if advisories_url_html.xpath("/html/body/div[1]/div/main/div/div[2]/div[2]/div[5]/div/a/@href") else ""
    # Check the source-code link itself; advisories_url always starts with
    # github.com, so it cannot tell GitHub-hosted projects apart
    github_link = re.search(r"https://github\.com/", github_code_url)
    if github_link:
        Source_code_url_page = requests.get(github_code_url)
        soup = BeautifulSoup(Source_code_url_page.text, "html.parser")
        element = soup.find('span', {'id': 'repo-stars-counter-star'})
        value = element.text if element else 'none'
    else:
        value = 'none'
    return value
 
# Extract details from an advisory page
def github_cve_detail(advisories_url):
 
    page = requests.get(advisories_url)
    html = etree.HTML(page.content)
    sections = html.xpath("/html/body/div[1]")
 
    for section in sections:
 
        section_obj = etree.HTML(etree.tostring(section).decode())
        result = {}
 
        CVEid = section_obj.xpath("//main/div/div[2]/div[2]/div[3]/div/text()")[0] if section_obj.xpath("//main/div/div[2]/div[2]/div[3]/div/text()") else ""
        if CVEid.strip() != "No known CVE" :
            result['CVEid'] = CVEid.strip()
        else :
            result['CVEid'] = ""
 
        title = section_obj.xpath("//main/div/div[1]/h2/text()")[0] if section_obj.xpath("//main/div/div[1]/h2/text()") else ""
        result['title'] = title.strip()
 
        #CWEtext_elements = section_obj.xpath("//main/div/div[2]/div[2]/div[2]/div/a[@data-hovercard-type='cwe']/text()")[0] if section_obj.xpath("//main/div/div[2]/div[2]/div[2]/div/a[@data-hovercard-type='cwe']/text()") else ""
        # CWE links: xpath() returns a list of matched <a> texts, and an
        # empty list means the page shows "No CWEs"
        cwe_list = []
        CWEtext_x = section_obj.xpath("//a[@data-hovercard-type='cwe']/text()")
        if CWEtext_x:
            for CWEtext in CWEtext_x:
                cwe_list.append(CWEtext.strip())
            result['CWE'] = cwe_list
        else:
            result['CWE'] = ""
 
        cvss_score_Label = section_obj.xpath("//span[contains(@class, 'Label Label--')]/text()")[0] if section_obj.xpath("//span[contains(@class, 'Label Label--')]/text()") else ""
        result['cvss_score_Label'] = cvss_score_Label.replace('severity','').strip()
        cvss_score = section_obj.xpath("//main/div/div[2]/div[2]/div[1]/div[1]/div/div/span[@class='tooltipped tooltipped-n tooltipped-no-delay tooltipped-multiline']/text()")[0] if section_obj.xpath("//main/div/div[2]/div[2]/div[1]/div[1]/div/div/span[@class='tooltipped tooltipped-n tooltipped-no-delay tooltipped-multiline']/text()") else ""
        match = re.search(r"\d+\.\d+|\d", cvss_score)
        if match:
            cvss_score_1 = match.group(0)
            result['cvss_score'] = cvss_score_1
        else:
            print("cvss_score_1未找到")
            result['cvss_score'] = ""
 
        cvss_3_1 = section_obj.xpath("//main/div/div[2]/div[2]/div[1]/div[3]/text()")[0] if section_obj.xpath("//main/div/div[2]/div[2]/div[1]/div[3]/text()") else ""
        match = re.search(r"CVSS:3.1/(.*)", cvss_3_1)
        if match:
            cvss_3_1_1 = match.group(1)
            result['cvss_3_1'] = cvss_3_1_1
        else:
            print("cvss_3_1_1未找到")
            result['cvss_3_1'] = ""
 
        result['time'] = section_obj.xpath("//main/div/div[1]/div/span[3]/relative-time[1]/text()")[0] if section_obj.xpath("//main/div/div[1]/div/span[3]/relative-time[1]/text()") else ""
        result['Affected'] = section_obj.xpath("//main/div/div[2]/div[1]/div[1]/div/div/div[2]/div/text()")[0] if section_obj.xpath("//main/div/div[2]/div[1]/div[1]/div/div/div[2]/div/text()") else ""
        result['Patched'] = section_obj.xpath("//main/div/div[2]/div[1]/div[1]/div/div/div[3]/div/text()")[0] if section_obj.xpath("//main/div/div[2]/div[1]/div[1]/div/div/div[3]/div/text()") else ""
        result['Description'] = section_obj.xpath("//main/div/div[2]/div[1]/div[2]/div[2]/div/p/text()")[0] if section_obj.xpath("//main/div/div[2]/div[1]/div[2]/div[2]/div/p") else ""
        result['url'] = advisories_url
 
 
        result['star'] = git_cve_star(advisories_url)
 
 
    print(result)
    #chatgpt_cve(result)
    return result
 
 
 
 
def chatgpt_cve(result):
    openai.api_key = ''  # fill in your API key
    proxies = {'http': "", 'https': ""}  # set proxies here if needed
    openai.proxy = proxies
    messages = []
    # Optionally give the assistant a persona via the system role
    #messages.append({'role': 'system', 'content': '你是一个信息安全工程师。'})
    messages.append({'role': 'user', 'content':
    '''根据标题和描述对漏洞命名并翻译描述,命名格式为"产品名xx漏洞",xx为漏洞类型,漏洞二字结尾;输出格式为"名称:\n描述:"。'''+ '标题:' + result['title'] + '描述:' + result['Description'] })
    # Call the chat completion endpoint
    response = openai.ChatCompletion.create(
        model='gpt-3.5-turbo',
        messages=messages,
    )
    # Append the assistant's reply to messages
    messages.append({
        'role': response['choices'][0]['message']['role'],
        'content': response['choices'][0]['message']['content'],
    })
 
    print(messages[1])
 
    match_name = re.search(r"名称:(.*?)\n描述", messages[1]['content'])
    if match_name:
        name = match_name.group(1)
        print(name)
        if result['CVEid'] != "" :
            result['name'] = name + "(" + result['CVEid'] + ")"
        else :
             result['name'] = name
    else:
        print("name未找到")
 
    match_description = re.search(r"描述:(.*)", messages[1]['content'])
    if match_description:
        description = match_description.group(1)
        result['Description'] = description
    else:
        print("description未找到")
 
    result['GPT_cvss3_1'] = ""
 
 
    if  result['cvss_score'] != "":
        return
 
    messages.append({'role': 'user', 'content':
    '''根据标题、描述、严重程度对漏洞评出cvss3.1的基本分数及其cvss3向量,输出格式为:CVSS_3.1:1.2\n AV:N/AC:L/PR:N/UI:R/S:U/C:H/I:N/A:N。'''+ '标题:' + result['title'] + '描述:' + result['Description'] + '严重程度:' + result['cvss_score_Label']})
 
    response = openai.ChatCompletion.create(
        model='gpt-3.5-turbo',
        messages=messages,
    )
 
    messages.append({
        'role': response['choices'][0]['message']['role'],
        'content': response['choices'][0]['message']['content'],
    })
    result['GPT_cvss3_1'] = messages[-1]['content']  # the latest assistant reply
    print(messages)
 
 
def save_csv(results):
    with open('data.csv', 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = results[0].keys()
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for result in results:
            writer.writerow(result)
 
if __name__ == "__main__":
 
 
    url_list = github_url_list_true()
 
    results = []
 
    for url_i in url_list:

        print("----------------------------")
        print(url_i)
        time.sleep(5)

        results.append(github_cve_detail(url_i))
    save_csv(results)
    #print(results)
    '''
    url = "https://github.com/advisories/GHSA-f5v5-ccqc-6w36"
    #print(github_cve_detail(url))
    git_cve_star(url)
    #chatgpt_cve()
    #
 
    print(github_url_list_true())
    '''
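Run as-is, the script walks the first two list pages, skips withdrawn advisories, and writes one row per advisory to data.csv with the fields collected above (CVEid, title, CWE, cvss_score_Label, cvss_score, cvss_3_1, time, Affected, Patched, Description, url, star); the name and GPT_cvss3_1 fields are only added if the chatgpt_cve call is uncommented.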
