[Original] Crawler learning: scraping the GitHub Security Advisories page

Posted on: 2023-4-25 11:10
A web-scraping learning exercise: advisory content is pulled from https://github.com/advisories with XPath.
Once the information is fetched, the title and description can be handed to ChatGPT to name the vulnerability and translate the description; when an advisory has no CVSS score, its Low/Moderate/High/Critical severity label can be given to ChatGPT to produce one. A minimal sketch of the scraping step is shown below.
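To illustrate the approach, here is a minimal sketch that fetches a single advisory page and extracts its title with the same XPath the full script uses (the GHSA URL is the test advisory from the script's commented-out section; GitHub's page layout may change and break the path):

from lxml import etree
import requests

# Test advisory page; any GHSA advisory page has the same layout
url = "https://github.com/advisories/GHSA-f5v5-ccqc-6w36"
html = etree.HTML(requests.get(url).content)

# The advisory title sits in the page's <h2>
title = html.xpath("//main/div/div[1]/h2/text()")
print(title[0].strip() if title else "title not found")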
Crawler results (screenshot):

ChatGPT results (screenshot). The output is sometimes not ideal, and the scoring part is untested because the free API key I claimed has expired.
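Since the scoring step is untested, one defensive measure worth adding (my suggestion, not part of the original script) is to validate the CVSS v3.1 base vector in the model's reply with a regex before trusting it:

import re

# CVSS v3.1 base-vector pattern (metric values per the FIRST specification)
CVSS31_VECTOR = re.compile(
    r"AV:[NALP]/AC:[LH]/PR:[NLH]/UI:[NR]/S:[UC]/C:[HLN]/I:[HLN]/A:[HLN]"
)

def extract_vector(gpt_reply):
    # Return the first well-formed CVSS 3.1 vector in the reply, or ""
    match = CVSS31_VECTOR.search(gpt_reply)
    return match.group(0) if match else ""

# Example with a reply in the format the prompt asks for
print(extract_vector("CVSS_3.1:6.5\nAV:N/AC:L/PR:N/UI:R/S:U/C:H/I:N/A:N"))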
Code
from lxml import etree
import requests
import openai
import re
from bs4 import BeautifulSoup
import time
import csv

'''
# Old approach: collect advisory links with BeautifulSoup, then open every
# advisory page to check whether it was withdrawn -- much slower.
def github_URL_list():
    url = "https://github.com/advisories"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    url_list = []
    for link in soup.find_all("a", {"class": "Link--primary"}):
        if "/advisories/" in link.get("href"):
            url = "https://github.com" + link.get("href")
            if advisories_true(url):
                url_list.append(url)
    print(url_list)
    return url_list

def advisories_true(advisories_url):
    page = requests.get(advisories_url)
    html = etree.HTML(page.content)
    # xpath() returns a list, so check its first element
    label = html.xpath("//main/div/div[2]/span/text()")
    if not label or label[0].strip() != "Withdrawn":
        return True
    else:
        return False
'''

# Fetch advisories from https://github.com/advisories, skipping withdrawn ones
def github_url_list_true():
    url_list = []
    for i in range(1, 3):
        url = "https://github.com/advisories?page=" + str(i)
        page = requests.get(url)
        html = etree.HTML(page.content)
        sections = html.xpath("/html/body/div[1]/div/main/div/div[2]/div[2]/div[1]/div[position()>1]")
        time.sleep(3)
        for section in sections:
            section_obj = etree.HTML(etree.tostring(section).decode())
            withdraw_x = section_obj.xpath("//div/span[@title='Label: withdrawn']/text()")
            withdraw = withdraw_x[0] if withdraw_x else ""
            if withdraw.strip() == "withdrawn":
                continue
            href_x = section_obj.xpath("//div/a/@href")
            advisories_url = "https://github.com" + (href_x[0] if href_x else "")
            url_list.append(advisories_url)
    return url_list


# Get the star count of the repository linked from the advisory page
def git_cve_star(advisories_url):
    advisories_page = requests.get(advisories_url)
    advisories_html = etree.HTML(advisories_page.content)
    code_url_x = advisories_html.xpath("/html/body/div[1]/div/main/div/div[2]/div[2]/div[5]/div/a/@href")
    github_code_url = code_url_x[0] if code_url_x else ""
    # Only follow the link when it points at a GitHub repository
    if github_code_url and re.search("https://github.com/", github_code_url):
        source_page = requests.get(github_code_url)
        soup = BeautifulSoup(source_page.text, "html.parser")
        element = soup.find('span', {'id': 'repo-stars-counter-star'})
        value = element.text if element else 'none'
    else:
        value = 'none'
    return value


# Extract the details from a single advisory page
def github_cve_detail(advisories_url):
    page = requests.get(advisories_url)
    html = etree.HTML(page.content)
    sections = html.xpath("/html/body/div[1]")
    for section in sections:
        section_obj = etree.HTML(etree.tostring(section).decode())
        result = {}

        cveid_x = section_obj.xpath("//main/div/div[2]/div[2]/div[3]/div/text()")
        CVEid = cveid_x[0] if cveid_x else ""
        if CVEid.strip() != "No known CVE":
            result['CVEid'] = CVEid.strip()
        else:
            result['CVEid'] = ""

        title_x = section_obj.xpath("//main/div/div[1]/h2/text()")
        result['title'] = title_x[0].strip() if title_x else ""

        # Collect the CWE links; the panel literally says "No CWEs" when none are assigned
        cwe_container_x = section_obj.xpath("//main/div/div[2]/div[2]/div[2]/div")
        cwe_list = []
        if cwe_container_x:
            container_text = "".join(cwe_container_x[0].xpath(".//text()")).strip()
            if container_text != "No CWEs":
                for CWEtext in cwe_container_x[0].xpath(".//a[@data-hovercard-type='cwe']/text()"):
                    cwe_list.append(CWEtext.strip())
        result['CWE'] = cwe_list if cwe_list else ""

        label_x = section_obj.xpath("//span[contains(@class, 'Label Label--')]/text()")
        cvss_score_Label = label_x[0] if label_x else ""
        result['cvss_score_Label'] = cvss_score_Label.replace('severity', '').strip()

        score_x = section_obj.xpath("//main/div/div[2]/div[2]/div[1]/div[1]/div/div/span[@class='tooltipped tooltipped-n tooltipped-no-delay tooltipped-multiline']/text()")
        cvss_score = score_x[0] if score_x else ""
        match = re.search(r"\d+\.\d+|\d", cvss_score)
        if match:
            result['cvss_score'] = match.group(0)
        else:
            print("cvss_score not found")
            result['cvss_score'] = ""

        vector_x = section_obj.xpath("//main/div/div[2]/div[2]/div[1]/div[3]/text()")
        cvss_3_1 = vector_x[0] if vector_x else ""
        match = re.search(r"CVSS:3\.1/(.*)", cvss_3_1)
        if match:
            result['cvss_3_1'] = match.group(1)
        else:
            print("cvss_3_1 vector not found")
            result['cvss_3_1'] = ""

        time_x = section_obj.xpath("//main/div/div[1]/div/span[3]/relative-time[1]/text()")
        result['time'] = time_x[0] if time_x else ""
        affected_x = section_obj.xpath("//main/div/div[2]/div[1]/div[1]/div/div/div[2]/div/text()")
        result['Affected'] = affected_x[0] if affected_x else ""
        patched_x = section_obj.xpath("//main/div/div[2]/div[1]/div[1]/div/div/div[3]/div/text()")
        result['Patched'] = patched_x[0] if patched_x else ""
        desc_x = section_obj.xpath("//main/div/div[2]/div[1]/div[2]/div[2]/div/p/text()")
        result['Description'] = desc_x[0] if desc_x else ""
        result['url'] = advisories_url
        result['star'] = git_cve_star(advisories_url)
        print(result)
        # chatgpt_cve(result)
        return result


def chatgpt_cve(result):
    openai.api_key = ''                    # fill in your OpenAI API key
    proxies = {'http': '', 'https': ''}    # fill in a proxy if you need one
    openai.proxy = proxies
    messages = []
    # A system message could give the assistant a persona:
    # messages.append({'role': 'system', 'content': 'You are an information security engineer.'})
    # Prompt (Chinese): name the vulnerability as "<product> <type> vulnerability"
    # and translate the description; reply in the format "名称:...\n描述:..."
    messages.append({'role': 'user',
                     'content': '根据标题和描述对漏洞命名并翻译描述,命名格式为"产品名xx漏洞",xx为漏洞类型,漏洞二字结尾;输出格式为"名称:\n描述:"。'
                                + '标题: ' + result['title'] + ' 描述: ' + result['Description']})
    response = openai.ChatCompletion.create(
        model='gpt-3.5-turbo',
        messages=messages,
    )
    # Append the assistant's answer to the conversation
    messages.append({
        'role': response['choices'][0]['message']['role'],
        'content': response['choices'][0]['message']['content'],
    })
    print(messages[-1])

    match_name = re.search(r"名称:(.*?)\n描述", messages[-1]['content'])
    if match_name:
        name = match_name.group(1)
        print(name)
        if result['CVEid'] != "":
            result['name'] = name + "(" + result['CVEid'] + ")"
        else:
            result['name'] = name
    else:
        print("name not found")

    match_description = re.search(r"描述:(.*)", messages[-1]['content'])
    if match_description:
        result['Description'] = match_description.group(1)
    else:
        print("description not found")

    result['GPT_cvss3_1'] = ""
    # Only ask ChatGPT for a score when the page itself had no CVSS score
    if result['cvss_score'] != "":
        return
    # Prompt (Chinese): derive a CVSS 3.1 base score and vector from the
    # title, description and severity label
    messages.append({'role': 'user',
                     'content': '根据标题、描述、严重程度对漏洞评出cvss3.1的基本分数及其cvss3向量,输出格式为"CVSS_3.1:1.2\nAV:N/AC:L/PR:N/UI:R/S:U/C:H/I:N/A:N"。'
                                + '标题: ' + result['title'] + ' 描述: ' + result['Description']
                                + ' 严重程度: ' + result['cvss_score_Label']})
    response = openai.ChatCompletion.create(
        model='gpt-3.5-turbo',
        messages=messages,
    )
    messages.append({
        'role': response['choices'][0]['message']['role'],
        'content': response['choices'][0]['message']['content'],
    })
    result['GPT_cvss3_1'] = messages[-1]['content']
    print(messages)


def save_csv(results):
    with open('data.csv', 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = list(results[0].keys())
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for result in results:
            writer.writerow(result)


if __name__ == "__main__":
    url_list = github_url_list_true()
    results = []
    for Url_i in url_list:
        print("----------------------------")
        print(Url_i)
        time.sleep(5)
        detail = github_cve_detail(Url_i)
        if detail:
            results.append(detail)
    if results:
        save_csv(results)
    # print(results)

    '''
    url = "https://github.com/advisories/GHSA-f5v5-ccqc-6w36"
    # print(github_cve_detail(url))
    git_cve_star(url)
    # chatgpt_cve()
    # print(github_url_list_true())
    '''
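One practical hardening step (my addition, not in the original script): GitHub may throttle rapid anonymous requests, so routing all fetches through a shared requests.Session with a User-Agent header and automatic retries on throttling errors makes the scraper sturdier:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_session():
    # Session with a browser-like User-Agent and retries on 429/5xx responses
    session = requests.Session()
    session.headers.update({"User-Agent": "Mozilla/5.0 (advisory-crawler)"})
    retry = Retry(total=3, backoff_factor=2,
                  status_forcelist=[429, 500, 502, 503])
    session.mount("https://", HTTPAdapter(max_retries=retry))
    return session

# Usage: swap the bare requests.get(...) calls above for session.get(...)
session = make_session()
page = session.get("https://github.com/advisories?page=1")
print(page.status_code)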