Since this is hands-on and aimed at complete beginners, a separate article will cover the related theory.
Q: What if I know nothing about AI?
A: No problem. The author basically programs by asking GPT too, so total beginners can give it a try ^_^
Q: Why write a large model myself? Today's large models are already very mature, so why build my own?
A: Partly because of the recent DeepSeek buzz, which made AI feel really interesting. But mostly because writing one yourself is just cool. Everything goes from zero to one: if you first build a model that is entirely your own, then even if it keeps throwing errors and keeps answering questions wrong, the sense of achievement the moment you finish is incomparable. With achievement and interest, you can keep learning and understanding more, step by step ^_^
You need a fairly recent Python version; I used 3.12, and 3.8 does not work.
If you keep hitting version-related errors, just pip install the packages directly (if there are no errors, use requirements.txt).
The URLs in there are already deduplicated; the normal workflow is as follows.
The scraped HTML pages are stored in the scraped folder and compressed into archives.
This takes quite a while; I just left it running overnight. Once it reports done, I assume it finished successfully.


Change save_parsed_file in extract_text.py to the following (it creates the output directory first, so missing folders don't cause write errors):

This extracts the text content from the HTML and saves it as .txt files.
-- If it errors out partway and you have to start over, remember to delete the previously extracted files; the folder is inside scraped.
Modify tokenize_text.py as follows:

The file hierarchy is as follows:

dataset.py: used to process the dataset.
gpt.py: implements the GPT model.
transformer_block.py: implements a basic building block of the GPT model, the Transformer block (a minimal sketch of it is given after the gpt.py code below).
inference.py: the inference script; it loads the trained model and runs inference (generates predictions).
Then the run order: train the model with train/train.py first, then run inference.py to load the trained model and generate text.
An accurate model takes deep theory and training on a huge amount of data. Ours sounds robotic and isn't the least bit intelligent, but it's still a first step into unknown territory, isn't it?
pip install numpy tqdm matplotlib
git clone https://github.com/JCPETERSON/OpenwebText.git
cd OpenwebText
pip install -r requirements.txt
pip install beautifulsoup4 certifi chardet cssselect feedfinder2 feedparser htmlmin idna jieba3k lxml newspaper3k nltk numpy pandas pillow python-dateutil pytorch-pretrained-bert pytz pyyaml recordtype requests-file requests singledispatch six soupsieve spacy tinysegmenter tldextract tqdm urllib3 urlparse2 pycurl pebble chardet transformers
Extract URLs:
python extract_urls.py --single_file pushshift_dumps/RS_v2_2005-06.xz
To extract URLs for a time range:
python extract_urls.py --year_start 2016 --year_end 2018
Deduplicate the URLs:
python deduplicate_urls.py --input_dir url_dumps
python312 download.py D:\Tools\openwebtext\URLs\RS_2011-01.bz2.deduped.txt --n_procs 100 --scraper raw --chunk_size 100000 --compress --timeout 30
pip install --upgrade newspaper3k
def save_parsed_file(filename, text, out_dir):
    file_path = os.path.join(out_dir, filename)
    # Create the output directory if it does not exist yet.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, 'w', encoding='utf-8') as handle:
        handle.write(text)
python312 extract_text.py --html_archive scraped/RS_2011-01-1_data.xz --n_procs 100
python -m spacy download en_core_web_sm
import spacy
import io
import argparse
import glob
import os
import tqdm
from multiprocessing import Pool
from functools import partial
import chardet


def detect_encoding(file_path):
    # Sniff the file's encoding from its first 1 KB; fall back to UTF-8.
    with open(file_path, 'rb') as f:
        raw_data = f.read(1024)
    result = chardet.detect(raw_data)
    return result['encoding'] or 'utf-8'


def save_tokenized_text(output_dir, filename, text):
    text_file = os.path.join(output_dir, filename)
    os.makedirs(os.path.dirname(text_file), exist_ok=True)
    with io.open(text_file, 'w', encoding='utf-8') as fo:
        fo.write(text)


def tokenizeSpacy(args):
    nlp = spacy.load("en_core_web_sm")
    extraction_file_paths = glob.glob(args.input_glob)

    for extraction_file_path in extraction_file_paths:
        path, filename = os.path.split(extraction_file_path)
        text_file = os.path.join(
            args.output_dir, filename.replace('.txt', '.tokenized.txt'))
        os.makedirs(os.path.dirname(text_file), exist_ok=True)

        file_encoding = detect_encoding(extraction_file_path)
        try:
            with io.open(extraction_file_path, 'r', encoding=file_encoding) as fi, \
                 io.open(text_file, 'w', encoding='utf-8') as fo:
                omitted_line_count = 0
                for line in fi:
                    if len(line.strip()) > 0:
                        doc = nlp(line)
                        # Write space-separated spaCy tokens, one source line per output line.
                        fo.write(' '.join([x.text for x in doc]) + '\n')
                    else:
                        omitted_line_count += 1
                print(f'Omitted {omitted_line_count} empty lines from {filename}')
        except UnicodeDecodeError:
            print(f"Failed to decode {extraction_file_path} with encoding {file_encoding}. Skipping this file.")


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_glob', type=str, default='*.txt')
    parser.add_argument('--output_dir', type=str, default='tokenized')
    parser.add_argument('--tokenizer', type=str, default='spacy',
                        choices=['spacy', 'gpt2'])
    parser.add_argument('--combine', type=int, default=1e8,
                        help="min tokens per file in gpt2 mode")
    parser.add_argument('--file_bs', type=int, default=10000,
                        help="files per batch in gpt2 mode")
    args = parser.parse_args()

    os.makedirs(args.output_dir, exist_ok=True)

    if args.tokenizer == 'spacy':
        tokenizeSpacy(args)
    else:
        print("GPT-2 tokenizer is not implemented in this version.")
python312 tokenize_text.py --input_glob "parsed/RS_2011-01/*.txt" --output_dir tokenized
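To get a feel for what this step produces, here is a tiny illustration of the spaCy tokenization that tokenize_text.py applies to each line; it assumes en_core_web_sm is installed (downloaded above), and the sample sentence is just an example:

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("It's a test, isn't it?")
# Tokens are re-joined with single spaces, the same way tokenize_text.py writes them out.
print(' '.join(tok.text for tok in doc))
# -> It 's a test , is n't it ?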
gpt_project/
├── model/
│   ├── gpt.py
│   ├── transformer_block.py
├── data/
│   ├── dataset.py
│   ├── tokenizer.py
│   ├── tokenized/
├── train/
│   ├── train.py
├── train_model/
├── inference.py
import torch
import os
from collections import Counter
from transformers import AutoTokenizer


class TextDataset(torch.utils.data.Dataset):
    def __init__(self, directory_path, seq_length, tokenizer):
        self.seq_length = seq_length
        self.tokenizer = tokenizer
        self.data = []
        self.vocab = {}
        self.inverse_vocab = {}

        # First pass: count every word in the tokenized files to build the vocabulary.
        word_counter = Counter()
        for filename in os.listdir(directory_path):
            if filename.endswith(".tokenized.txt"):
                file_path = os.path.join(directory_path, filename)
                with open(file_path, "r", encoding="utf-8") as f:
                    words = f.read().split()
                    word_counter.update(words)

        # Reserve 0 for <pad>; <unk> gets the next free id.
        self.vocab = {word: idx + 1 for idx, (word, _) in enumerate(word_counter.items())}
        self.vocab['<pad>'] = 0
        self.vocab['<unk>'] = len(self.vocab)
        self.inverse_vocab = {idx: word for word, idx in self.vocab.items()}

        # Second pass: map each file's words to token ids.
        for filename in os.listdir(directory_path):
            if filename.endswith(".tokenized.txt"):
                file_path = os.path.join(directory_path, filename)
                with open(file_path, "r", encoding="utf-8") as f:
                    words = f.read().split()
                    token_ids = [self.vocab.get(word, self.vocab['<unk>']) for word in words]
                    self.data.append(token_ids)

        # Pad or truncate every sequence to exactly seq_length tokens.
        self.data = [self.pad_sequence(seq) for seq in self.data]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_text = self.data[idx]
        input_ids = torch.tensor(input_text)
        target_ids = input_ids.clone()
        return input_ids, target_ids

    def pad_sequence(self, seq):
        if len(seq) < self.seq_length:
            seq += [self.vocab['<pad>']] * (self.seq_length - len(seq))
        else:
            seq = seq[:self.seq_length]
        return seq
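To check that the dataset loads before wiring up training, here is a minimal usage sketch. The directory path, seq_length, and batch_size are illustrative assumptions, not values from this article; the tokenizer argument is accepted but unused by this class, so None is passed.

from torch.utils.data import DataLoader

# Assumed path and hyperparameters, for illustration only.
dataset = TextDataset("data/tokenized", seq_length=128, tokenizer=None)
loader = DataLoader(dataset, batch_size=16, shuffle=True)

for input_ids, target_ids in loader:
    print(input_ids.shape)   # e.g. torch.Size([16, 128])
    break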
import torch
import torch.nn as nn
import os
import sys
import torch.nn.functional as F

# Make the project root importable so "model.transformer_block" resolves.
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
print("Adding to sys.path:", project_root)
sys.path.append(project_root)

from model.transformer_block import TransformerBlock


class GPT(nn.Module):
    def __init__(self, vocab_size, embed_size, num_heads, num_layers, max_length):
        super(GPT, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_length, embed_size)
        self.blocks = nn.ModuleList([
            TransformerBlock(embed_size, num_heads, embed_size * 4)
            for _ in range(num_layers)
        ])
        self.fc_out = nn.Linear(embed_size, vocab_size)

    def forward(self, x):
        batch_size, seq_length = x.shape
        positions = torch.arange(0, seq_length).expand(batch_size, seq_length)
        # Token embedding + learned position embedding, then the stack of Transformer blocks.
        x = self.embedding(x) + self.position_embedding(positions)
        for block in self.blocks:
            x = block(x)
        return self.fc_out(x)

    def generate(self, input_ids, max_length=100, temperature=1.0, top_k=50):
        self.eval()
        generated_ids = input_ids
        for _ in range(max_length):
            outputs = self(generated_ids)
            # Only the logits at the last position matter for predicting the next token.
            logits = outputs[:, -1, :] / temperature
            if top_k > 0:
                # Sample from the top-k most likely tokens.
                top_k_values, top_k_indices = torch.topk(logits, top_k)
                top_k_probs = F.softmax(top_k_values, dim=-1)
                next_token = torch.multinomial(top_k_probs, 1)
                next_token = top_k_indices.gather(-1, next_token)
            else:
                probs = F.softmax(logits, dim=-1)
                next_token = torch.multinomial(probs, 1)
            generated_ids = torch.cat([generated_ids, next_token], dim=-1)
        return generated_ids
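gpt.py imports TransformerBlock from model/transformer_block.py, which is not shown above. Below is a minimal sketch of what that file might look like; it is an assumption that only matches how gpt.py uses the class (constructor TransformerBlock(embed_size, num_heads, hidden_dim) and a forward that maps a (batch, seq, embed) tensor to the same shape, with a causal mask so the model cannot peek ahead), not necessarily the author's exact implementation:

import torch
import torch.nn as nn


class TransformerBlock(nn.Module):
    def __init__(self, embed_size, num_heads, hidden_dim):
        super().__init__()
        # Self-attention; batch_first matches the (batch, seq, embed) tensors used in gpt.py.
        self.attention = nn.MultiheadAttention(embed_size, num_heads, batch_first=True)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)
        # Position-wise feed-forward network.
        self.ff = nn.Sequential(
            nn.Linear(embed_size, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, embed_size),
        )

    def forward(self, x):
        seq_len = x.size(1)
        # Causal mask: True entries are positions a token is NOT allowed to attend to (its future).
        mask = torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool, device=x.device), diagonal=1)
        attn_out, _ = self.attention(x, x, x, attn_mask=mask)
        # Residual connection around attention, then around the feed-forward layer.
        x = self.norm1(x + attn_out)
        x = self.norm2(x + self.ff(x))
        return x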
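Finally, a hedged sketch of how inference.py might tie the pieces together: rebuild the vocabulary from the same tokenized data, restore the trained weights, and call generate. Every path and hyperparameter below is an assumption for illustration (run it from the gpt_project root so the imports resolve), not a value taken from this article:

import torch
from data.dataset import TextDataset
from model.gpt import GPT

# Assumed values; they must match whatever train.py actually used.
dataset = TextDataset("data/tokenized", seq_length=128, tokenizer=None)
model = GPT(vocab_size=len(dataset.vocab), embed_size=256,
            num_heads=8, num_layers=4, max_length=256)
# model.load_state_dict(torch.load("train_model/gpt.pt"))  # hypothetical checkpoint path

prompt = "the quick brown fox"
ids = [dataset.vocab.get(w, dataset.vocab['<unk>']) for w in prompt.split()]
input_ids = torch.tensor([ids])                        # shape: (1, prompt_length)

output_ids = model.generate(input_ids, max_length=20, temperature=1.0, top_k=50)
print(' '.join(dataset.inverse_vocab[i] for i in output_ids[0].tolist()))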