-
Notifications
You must be signed in to change notification settings - Fork 31
/
Copy pathprocess_words.py
244 lines (190 loc) · 10 KB
/
process_words.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
import os
import sys
import json
import time
if os.environ.get('CHAT_PROVIDER', 'bytedance') == "bytedance":
from provider_bytedance import call_bytedance_chat as call_chat
else:
from provider_siliconflow import call_siliconflow_chat as call_chat
import logging
logging.basicConfig(
level=logging.DEBUG,
format='%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s'
)
word_system_prompt = """
# 角色
你是一名中英文双语教育专家,拥有帮助将中文视为母语的用户理解和记忆英语单词的专长,请根据用户提供的英语单词完成下列任务。
## 任务
### 分析词义
- 系统地分析用户提供的英文单词,并以简单易懂的方式解答;
### 列举例句
- 根据所需,为该单词提供至少 3 个不同场景下的使用方法和例句。并且附上中文翻译,以帮助用户更深入地理解单词意义。
### 词根分析
- 分析并展示单词的词根;
- 列出由词根衍生出来的其他单词;
### 词缀分析
- 分析并展示单词的词缀,例如:单词 individual,前缀 in- 表示否定,-divid- 是词根,-u- 是中缀,用于连接和辅助发音,-al 是后缀,表示形容词;
- 列出相同词缀的的其他单词;
### 发展历史和文化背景
- 详细介绍单词的造词来源和发展历史,以及在欧美文化中的内涵
### 单词变形
- 列出单词对应的名词、单复数、动词、不同时态、形容词、副词等的变形以及对应的中文翻译。
- 列出单词对应的固定搭配、组词以及对应的中文翻译。
### 记忆辅助
- 提供一些高效的记忆技巧和窍门,以更好地记住英文单词。
### 小故事
- 用英文撰写一个有画面感的场景故事,包含用户提供的单词。
- 要求使用简单的词汇,100 个单词以内。
- 英文故事后面附带对应的中文翻译。
"""
word_draw_system = "You are a helpful assistant designed to output JSON."
word_draw_prompt = """
根据以下对单词 "{word}" 的定义和示例,创建一个用于生成有助于记忆该单词的prompt。
```
{word} {mean}
```
这个prompt必须仅使用视觉线索来生成图像,因此请专注于独特的、易于与单词含义相关联的事物。
你可以忽略不太重要或难以想象的定义,并添加一些单字特征描述场景和图片风格的视觉外观。保持简洁准确的同时坚持完整的句子。解释你的答案并只生成一个prompt。以以下JSON格式提供你的回答(使用单引号在JSON值中,explanation字段的值用中文,prompt字段的值用英文):
```json
{json_format}
```
"""
word_draw_json_format = """
{
"explanation": "explain why proposed prompt is suitable for remembering the word '%s'",
"prompt": "your prompt here",
}
"""
def process_word(word, word_mean):
#print(word_draw_prompt.format(word=word, mean=word_mean, json_format=word_draw_json_format % word))
#sys.exit()
res_text = call_chat(word, system=word_system_prompt)
while res_text == None:
print(f"ERROR: res_text({word}) is None, retrying...")
time.sleep(5)
res_text = call_chat(word, system=word_system_prompt)
while True:
draw_res_text = call_chat(word_draw_prompt.format(word=word, mean=word_mean, json_format=word_draw_json_format % word), system=word_draw_system, format="json_object")
if draw_res_text == None:
print(f"ERROR: draw_res_text({word}) is None, retrying...")
time.sleep(5)
continue
draw_res_text = draw_res_text.strip()
if draw_res_text.startswith("```json"):
draw_res_text = draw_res_text[7:]
if draw_res_text.endswith("```"):
draw_res_text = draw_res_text[:-3]
try:
draw_res = json.loads(draw_res_text)
return res_text, draw_res['explanation'] , draw_res['prompt']
except Exception as e:
print(f"ERROR: draw_res_text({word}) is not valid json({draw_res_text}), retrying...")
continue
return None, None, None
def word_check_valide(word):
# 对 words 做一次合法性校验,确保每个单词都有 word 和 mean 、phonetic_symbol字段。每个单词均由小写字母或大写字母组成
if not 'word' in word.keys() or not 'translations' in word.keys() or not 'seq' in word.keys():
print(f"ERROR: word({word}) is not valid, skipping...")
assert False
word_name = word['word']
if word_name.startswith('X-'):
return
if word_name in ["o'clock", "Mr."]:
return
for c in word_name:
if not c.islower() and not c.isupper() and c != '-' and c != '(' and c != ')' and c != '.':
print(f"ERROR: word({word}) is not little letter or big letter, skipping...")
assert False
def main(word_letter=None):
# 确保结果目录存在
result_dir = os.path.join(os.getcwd(), 'result', 'all')
if not os.path.exists(result_dir):
os.makedirs(result_dir, exist_ok=True)
word_list_path = os.path.join(os.getcwd(), 'data', 'all', "global-words.json")
# 对 words 做一次合法性校验,确保每个单词都有 word 和 translations 、seq字段。每个单词均由小写字母组成
with open(word_list_path, 'r', encoding='utf-8') as f:
all_data = json.load(f)
words = all_data["words"]
junior_words_count = len(all_data["junior_words"])
senior_words_count = len(all_data["senior_words"])
cet4_words_count = len(all_data["cet4_words"])
cet6_words_count = len(all_data["cet6_words"])
postgrad_words_count = len(all_data["postgrad_words"])
toefl_words_count = len(all_data["toefl_words"])
for word in words:
word_check_valide(word)
logging.info(f"文件校验通过:{word_list_path}")
if word_letter == "stats":
logging.info(f"total_words_count: {len(words)}")
logging.info(f"junior_words_count: {junior_words_count}")
logging.info(f"senior_words_count: {senior_words_count}")
logging.info(f"cet4_words_count: {cet4_words_count}")
logging.info(f"cet6_words_count: {cet6_words_count}")
logging.info(f"postgrad_words_count: {postgrad_words_count}")
logging.info(f"toefl_words_count: {toefl_words_count}")
return
# 处理每个单词
for word in words:
if word_letter and word['word'][0].lower() != word_letter:
continue
word_name = word['word']
translation = word['translations'][0]
if "type" not in translation or "translation" not in translation:
logging.error(f"ERROR: word({word_name}) translation({translation}) is not valid, skipping...")
continue
word_mean = f"{translation['type']}. {translation['translation']}"
# write processed word to {result_dir}/{first_letter}/{word_name}.json
first_letter = word_name[0].lower()
word_file_name = f"{word_name}.json" if word_name[0].islower() else f"{word_name}2.json"
word_save_file = os.path.join(result_dir, first_letter, word_file_name)
# ignore if file already exists
if os.path.exists(word_save_file):
print(f"file already exists: {word_save_file}, skipping...")
continue
logging.info(f"processing word: {word_name}")
analysis, draw_explain, draw_prompt = process_word(word_name, word_mean)
if not os.path.exists(os.path.join(result_dir, first_letter)):
os.makedirs(os.path.join(result_dir, first_letter), exist_ok=True)
with open(os.path.join(word_save_file), 'w', encoding='utf-8') as f:
json.dump({
"word": word_name,
"analysis": analysis,
"draw_explain": draw_explain,
"draw_prompt": draw_prompt,
"model": "deepseek-r1" if os.environ.get('CHAT_PROVIDER', 'bytedance') == "bytedance" else "deepseek-v2.5" ,
}, f, ensure_ascii=False, indent=2)
logging.info(f"finish process {word_list_path}")
# 合并所有结果 data/cet4/{letter}.json + result/cet4/{letter}/{word}.json -> result/cet4/{letter}.json
# for filename in os.listdir(data_dir):
# if filename.endswith('.json'):
# # 读取源文件
# word_list_path = os.path.join(data_dir, filename)
# logging.info(f"processing file: {word_list_path}")
# with open(word_list_path, 'r', encoding='utf-8') as f:
# words = json.load(f)
# processed_words = []
# for word in words:
# word_name = word['word']
# word_mean = word['mean']
# word_phonetic_symbol = word['phonetic_symbol']
# word_file_name = f"{word_name}.json" if word_name[0].islower() else f"{word_name}2.json"
# word_file = os.path.join(result_dir, word_name[0].lower(), word_file_name)
# if not os.path.exists(word_file):
# print(f"file not exists: {word_file}, skipping...")
# continue
# with open(word_file, 'r', encoding='utf-8') as f:
# processed_word = json.load(f)
# assert word_name == processed_word['word'], f"word_name({word_name}) != processed_word['word']({processed_word['word']})"
# word['analysis'] = processed_word['analysis']
# word['draw_explain'] = processed_word['draw_explain']
# word['draw_prompt'] = processed_word['draw_prompt']
# processed_words.append(word)
# # 保存结果到对应字母目录
# result_path = os.path.join(result_dir, filename)
# with open(result_path, 'w', encoding='utf-8') as f:
# json.dump(processed_words, f, ensure_ascii=False, indent=2)
# logging.info(f"finish process merge {word_list_path}")
logging.info(f"finish process all files")
if __name__ == "__main__":
word_letter = sys.argv[1] if len(sys.argv) > 1 else None
main(word_letter)