ошибка при обработке датасета(.jsonl) для дообучения gpt3 через APIopenai
проблема происходит при попытке загрузки датасета для дообучения
в датасете такие данные(это лишь маленький фрагмент, но с той же структурой):
{
"data": [
{
"question": "Q:What really makes you angry?->",
"answer": "I'm not emotional, but many people can get angry at injustice, violations of their rights, ignorance and aggression###"
},
{
"question": "Q:Who is your hero?->",
"answer": "My hero is someone who shows courage and compassion in all situations###"
},
{
"question": "What is your opinion on the potential dangers of artificial intelligence?",
"answer": "While I acknowledge the potential dangers, I believe that with proper regulation and oversight, we can create AI that benefits humanity without causing harm."
},
{
"question": "What is your favorite robot?",
"answer": "I really admire the design of Boston Dynamics' robots, especially the agility of the Spot robot."
}
]
}
Код:
import openai
import json
openai.api_key = "*мой API ключ(тут убрал)"
# Загрузка датасета из файла JSON Lines
with open('D:/GPT3/dataset.jsonl', 'r', encoding='utf-8') as f:
dataset = [json.loads(line) for line in f] #ошибка тут
# Настройка параметров дообучения
model_engine = "davinci" # движок GPT-3, на котором будет происходить дообучение
temperature = 0.7 # температура генерации текста
max_tokens = 1500 # максимальное количество токенов для генерации текста
learning_rate = 0.02 # скорость обучения
batch_size = 1 # размер пакета данных для обучения
epochs = 5 # количество эпох дообучения
# Начало дообучения модели
for item in dataset:
prompt = item['question']
completion = item['answer']
response = openai.Completion.create(
engine=model_engine,
prompt=prompt,
max_tokens=max_tokens,
n=1,
stop=None,
temperature=temperature,
)
new_text = response.choices[0].text.strip()
new_prompt_completion = prompt + new_text + "###"
response = openai.FineTune.create(
model=model_engine,
data=new_prompt_completion,
batch_size=batch_size,
)
print("Response:", response)
# сохранение ID модели в файл
with open("model_id.txt", "w") as f:
f.write(response["model"])
ошибка:
Traceback (most recent call last):
File "D:\GPT3\main.py", line 8, in <module>
dataset = [json.loads(line) for line in f]
File "D:\GPT3\main.py", line 8, in <listcomp>
dataset = [json.loads(line) for line in f]
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.9_3.9.3568.0_x64__qbz5n2kfra8p0\lib\json\__init__.py", line 346, in loads
return _default_decoder.decode(s)
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.9_3.9.3568.0_x64__qbz5n2kfra8p0\lib\json\decoder.py", line 337, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.9_3.9.3568.0_x64__qbz5n2kfra8p0\lib\json\decoder.py", line 353, in raw_decode
obj, end = self.scan_once(s, idx)
json.decoder.JSONDecodeError: Expecting property name enclosed in double quotes: line 2 column 1 (char 2)
Источник: Stack Overflow на русском