
学べること
- ナレッジベースを構築する。
- 関連するドキュメントを検索する取得ステップを含む RAG アプリケーションを作成する。
- Weave で取得ステップをトラッキングする。
- LLM judge を使って RAG アプリケーションを評価し、コンテキストの適合率を測定する。
- カスタムのスコアリング関数を定義する。
前提条件
- W&Bアカウント
- Python 3.10+ または Node.js 18+
- 必要なパッケージがインストールされていること:
  - Python: pip install weave openai
  - TypeScript: npm install weave openai
- OpenAI API キーが環境変数として設定されていること。
ナレッジベースを構築する
- Python
- TypeScript
from openai import OpenAI
import weave
from weave import Model
import numpy as np
import json
import asyncio
articles = [
"Novo Nordisk and Eli Lilly rival soars 32 percent after promising weight loss drug results Shares of Denmarks Zealand Pharma shot 32 percent higher in morning trade, after results showed success in its liver disease treatment survodutide, which is also on trial as a drug to treat obesity. The trial “tells us that the 6mg dose is safe, which is the top dose used in the ongoing [Phase 3] obesity trial too,” one analyst said in a note. The results come amid feverish investor interest in drugs that can be used for weight loss.",
"Berkshire shares jump after big profit gain as Buffetts conglomerate nears $1 trillion valuation Berkshire Hathaway shares rose on Monday after Warren Buffetts conglomerate posted strong earnings for the fourth quarter over the weekend. Berkshires Class A and B shares jumped more than 1.5%, each. Class A shares are higher by more than 17% this year, while Class B has gained more than 18%. Berkshire was last valued at $930.1 billion, up from $905.5 billion where it closed on Friday, according to FactSet. Berkshire on Saturday posted fourth-quarter operating earnings of $8.481 billion, about 28 percent higher than the $6.625 billion from the year-ago period, driven by big gains in its insurance business. Operating earnings refers to profits from businesses across insurance, railroads and utilities. Meanwhile, Berkshires cash levels also swelled to record levels. The conglomerate held $167.6 billion in cash in the fourth quarter, surpassing the $157.2 billion record the conglomerate held in the prior quarter.",
"Highmark Health says its combining tech from Google and Epic to give doctors easier access to information Highmark Health announced it is integrating technology from Google Cloud and the health-care software company Epic Systems. The integration aims to make it easier for both payers and providers to access key information they need, even if its stored across multiple points and formats, the company said. Highmark is the parent company of a health plan with 7 million members, a provider network of 14 hospitals and other entities",
"Rivian and Lucid shares plunge after weak EV earnings reports Shares of electric vehicle makers Rivian and Lucid fell Thursday after the companies reported stagnant production in their fourth-quarter earnings after the bell Wednesday. Rivian shares sank about 25 percent, and Lucids stock dropped around 17 percent. Rivian forecast it will make 57,000 vehicles in 2024, slightly less than the 57,232 vehicles it produced in 2023. Lucid said it expects to make 9,000 vehicles in 2024, more than the 8,428 vehicles it made in 2023.",
"Mauritius blocks Norwegian cruise ship over fears of a potential cholera outbreak Local authorities on Sunday denied permission for the Norwegian Dawn ship, which has 2,184 passengers and 1,026 crew on board, to access the Mauritius capital of Port Louis, citing “potential health risks.” The Mauritius Ports Authority said Sunday that samples were taken from at least 15 passengers on board the cruise ship. A spokesperson for the U.S.-headquartered Norwegian Cruise Line Holdings said Sunday that 'a small number of guests experienced mild symptoms of a stomach-related illness' during Norwegian Dawns South Africa voyage.",
"Intuitive Machines lands on the moon in historic first for a U.S. company Intuitive Machines Nova-C cargo lander, named Odysseus after the mythological Greek hero, is the first U.S. spacecraft to soft land on the lunar surface since 1972. Intuitive Machines is the first company to pull off a moon landing — government agencies have carried out all previously successful missions. The company's stock surged in extended trading Thursday, after falling 11 percent in regular trading.",
"Lunar landing photos: Intuitive Machines Odysseus sends back first images from the moon Intuitive Machines cargo moon lander Odysseus returned its first images from the surface. Company executives believe the lander caught its landing gear sideways on the moon's surface while touching down and tipped over. Despite resting on its side, the company's historic IM-1 mission is still operating on the moon.",
]
def docs_to_embeddings(docs: list) -> list:
    """Embed every document with OpenAI's ``text-embedding-3-small`` model.

    One embeddings request is issued per document.

    Args:
        docs: Document texts to embed.

    Returns:
        A list of embedding vectors, in the same order as ``docs``.
    """
    openai = OpenAI()
    document_embeddings = []
    for doc in docs:
        embedding = (
            openai.embeddings.create(input=doc, model="text-embedding-3-small")
            .data[0]
            .embedding
        )
        document_embeddings.append(embedding)
    return document_embeddings


# Note: you would usually do this once with your articles and store the
# embeddings and metadata in a database.
article_embeddings = docs_to_embeddings(articles)
require('dotenv').config();
import { OpenAI } from 'openai';
import * as weave from 'weave';
/** A knowledge-base article together with its optional embedding vector. */
interface Article {
/** Raw article text. */
text: string;
/** Embedding vector for the article; optional, so it may be absent. */
embedding?: number[];
}
const articles: Article[] = [
{
text: `Novo Nordisk and Eli Lilly rival soars 32 percent after promising weight loss drug results Shares of Denmarks Zealand Pharma shot 32 percent higher in morning trade, after results showed success in its liver disease treatment survodutide, which is also on trial as a drug to treat obesity. The trial tells us that the 6mg dose is safe, which is the top dose used in the ongoing [Phase 3] obesity trial too, one analyst said in a note. The results come amid feverish investor interest in drugs that can be used for weight loss.`
},
{
text: `Berkshire shares jump after big profit gain as Buffetts conglomerate nears $1 trillion valuation Berkshire Hathaway shares rose on Monday after Warren Buffetts conglomerate posted strong earnings for the fourth quarter over the weekend. Berkshires Class A and B shares jumped more than 1.5%, each. Class A shares are higher by more than 17% this year, while Class B has gained more than 18%. Berkshire was last valued at $930.1 billion, up from $905.5 billion where it closed on Friday, according to FactSet. Berkshire on Saturday posted fourth-quarter operating earnings of $8.481 billion, about 28 percent higher than the $6.625 billion from the year-ago period, driven by big gains in its insurance business. Operating earnings refers to profits from businesses across insurance, railroads and utilities. Meanwhile, Berkshires cash levels also swelled to record levels. The conglomerate held $167.6 billion in cash in the fourth quarter, surpassing the $157.2 billion record the conglomerate held in the prior quarter.`
},
{
text: `Highmark Health says its combining tech from Google and Epic to give doctors easier access to information Highmark Health announced it is integrating technology from Google Cloud and the health-care software company Epic Systems. The integration aims to make it easier for both payers and providers to access key information they need, even if its stored across multiple points and formats, the company said. Highmark is the parent company of a health plan with 7 million members, a provider network of 14 hospitals and other entities`
}
];
/**
 * Computes the cosine similarity between two numeric vectors.
 *
 * Components are paired by index over the length of `a`.
 *
 * @param a First vector.
 * @param b Second vector.
 * @returns The cosine of the angle between `a` and `b`, or 0 when either
 *   vector has zero magnitude (including empty vectors), so callers never
 *   receive NaN from a 0/0 division.
 */
function cosineSimilarity(a: number[], b: number[]): number {
  const dotProduct = a.reduce((sum, val, i) => sum + val * b[i], 0);
  const magnitudeA = Math.sqrt(a.reduce((sum, val) => sum + val * val, 0));
  const magnitudeB = Math.sqrt(b.reduce((sum, val) => sum + val * val, 0));
  const denominator = magnitudeA * magnitudeB;
  // A zero vector has no direction; report "no similarity" instead of NaN.
  if (denominator === 0) {
    return 0;
  }
  return dotProduct / denominator;
}
/**
 * Weave op that returns a copy of every article with its embedding attached.
 *
 * Each article text is embedded with the "text-embedding-3-small" model; the
 * requests are started together and awaited as a group.
 */
const docsToEmbeddings = weave.op(async function(docs: Article[]): Promise<Article[]> {
  const client = new OpenAI();
  const pending = docs.map(async (article) => {
    const { data } = await client.embeddings.create({
      input: article.text,
      model: "text-embedding-3-small"
    });
    return { ...article, embedding: data[0].embedding };
  });
  return Promise.all(pending);
});
RAG アプリを作成する
get_most_relevant_document を weave.op() デコレータでラップし、Model クラスを作成します。取得関数を weave.op() でラップすると、呼び出しごとにその入力と出力を Weave で取得できるようになり、後で取得ステップを確認できるようになります。weave.init('<team-name>/rag-quickstart') を呼び出して、後で確認できるように関数のすべての入力と出力のトラッキングを開始します。チーム名を指定しない場合、出力は W&B のデフォルト team または entity に記録されます。
- Python
- TypeScript
from openai import OpenAI
import weave
from weave import Model
import numpy as np
import asyncio
@weave.op()
def get_most_relevant_document(query):
    """Return the article most similar to ``query``.

    The query is embedded with ``text-embedding-3-small`` and compared, by
    cosine similarity, against every vector in the module-level
    ``article_embeddings``.

    Args:
        query: Question text to search with.

    Returns:
        The entry of ``articles`` whose embedding has the highest cosine
        similarity to the query embedding.
    """
    openai = OpenAI()
    query_embedding = (
        openai.embeddings.create(input=query, model="text-embedding-3-small")
        .data[0]
        .embedding
    )
    # The query norm does not depend on the document, so compute it once.
    query_norm = np.linalg.norm(query_embedding)
    similarities = [
        np.dot(query_embedding, doc_emb) / (query_norm * np.linalg.norm(doc_emb))
        for doc_emb in article_embeddings
    ]
    # Get the index of the most similar document
    most_relevant_doc_index = np.argmax(similarities)
    return articles[most_relevant_doc_index]
class RAGModel(Model):
    """RAG application: retrieves one article and answers a question from it."""

    # System prompt sent with every chat completion request.
    system_message: str
    # Chat model used to generate the answer.
    model_name: str = "gpt-3.5-turbo-1106"

    @weave.op()
    def predict(self, question: str) -> dict:  # note: `question` is used later to select data from the evaluation rows
        """Answer ``question`` using the most relevant article as context.

        Args:
            question: The user question.

        Returns:
            A dict with ``answer`` (the model's reply) and ``context`` (the
            retrieved article text).
        """
        context = get_most_relevant_document(question)
        client = OpenAI()
        query = f"""Use the following information to answer the subsequent question. If the answer cannot be found, write "I don't know."
Context:
\"\"\"
{context}
\"\"\"
Question: {question}"""
        response = client.chat.completions.create(
            model=self.model_name,
            messages=[
                {"role": "system", "content": self.system_message},
                {"role": "user", "content": query},
            ],
            temperature=0.0,
            response_format={"type": "text"},
        )
        answer = response.choices[0].message.content
        return {'answer': answer, 'context': context}
# Set your team and project name
weave.init('<team-name>/rag-quickstart')
# Build the RAG model with a finance-focused system prompt
model = RAGModel(
system_message="You are an expert in finance and answer questions related to finance, financial services, and financial markets. When responding based on provided information, be sure to cite the source."
)
# Run a single question through the model
model.predict("What significant result was reported about Zealand Pharma's obesity trial?")
/**
 * Retrieval-augmented question-answering model.
 *
 * Retrieves the article most similar to the question and asks an OpenAI chat
 * model to answer using that article as context.
 */
class RAGModel {
  private openai: OpenAI;
  private systemMessage: string;
  private modelName: string;
  private articleEmbeddings: Article[];

  /**
   * @param config.systemMessage System prompt sent with every chat request.
   * @param config.modelName Chat model name; "gpt-3.5-turbo-1106" is used
   *   when it is omitted or empty.
   * @param config.articleEmbeddings Articles (with embeddings) to retrieve from.
   */
  constructor(config: {
    systemMessage: string;
    modelName?: string;
    articleEmbeddings: Article[];
  }) {
    this.openai = new OpenAI();
    this.systemMessage = config.systemMessage;
    this.modelName = config.modelName || "gpt-3.5-turbo-1106";
    this.articleEmbeddings = config.articleEmbeddings;
    // Wrap predict as a Weave op bound to this instance.
    this.predict = weave.op(this, this.predict);
  }

  /**
   * Returns the text of the article whose embedding is most similar to `query`.
   *
   * Articles without an embedding are skipped. On ties the first article wins.
   *
   * @throws Error when no article yields a comparable similarity score.
   */
  private async getMostRelevantDocument(query: string): Promise<string> {
    const queryEmbedding = await this.openai.embeddings.create({
      input: query,
      model: "text-embedding-3-small"
    });
    const queryVector = queryEmbedding.data[0].embedding;

    let best: Article | undefined;
    let bestScore = -Infinity;
    for (const doc of this.articleEmbeddings) {
      if (!doc.embedding) continue;
      const score = cosineSimilarity(queryVector, doc.embedding);
      // NaN never compares greater, so unusable scores are ignored.
      if (score > bestScore) {
        bestScore = score;
        best = doc;
      }
    }
    if (!best) {
      throw new Error("No article with a usable embedding is available for retrieval.");
    }
    return best.text;
  }

  /**
   * Answers `question` using the most relevant article as context.
   *
   * @returns The model's answer ("" when the reply has no content) and the
   *   article text that was used as context.
   */
  async predict(question: string): Promise<{
    answer: string;
    context: string;
  }> {
    const context = await this.getMostRelevantDocument(question);
    const response = await this.openai.chat.completions.create({
      model: this.modelName,
      messages: [
        { role: "system", content: this.systemMessage },
        {
          role: "user",
          content: `Use the following information to answer the subsequent question. If the answer cannot be found, write "I don't know."
Context:
"""
${context}
"""
Question: ${question}`
        }
      ],
      temperature: 0
    });
    return {
      answer: response.choices[0].message.content || "",
      context
    };
  }
}
LLM judge を用いた評価
スコアリング関数を定義する
question は行の辞書から取得されます。output はモデルの出力です。モデルへの入力も、その入力引数に基づいてサンプルから取得されるため、ここでの question も同様です。この例では async 関数を使用しているので、並列に実行できます。async の簡単な入門については、Python asyncio ドキュメント を参照してください。
- Python
- TypeScript
from openai import OpenAI
import weave
import asyncio
@weave.op()
async def context_precision_score(question, output):
    """Ask an LLM judge whether the retrieved context was useful for the answer.

    Args:
        question: The question taken from the dataset row.
        output: The model output; must contain ``context`` and ``answer``.

    Returns:
        A dict ``{"verdict": bool}`` that is True when the judge's verdict is 1.
    """
    # This listing's own imports do not include json, so import it here.
    import json

    context_precision_prompt = """Given question, answer and context verify if the context was useful in arriving at the given answer. Give verdict as "1" if useful and "0" if not with json output.
Output in only valid JSON format.
question: {question}
context: {context}
answer: {answer}
verdict: """
    client = OpenAI()
    prompt = context_precision_prompt.format(
        question=question,
        context=output['context'],
        answer=output['answer'],
    )
    response = client.chat.completions.create(
        model="gpt-4-turbo-preview",
        messages=[{"role": "user", "content": prompt}],
        response_format={ "type": "json_object" }
    )
    response_message = response.choices[0].message
    # Parse the judge's JSON reply into a dict.
    verdict_payload = json.loads(response_message.content)
    return {
        "verdict": int(verdict_payload["verdict"]) == 1,
    }
questions = [
{"question": "What significant result was reported about Zealand Pharma's obesity trial?"},
{"question": "How much did Berkshire Hathaway's cash levels increase in the fourth quarter?"},
{"question": "What is the goal of Highmark Health's integration of Google Cloud and Epic Systems technology?"},
{"question": "What were Rivian and Lucid's vehicle production forecasts for 2024?"},
{"question": "Why was the Norwegian Dawn cruise ship denied access to Mauritius?"},
{"question": "Which company achieved the first U.S. moon landing since 1972?"},
{"question": "What issue did Intuitive Machines' lunar lander encounter upon landing on the moon?"}
]
evaluation = weave.Evaluation(dataset=questions, scorers=[context_precision_score])
asyncio.run(evaluation.evaluate(model)) # 注: 評価するモデルを定義する必要があります
/**
 * Weave op that asks an LLM judge whether the retrieved context was useful
 * for arriving at the model's answer.
 *
 * The prompt includes the dataset question and the model's context and
 * answer, so the judge can actually evaluate them.
 *
 * @returns `{ verdict: true }` when the judge's JSON reply has verdict 1.
 */
const contextPrecisionScore = weave.op(async function(args: {
  datasetRow: QuestionRow;
  modelOutput: { answer: string; context: string; }
}): Promise<ScorerResult> {
  const openai = new OpenAI();
  const prompt = `Given question, answer and context verify if the context was useful in arriving at the given answer. Give verdict as "1" if useful and "0" if not with json output.
Output in only valid JSON format.
question: ${args.datasetRow.question}
context: ${args.modelOutput.context}
answer: ${args.modelOutput.answer}
verdict: `;
  const response = await openai.chat.completions.create({
    model: "gpt-4-turbo-preview",
    messages: [{ role: "user", content: prompt }],
    response_format: { type: "json_object" }
  });
  // An empty reply is parsed as "{}", which yields verdict false.
  const result = JSON.parse(response.choices[0].message.content || "{}");
  return {
    verdict: parseInt(result.verdict) === 1
  };
});
// Build an evaluation from the question dataset and the context-precision scorer.
const evaluation = new weave.Evaluation({
dataset: createQuestionDataset(),
scorers: [contextPrecisionScore]
});
// Run the evaluation; the model op forwards each row's question to model.predict.
// NOTE(review): `model` and createQuestionDataset are not defined in this listing — they must already be in scope.
await evaluation.evaluate({
model: weave.op((args: { datasetRow: QuestionRow }) =>
model.predict(args.datasetRow.question)
)
});
任意: Scorer クラスを定義する
Scorer クラスが役立ちます。以下の手順では、どのような場合に定義すべきかと、その方法を示します。
アプリケーションによっては、カスタム評価クラスを作成したい場合があります。たとえば、チャットモデルやプロンプトなどのパラメーターを持つ標準化済みの LLMJudge クラス、各行に対するカスタムスコアリング、集計スコアのカスタム計算を作成したいことがあります。Weave にはそのまま使用できる Scorer クラスがいくつか用意されており、カスタム Scorer も簡単に作成できます。以下の例では、カスタム class CorrectnessLLMJudge(Scorer) を作成する方法を示します。
大まかには、カスタム Scorer を作成する手順は次のとおりです。
1. weave.flow.scorer.Scorer を継承するカスタムクラスを定義します。
2. score 関数をオーバーライドし、関数の各 call をトラッキングしたい場合は @weave.op() を追加します。
   - この関数では、モデルの予測結果が渡される output 引数を定義する必要があります。モデルが “None” を返す場合に備えて、型 Optional[dict] として定義してください。
   - 残りの引数は、汎用的な Any または dict にすることも、weave.Evaluation クラスを使用してモデルを評価する際に使うデータセットから特定の列を選ぶこともできます。preprocess_model_input を使用する場合、これらの引数名は、その処理後の単一行の列名またはキーと完全に一致している必要があります。
3. 任意: summarize 関数をオーバーライドして、集計スコアの計算をカスタマイズします。デフォルトでは、カスタム関数を定義しない場合、Weave は weave.flow.scorer.auto_summarize 関数を使用します。
   - この関数には @weave.op() デコレーターが必要です。
- Python
- TypeScript
from typing import Any, Optional

import numpy as np
import weave
from weave import Scorer
class CorrectnessLLMJudge(Scorer):
    """LLM-as-judge scorer that grades a model answer against a reference answer."""

    # Prompt handed to the judge model.
    prompt: str
    # Name of the judge model.
    model_name: str
    # Device passed through to get_model.
    device: str

    @weave.op()
    async def score(self, output: Optional[dict], query: str, answer: str) -> Any:
        """Score the correctness of a prediction against the query and reference answer.

        Args:
            output: The dict produced by the model under evaluation, or None.
            query: The question as defined in the dataset.
            answer: The reference answer as defined in the dataset.

        Returns:
            A single dict ``{"correct": bool}``, or ``{"correct": None}`` when
            the model produced no output.
        """
        # The model may return None; emit a null score, which summarize() skips.
        if output is None:
            return {"correct": None}

        # get_model is defined as a generic model getter based on the provided params (OpenAI, HF, ...)
        eval_model = get_model(
            model_name=self.model_name,
            prompt=self.prompt,
            device=self.device,
        )
        # Async evaluation to speed things up; this does not have to be async
        grade = await eval_model.async_predict(
            {
                "query": query,
                "answer": answer,
                "result": output.get("result"),
            }
        )
        # Output parsing; this could be done more robustly with pydantic
        evaluation = "incorrect" not in grade["text"].strip().lower()

        # Column name displayed in Weave
        return {"correct": evaluation}

    @weave.op()
    def summarize(self, score_rows: list) -> Optional[dict]:
        """Aggregate the per-row scores produced by the scoring function.

        Args:
            score_rows: List of dicts; each dict holds metrics and scores.

        Returns:
            A nested dict under ``"correct"`` with ``true_count``,
            ``true_fraction`` and ``stderr``, computed over rows whose
            ``"correct"`` value is not None.
        """
        # If nothing is provided, the weave.flow.scorer.auto_summarize function is used
        # return auto_summarize(score_rows)

        valid_data = [x.get("correct") for x in score_rows if x.get("correct") is not None]
        count_true = valid_data.count(True)
        int_data = [int(x) for x in valid_data]

        # Fall back to 0 for every statistic when there are no valid rows.
        sample_mean = np.mean(int_data) if int_data else 0
        sample_variance = np.var(int_data) if int_data else 0
        sample_error = np.sqrt(sample_variance / len(int_data)) if int_data else 0

        # The extra "correct" layer is not required, but adds structure in the UI
        return {
            "correct": {
                "true_count": count_true,
                "true_fraction": sample_mean,
                "stderr": sample_error,
            }
        }
この機能はまだ TypeScript では利用できません。
Evaluation の scorers 引数に渡します。
- Python
- TypeScript
evaluation = weave.Evaluation(dataset=questions, scorers=[CorrectnessLLMJudge()])
この機能はまだ TypeScript では利用できません。
まとめ
- LLM 呼び出しと取得ステップの関数を weave.op() でラップします。
- オプション: predict 関数とアプリケーションの詳細を含む Model のサブクラスを作成します。
- 評価用の例を収集します。
- 1 つの例をスコアリングする評価関数を作成します。
- Evaluation クラスを使用して、例に対する評価を実行します。
Evaluation の非同期実行により、OpenAI や Anthropic などのモデルで rate limit に達することがあります。これを防ぐには、たとえば WEAVE_PARALLELISM=3 のように、並列ワーカー数を制限する環境変数を設定できます。
コード全体は次のとおりです。
- Python
- TypeScript
from openai import OpenAI
import weave
from weave import Model
import numpy as np
import json
import asyncio
# 評価に使用するサンプル
articles = [
"Novo Nordisk and Eli Lilly rival soars 32 percent after promising weight loss drug results Shares of Denmarks Zealand Pharma shot 32 percent higher in morning trade, after results showed success in its liver disease treatment survodutide, which is also on trial as a drug to treat obesity. The trial “tells us that the 6mg dose is safe, which is the top dose used in the ongoing [Phase 3] obesity trial too,” one analyst said in a note. The results come amid feverish investor interest in drugs that can be used for weight loss.",
"Berkshire shares jump after big profit gain as Buffetts conglomerate nears $1 trillion valuation Berkshire Hathaway shares rose on Monday after Warren Buffetts conglomerate posted strong earnings for the fourth quarter over the weekend. Berkshires Class A and B shares jumped more than 1.5%, each. Class A shares are higher by more than 17% this year, while Class B has gained more than 18%. Berkshire was last valued at $930.1 billion, up from $905.5 billion where it closed on Friday, according to FactSet. Berkshire on Saturday posted fourth-quarter operating earnings of $8.481 billion, about 28 percent higher than the $6.625 billion from the year-ago period, driven by big gains in its insurance business. Operating earnings refers to profits from businesses across insurance, railroads and utilities. Meanwhile, Berkshires cash levels also swelled to record levels. The conglomerate held $167.6 billion in cash in the fourth quarter, surpassing the $157.2 billion record the conglomerate held in the prior quarter.",
"Highmark Health says its combining tech from Google and Epic to give doctors easier access to information Highmark Health announced it is integrating technology from Google Cloud and the health-care software company Epic Systems. The integration aims to make it easier for both payers and providers to access key information they need, even if it's stored across multiple points and formats, the company said. Highmark is the parent company of a health plan with 7 million members, a provider network of 14 hospitals and other entities",
"Rivian and Lucid shares plunge after weak EV earnings reports Shares of electric vehicle makers Rivian and Lucid fell Thursday after the companies reported stagnant production in their fourth-quarter earnings after the bell Wednesday. Rivian shares sank about 25 percent, and Lucids stock dropped around 17 percent. Rivian forecast it will make 57,000 vehicles in 2024, slightly less than the 57,232 vehicles it produced in 2023. Lucid said it expects to make 9,000 vehicles in 2024, more than the 8,428 vehicles it made in 2023.",
"Mauritius blocks Norwegian cruise ship over fears of a potential cholera outbreak Local authorities on Sunday denied permission for the Norwegian Dawn ship, which has 2,184 passengers and 1,026 crew on board, to access the Mauritius capital of Port Louis, citing “potential health risks.” The Mauritius Ports Authority said Sunday that samples were taken from at least 15 passengers on board the cruise ship. A spokesperson for the U.S.-headquartered Norwegian Cruise Line Holdings said Sunday that 'a small number of guests experienced mild symptoms of a stomach-related illness' during Norwegian Dawns South Africa voyage.",
"Intuitive Machines lands on the moon in historic first for a U.S. company Intuitive Machines Nova-C cargo lander, named Odysseus after the mythological Greek hero, is the first U.S. spacecraft to soft land on the lunar surface since 1972. Intuitive Machines is the first company to pull off a moon landing — government agencies have carried out all previously successful missions. The company's stock surged in extended trading Thursday, after falling 11 percent in regular trading.",
"Lunar landing photos: Intuitive Machines Odysseus sends back first images from the moon Intuitive Machines cargo moon lander Odysseus returned its first images from the surface. Company executives believe the lander caught its landing gear sideways on the surface of the moon while touching down and tipped over. Despite resting on its side, the company's historic IM-1 mission is still operating on the moon.",
]
def docs_to_embeddings(docs: list) -> list:
    """Embed every document with OpenAI's ``text-embedding-3-small`` model.

    One embeddings request is issued per document.

    Args:
        docs: Document texts to embed.

    Returns:
        A list of embedding vectors, in the same order as ``docs``.
    """
    openai = OpenAI()
    document_embeddings = []
    for doc in docs:
        embedding = (
            openai.embeddings.create(input=doc, model="text-embedding-3-small")
            .data[0]
            .embedding
        )
        document_embeddings.append(embedding)
    return document_embeddings


# Note: you would usually do this once with your articles and store the
# embeddings and metadata in a database.
article_embeddings = docs_to_embeddings(articles)
# Add a decorator to the retrieval step
@weave.op()
def get_most_relevant_document(query):
    """Return the article most similar to ``query``.

    The query is embedded with ``text-embedding-3-small`` and compared, by
    cosine similarity, against every vector in the module-level
    ``article_embeddings``.

    Args:
        query: Question text to search with.

    Returns:
        The entry of ``articles`` whose embedding has the highest cosine
        similarity to the query embedding.
    """
    openai = OpenAI()
    query_embedding = (
        openai.embeddings.create(input=query, model="text-embedding-3-small")
        .data[0]
        .embedding
    )
    # The query norm does not depend on the document, so compute it once.
    query_norm = np.linalg.norm(query_embedding)
    similarities = [
        np.dot(query_embedding, doc_emb) / (query_norm * np.linalg.norm(doc_emb))
        for doc_emb in article_embeddings
    ]
    # Get the index of the most similar document
    most_relevant_doc_index = np.argmax(similarities)
    return articles[most_relevant_doc_index]
# Create a Model subclass with the app details and a predict function that produces the response
class RAGModel(Model):
    """RAG application: retrieves one article and answers a question from it."""

    # System prompt sent with every chat completion request.
    system_message: str
    # Chat model used to generate the answer.
    model_name: str = "gpt-3.5-turbo-1106"

    @weave.op()
    def predict(self, question: str) -> dict:  # note: `question` is used later to select data from the evaluation rows
        """Answer ``question`` using the most relevant article as context.

        Args:
            question: The user question.

        Returns:
            A dict with ``answer`` (the model's reply) and ``context`` (the
            retrieved article text).
        """
        context = get_most_relevant_document(question)
        client = OpenAI()
        query = f"""Use the following information to answer the subsequent question. If the answer cannot be found, write "I don't know."
Context:
\"\"\"
{context}
\"\"\"
Question: {question}"""
        response = client.chat.completions.create(
            model=self.model_name,
            messages=[
                {"role": "system", "content": self.system_message},
                {"role": "user", "content": query},
            ],
            temperature=0.0,
            response_format={"type": "text"},
        )
        answer = response.choices[0].message.content
        return {'answer': answer, 'context': context}
# Set your team and project name
weave.init('<team-name>/rag-quickstart')
# Build the RAG model with a finance-focused system prompt
model = RAGModel(
system_message="You are an expert in finance and answer questions related to finance, financial services, and financial markets. When responding based on provided information, be sure to cite the source."
)
# The scoring function uses question and output to produce a score
@weave.op()
async def context_precision_score(question, output):
    """Ask an LLM judge whether the retrieved context was useful for the answer.

    Args:
        question: The question taken from the dataset row.
        output: The model output; must contain ``context`` and ``answer``.

    Returns:
        A dict ``{"verdict": bool}`` that is True when the judge's verdict is 1.
    """
    context_precision_prompt = """Given question, answer and context verify if the context was useful in arriving at the given answer. Give verdict as "1" if useful and "0" if not with json output.
Output in only valid JSON format.
question: {question}
context: {context}
answer: {answer}
verdict: """
    client = OpenAI()
    prompt = context_precision_prompt.format(
        question=question,
        context=output['context'],
        answer=output['answer'],
    )
    response = client.chat.completions.create(
        model="gpt-4-turbo-preview",
        messages=[{"role": "user", "content": prompt}],
        response_format={ "type": "json_object" }
    )
    response_message = response.choices[0].message
    # Parse the judge's JSON reply into a dict.
    verdict_payload = json.loads(response_message.content)
    return {
        "verdict": int(verdict_payload["verdict"]) == 1,
    }
questions = [
{"question": "What significant result was reported about Zealand Pharma's obesity trial?"},
{"question": "How much did Berkshire Hathaway's cash levels increase in the fourth quarter?"},
{"question": "What is the goal of Highmark Health's integration of Google Cloud and Epic Systems technology?"},
{"question": "What were Rivian and Lucid's vehicle production forecasts for 2024?"},
{"question": "Why was the Norwegian Dawn cruise ship denied access to Mauritius?"},
{"question": "Which company achieved the first U.S. moon landing since 1972?"},
{"question": "What issue did Intuitive Machines' lunar lander encounter upon landing on the moon?"}
]
# Evaluation オブジェクトを定義し、サンプルの質問とスコアリング関数を渡す
evaluation = weave.Evaluation(dataset=questions, scorers=[context_precision_score])
asyncio.run(evaluation.evaluate(model))
require('dotenv').config();
import { OpenAI } from 'openai';
import * as weave from 'weave';
/** A knowledge-base article together with its optional embedding vector. */
interface Article {
/** Raw article text. */
text: string;
/** Embedding vector for the article; optional, so it may be absent. */
embedding?: number[];
}
const articles: Article[] = [
{
text: `Novo Nordisk and Eli Lilly rival soars 32 percent after promising weight loss drug results Shares of Denmarks Zealand Pharma shot 32 percent higher in morning trade, after results showed success in its liver disease treatment survodutide, which is also on trial as a drug to treat obesity. The trial tells us that the 6mg dose is safe, which is the top dose used in the ongoing [Phase 3] obesity trial too, one analyst said in a note. The results come amid feverish investor interest in drugs that can be used for weight loss.`
},
{
text: `Berkshire shares jump after big profit gain as Buffetts conglomerate nears $1 trillion valuation Berkshire Hathaway shares rose on Monday after Warren Buffetts conglomerate posted strong earnings for the fourth quarter over the weekend. Berkshires Class A and B shares jumped more than 1.5%, each. Class A shares are higher by more than 17% this year, while Class B has gained more than 18%. Berkshire was last valued at $930.1 billion, up from $905.5 billion where it closed on Friday, according to FactSet. Berkshire on Saturday posted fourth-quarter operating earnings of $8.481 billion, about 28 percent higher than the $6.625 billion from the year-ago period, driven by big gains in its insurance business. Operating earnings refers to profits from businesses across insurance, railroads and utilities. Meanwhile, Berkshires cash levels also swelled to record levels. The conglomerate held $167.6 billion in cash in the fourth quarter, surpassing the $157.2 billion record the conglomerate held in the prior quarter.`
},
{
text: `Highmark Health says its combining tech from Google and Epic to give doctors easier access to information Highmark Health announced it is integrating technology from Google Cloud and the health-care software company Epic Systems. The integration aims to make it easier for both payers and providers to access key information they need, even if its stored across multiple points and formats, the company said. Highmark is the parent company of a health plan with 7 million members, a provider network of 14 hospitals and other entities`
}
];
/**
 * Computes the cosine similarity between two numeric vectors.
 *
 * Components are paired by index over the length of `a`.
 *
 * @param a First vector.
 * @param b Second vector.
 * @returns The cosine of the angle between `a` and `b`, or 0 when either
 *   vector has zero magnitude (including empty vectors), so callers never
 *   receive NaN from a 0/0 division.
 */
function cosineSimilarity(a: number[], b: number[]): number {
  const dotProduct = a.reduce((sum, val, i) => sum + val * b[i], 0);
  const magnitudeA = Math.sqrt(a.reduce((sum, val) => sum + val * val, 0));
  const magnitudeB = Math.sqrt(b.reduce((sum, val) => sum + val * val, 0));
  const denominator = magnitudeA * magnitudeB;
  // A zero vector has no direction; report "no similarity" instead of NaN.
  if (denominator === 0) {
    return 0;
  }
  return dotProduct / denominator;
}
/**
 * Weave op that returns a copy of every article with its embedding attached.
 *
 * Each article text is embedded with the "text-embedding-3-small" model; the
 * requests are started together and awaited as a group.
 */
const docsToEmbeddings = weave.op(async function(docs: Article[]): Promise<Article[]> {
  const client = new OpenAI();
  const pending = docs.map(async (article) => {
    const { data } = await client.embeddings.create({
      input: article.text,
      model: "text-embedding-3-small"
    });
    return { ...article, embedding: data[0].embedding };
  });
  return Promise.all(pending);
});
/**
 * Retrieval-augmented question-answering model.
 *
 * Retrieves the article most similar to the question and asks an OpenAI chat
 * model to answer using that article as context.
 */
class RAGModel {
  private openai: OpenAI;
  private systemMessage: string;
  private modelName: string;
  private articleEmbeddings: Article[];

  /**
   * @param config.systemMessage System prompt sent with every chat request.
   * @param config.modelName Chat model name; "gpt-3.5-turbo-1106" is used
   *   when it is omitted or empty.
   * @param config.articleEmbeddings Articles (with embeddings) to retrieve from.
   */
  constructor(config: {
    systemMessage: string;
    modelName?: string;
    articleEmbeddings: Article[];
  }) {
    this.openai = new OpenAI();
    this.systemMessage = config.systemMessage;
    this.modelName = config.modelName || "gpt-3.5-turbo-1106";
    this.articleEmbeddings = config.articleEmbeddings;
    // Wrap predict as a Weave op bound to this instance.
    this.predict = weave.op(this, this.predict);
  }

  /**
   * Returns the text of the article whose embedding is most similar to `query`.
   *
   * Articles without an embedding are skipped. On ties the first article wins.
   *
   * @throws Error when no article yields a comparable similarity score.
   */
  private async getMostRelevantDocument(query: string): Promise<string> {
    const queryEmbedding = await this.openai.embeddings.create({
      input: query,
      model: "text-embedding-3-small"
    });
    const queryVector = queryEmbedding.data[0].embedding;

    let best: Article | undefined;
    let bestScore = -Infinity;
    for (const doc of this.articleEmbeddings) {
      if (!doc.embedding) continue;
      const score = cosineSimilarity(queryVector, doc.embedding);
      // NaN never compares greater, so unusable scores are ignored.
      if (score > bestScore) {
        bestScore = score;
        best = doc;
      }
    }
    if (!best) {
      throw new Error("No article with a usable embedding is available for retrieval.");
    }
    return best.text;
  }

  /**
   * Answers `question` using the most relevant article as context.
   *
   * @returns The model's answer ("" when the reply has no content) and the
   *   article text that was used as context.
   */
  async predict(question: string): Promise<{
    answer: string;
    context: string;
  }> {
    const context = await this.getMostRelevantDocument(question);
    const response = await this.openai.chat.completions.create({
      model: this.modelName,
      messages: [
        { role: "system", content: this.systemMessage },
        {
          role: "user",
          content: `Use the following information to answer the subsequent question. If the answer cannot be found, write "I don't know."
Context:
"""
${context}
"""
Question: ${question}`
        }
      ],
      temperature: 0
    });
    return {
      answer: response.choices[0].message.content || "",
      context
    };
  }
}
/** Result returned by the context-precision scorer. */
interface ScorerResult {
/** True when the judge's verdict parsed to 1. */
verdict: boolean;
}
/** One row of the evaluation dataset. */
interface QuestionRow {
/** Question text passed to the model and the scorer. */
question: string;
}
/**
 * Builds the Weave dataset of evaluation questions, registered under the id
 * 'rag-questions'.
 */
function createQuestionDataset(): weave.Dataset<QuestionRow> {
  const rows: QuestionRow[] = [
    { question: "What significant result was reported about Zealand Pharma's obesity trial?" },
    { question: "How much did Berkshire Hathaway's cash levels increase in the fourth quarter?" },
    { question: "What is the goal of Highmark Health's integration of Google Cloud and Epic Systems technology?" }
  ];
  return new weave.Dataset<QuestionRow>({ id: 'rag-questions', rows });
}
/**
 * Weave op that asks an LLM judge whether the retrieved context was useful
 * for arriving at the model's answer.
 *
 * @returns `{ verdict: true }` when the judge's JSON reply has verdict 1.
 */
const contextPrecisionScore = weave.op(async function(args: {
  datasetRow: QuestionRow;
  modelOutput: { answer: string; context: string; }
}): Promise<ScorerResult> {
  const judge = new OpenAI();
  const { datasetRow, modelOutput } = args;
  const prompt = `Given question, answer and context verify if the context was useful in arriving at the given answer. Give verdict as "1" if useful and "0" if not with json output.
Output in only valid JSON format.
question: ${datasetRow.question}
context: ${modelOutput.context}
answer: ${modelOutput.answer}
verdict: `;
  const completion = await judge.chat.completions.create({
    model: "gpt-4-turbo-preview",
    messages: [{ role: "user", content: prompt }],
    response_format: { type: "json_object" }
  });
  // An empty reply is parsed as "{}", which yields verdict false.
  const rawReply = completion.choices[0].message.content || "{}";
  const parsed = JSON.parse(rawReply);
  const isUseful = parseInt(parsed.verdict) === 1;
  return { verdict: isUseful };
});
/**
 * Runs the full example: initializes Weave, embeds the articles, builds the
 * RAG model, evaluates it with the context-precision scorer, and logs the
 * evaluation results.
 */
async function main() {
  // Set your team and project name
  await weave.init('<team-name>/rag-quickstart');

  const articleEmbeddings = await docsToEmbeddings(articles);
  const model = new RAGModel({
    systemMessage: "You are an expert in finance and answer questions related to finance, financial services, and financial markets. When responding based on provided information, be sure to cite the source.",
    articleEmbeddings
  });

  const evaluation = new weave.Evaluation({
    dataset: createQuestionDataset(),
    scorers: [contextPrecisionScore]
  });

  // The model op forwards each dataset row's question to model.predict.
  const results = await evaluation.evaluate({
    model: weave.op((args: { datasetRow: QuestionRow }) =>
      model.predict(args.datasetRow.question)
    )
  });
  console.log('Evaluation results:', results);
}
// Run main() only when this file is the entry module; rejections are passed to console.error.
if (require.main === module) {
main().catch(console.error);
}