Before diving in, let's take a moment to review the key resources and foundational concepts that will guide us through this blog. That will ensure we're well-equipped to follow along. This brief review will provide a strong starting point for exploring the main topics ahead.
In our hands-on project, we will develop an AI-based solution that helps users extract financial information and insights from the investment/finance books and newspapers in our database.
The process is divided into three main parts:
{"messages": [{"role": "system", "content": "content goes here"}, {"role": "user", "content": "query goes here?"}, {"role": "assistant", "content": "response goes here."}]}.
To satisfy this, we prepared our two data sets training_data.jsonl and val_data.jsonl for training and validation, respectively.
Below is an example of our training_data.jsonl:
{"messages": [{"role": "system", "content": "You are a helper in finance and an advisor in investment"}, {"role": "user", "content": "What makes a successful startup?"}, {"role": "assistant", "content": "Luck, hard work, and maybe skills... mostly luck, hard work, and consistency."}]}
Both data files are attached to this blog. They were assembled manually from a set of curated examples.
import json
import tiktoken
import numpy as np
from collections import defaultdict
# Tokenizer used by the gpt-3.5-turbo / gpt-4 model family ("cl100k_base");
# shared by the token-counting helpers in this file.
encoding = tiktoken.get_encoding("cl100k_base")
def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    """Estimate the total token count of one chat-format training example.

    Follows the OpenAI counting recipe: a fixed per-message overhead,
    plus the encoded length of every string field, plus an extra charge
    for a "name" field, plus 3 trailing tokens that prime the reply.
    Non-dict messages and non-string values are reported and skipped.
    """
    total = 0
    for msg in messages:
        if not isinstance(msg, dict):
            print(f"Unexpected message format: {msg}")
            continue
        total += tokens_per_message
        for key, value in msg.items():
            if not isinstance(value, str):
                print(f"Unexpected value type for key '{key}': {value}")
                continue
            total += len(encoding.encode(value))
            if key == "name":
                total += tokens_per_name
    # Every reply is primed with <|start|>assistant<|message|>.
    total += 3
    return total
def num_assistant_tokens_from_messages(messages):
    """Count the tokens contained in assistant-role message contents only.

    This measures how much of each training example counts toward the
    completion (the part the model is trained to produce). Malformed
    entries are reported and skipped rather than raising.
    """
    count = 0
    for msg in messages:
        if not isinstance(msg, dict):
            print(f"Unexpected message format: {msg}")
            continue
        if msg.get("role") != "assistant":
            continue
        content = msg.get("content", "")
        if not isinstance(content, str):
            print(f"Unexpected content type: {content}")
            continue
        count += len(encoding.encode(content))
    return count
def print_distribution(values, name):
    """Print summary statistics (min/max, mean/median, p5/p95) for *values*.

    Prints a placeholder message instead when *values* is empty.
    """
    if not values:
        print(f"No values to display for {name}")
        return
    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    print(f"p5 / p95: {np.quantile(values, 0.05)}, {np.quantile(values, 0.95)}")
# Validate both JSONL datasets and report token-count statistics for each.
# BUG FIX: the list previously opened 'train_data.jsonl', but the datasets
# prepared above are named training_data.jsonl / val_data.jsonl — the old
# name could never be found.
files = [
    r'training_data.jsonl',
    r'val_data.jsonl',
]
for file in files:
    print(f"Processing file: {file}")
    try:
        with open(file, 'r', encoding='utf-8') as f:
            total_tokens = []
            assistant_tokens = []
            for line in f:
                # Skip blank lines (e.g. a trailing newline) instead of
                # reporting them as JSON errors.
                if not line.strip():
                    continue
                try:
                    ex = json.loads(line)
                    messages = ex.get("messages", [])
                    if not isinstance(messages, list):
                        raise ValueError("The 'messages' field should be a list.")
                    total_tokens.append(num_tokens_from_messages(messages))
                    assistant_tokens.append(num_assistant_tokens_from_messages(messages))
                except json.JSONDecodeError:
                    print(f"Error decoding JSON line: {line}")
                except ValueError as ve:
                    print(f"ValueError: {ve} - line: {line}")
                except Exception as e:
                    print(f"Unexpected error processing line: {e} - line: {line}")
        if total_tokens and assistant_tokens:
            print_distribution(total_tokens, "total tokens")
            print_distribution(assistant_tokens, "assistant tokens")
        else:
            print("No valid data to process.")
        print('*' * 50)
    except FileNotFoundError:
        print(f"File not found: {file}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
# NOTE(review): AzureOpenAI, azure_oai_endpoint, azure_oai_key, and `version`
# are not defined at this point in the file — presumably `from openai import
# AzureOpenAI` plus the env-var loading shown further below are meant to run
# first; confirm the snippet ordering before publishing.
# Initialize AzureOpenAI client
client = AzureOpenAI(
azure_endpoint=azure_oai_endpoint,  # Azure OpenAI resource endpoint URL
api_key=azure_oai_key,  # API key for the resource
api_version=version # Ensure this API version is correct
)
# Paths to the prepared JSONL datasets.
# BUG FIX: the original assignments ended with a typographic quote (r'path’,
# U+2019), so the string literal was never closed — a SyntaxError. Replaced
# with the dataset filenames named earlier in this post.
training_file_name = r'training_data.jsonl'
validation_file_name = r'val_data.jsonl'
try:
    # Upload the training dataset file for fine-tuning.
    with open(training_file_name, "rb") as file:
        training_response = client.files.create(
            file=file, purpose="fine-tune"
        )
    training_file_id = training_response.id
    print("Training file ID:", training_file_id)
except Exception as e:
    print(f"Error uploading training file: {e}")
try:
    # Upload the validation dataset file for fine-tuning.
    with open(validation_file_name, "rb") as file:
        validation_response = client.files.create(
            file=file, purpose="fine-tune"
        )
    validation_file_id = validation_response.id
    print("Validation file ID:", validation_file_id)
except Exception as e:
    print(f"Error uploading validation file: {e}")
The concept here is to combine the model's built-in knowledge with the users' documentation. We have two options, and both provide high precision for responses:
For integration, there are two paths we can follow: deploy through the Azure OpenAI user interface into an Azure Static Web App, or develop your own web app and use the Azure SDK to integrate the model.
1- Deploying into Azure static web app
2- Develop your own web App and use Azure SDK
# Load configuration from the local .env file.
# NOTE(review): this snippet requires `import os` and
# `from dotenv import load_dotenv`, neither of which appears in the
# imports shown earlier — confirm they are added.
load_dotenv()
azure_oai_endpoint = os.getenv("AZURE_OAI_FINETUNE_ENDPOINT2")
azure_oai_key = os.getenv("AZURE_OAI_FINETUNE_KEY2")
azure_oai_deployment = os.getenv("AZURE_OAI_FINETUNE_DEPLOYMENT2")
azure_search_endpoint = os.getenv("AZURE_SEARCH_ENDPOINT")
azure_search_key = os.getenv("AZURE_SEARCH_KEY")
azure_search_index = os.getenv("AZURE_SEARCH_INDEX")
# Client targeting the deployment's "extensions" route, which accepts the
# dataSources payload used for Azure Cognitive Search grounding (RAG).
# BUG FIX: the original read api_version="2023-09-01-preview) — the string
# literal was never closed and the AzureOpenAI(...) call was missing its
# closing parenthesis, a SyntaxError.
client = AzureOpenAI(
    base_url=f"{azure_oai_endpoint}/openai/deployments/{azure_oai_deployment}/extensions",
    api_key=azure_oai_key,
    api_version="2023-09-01-preview",
)
# Data-source configuration passed to the chat-completions "extensions"
# endpoint: grounds responses in an Azure Cognitive Search index.
extension_config = {
    "dataSources": [
        {
            "type": "AzureCognitiveSearch",
            "parameters": {
                "endpoint": azure_search_endpoint,
                "key": azure_search_key,
                "indexName": azure_search_index,
            },
        },
    ],
}
RAG is used to enhance a model's capabilities by adding more grounded information, not to eliminate the model’s internal knowledge.
Some issues that you may face during development:
Next, you can look at how to do real-time injection so that you personalize more of the responses. Explore how to connect your web app, the user's input/output, the search index, and the LLM.
Keyword: Langchain, Databricks
You must be a registered user to add a comment. If you've already registered, sign in. Otherwise, register and sign in.