Explore, process, and version data with your favorite tools
from oxen import RemoteRepo

# Point at the remote repository, then download a single file from it.
repo = RemoteRepo("datasets/GettingStarted")
repo.download("tables/llm_fine_tune.jsonl")
import pandas as pd

# URL Format: https://hub.oxen.ai/api/repos/{username}/{repo_name}/file/{revision}/{file_path}
url = "https://hub.oxen.ai/api/repos/datasets/GettingStarted/file/main/tables/llm_fine_tune.jsonl"

# FSSpec Format: oxen://{username}:{repo_name}@{revision}/{path}
# NOTE: this rebinds `url`, so the FSSpec form is the one actually read below;
# remove this assignment to read through the plain HTTPS URL instead.
url = "oxen://datasets:GettingStarted@main/tables/llm_fine_tune.jsonl"

# lines=True parses the input as one JSON object per line.
df = pd.read_json(url, lines=True)
model
import matplotlib.pyplot as plt

# Count how many rows carry each distinct value of the 'model' column.
# Uses `df`, the name the frame is bound to elsewhere in this guide.
category_counts = df['model'].value_counts()

# Draw the counts as a bar chart and label it.
category_counts.plot(kind='bar')
plt.title('Model Distribution')
plt.xlabel('Models')
plt.ylabel('Counts')
# NOTE(review): trailing expression, presumably so a notebook cell renders
# the current axes - confirm.
plt.gca()
Internal Thoughts
response
# Matches a response that begins with an "Internal Thoughts" marker (plain,
# *italic:* or **bold:** form) together with the rest of that first line.
internal_thoughts_prefix = r'^(Internal Thoughts|\*Internal Thoughts:\*|\*\*Internal Thoughts:\*\*).*'

# Replace the matched prefix with an empty string in the 'response' column.
df['response'] = df['response'].replace(
    internal_thoughts_prefix,
    '',
    regex=True,
)
pandas
RemoteRepo
# Write the frame back to the repository as JSON Lines, the same format it was
# read in with `pd.read_json(..., lines=True)`. pandas has no
# `DataFrame.write_parquet`, and the destination is a .jsonl file, so
# `to_json` is the matching writer. `lines=True` requires orient="records",
# which does not emit the index, so no `index=False` is needed.
df.to_json(
    "oxen://datasets:GettingStarted@main/tables/llm_fine_tune.jsonl",
    orient="records",
    lines=True,
)
from oxen import RemoteRepo

repo = RemoteRepo("datasets/GettingStarted")

# Stage the local file under the "tables" directory of the remote repo,
# then record the change with a commit message.
repo.add("llm_fine_tune.jsonl", dst="tables")
repo.commit("Cleaned 'Internal thoughts' string from the start of the response column")