Audio transcription
In [1]:
Copied!
from indexify import IndexifyClient
client = IndexifyClient(service_url="http://100.114.163.163:8900")
from indexify import IndexifyClient
client = IndexifyClient(service_url="http://100.114.163.163:8900")
!pip install pandas
In [78]:
Copied!
import pandas

# Load the oral-arguments export. low_memory=False stops pandas from inferring
# column types chunk by chunk, which is what triggers the "Columns (6) have
# mixed types" DtypeWarning on this file.
data = pandas.read_csv("~/Downloads/oral-arguments-2022-08-03.csv", low_memory=False)

# Number of rows loaded.
len(data)
/var/folders/lr/j5bzp2894pq9hv33bhl4hkhr0000gn/T/ipykernel_13820/3150347646.py:2: DtypeWarning: Columns (6) have mixed types. Specify dtype option on import or set low_memory=False. data = pandas.read_csv("~/Downloads/oral-arguments-2022-08-03.csv")
Out[78]:
79719
Install Indexify and start the server:
curl https://www.tensorlake.ai | sh
indexify server -d
Download the Whisper extractor and connect it to the server:
indexify-extractor download hub://audio/whisper-asr
cd indexify-extractor
indexify-extractor join-server whisper:WhisperExtractor
Alternatively, run the Whisper extractor with Docker:
docker run -d -v /tmp/indexify-blob-storage:/tmp/indexify-blob-storage -p 9500:9500 tensorlake/whisper-asr join-server --workers=1 --coordinator-addr=host.docker.internal:8950 --ingestion-addr=host.docker.internal:8900 --advertise-addr=0.0.0.0:9500
In [81]:
Copied!
# Keep only the two columns used for ingestion.
data = data[["download_url", "docket_id"]]

# Keep rows whose download_url starts with "http". na=False makes rows with a
# missing URL evaluate to False instead of NaN, so the mask is always boolean
# and can be used for indexing.
has_http_url = data["download_url"].str.startswith("http", na=False)
non_empty_download_urls = data[has_http_url]
print(len(non_empty_download_urls))

# Drop rows that repeat a download_url, keeping the first occurrence of each.
unique_urls = non_empty_download_urls.drop_duplicates(subset=["download_url"])
len(unique_urls)
79719
Out[81]:
79539
In [127]:
Copied!
# Ingest the first 100 unique recordings by URL, attaching each row's
# docket_id (as a string) as a label on the ingested file.
for _, row in unique_urls.head(100).iterrows():
    client.ingest_remote_file(
        url=row["download_url"],
        mime_type="audio/mpeg",
        labels={"docket_id": str(row["docket_id"])},
    )
In [110]:
Copied!
# Register a single extraction policy named "audio-transcription" that uses
# the tensorlake/whisper-asr extractor.
client.add_extraction_policy(
    extractor="tensorlake/whisper-asr",
    name="audio-transcription",
)
In [ ]:
Copied!