# ClinicalTrialV3 / OpenAITools / FetchTools.py
# (Hugging Face page chrome — "raw / history / blame / 7.79 kB" — converted to
# comments so the module parses as Python.)
import os
import pandas as pd
#from llama_index.llms.replicate import Replicate
import requests
import re
def extract_japan_cities(text):
    """Return a sorted, comma-separated string of unique Japanese city names.

    Parses a comma-separated location string whose entries look like
    "<city> - <country>" and keeps only the cities located in Japan.

    Args:
        text: Location string such as "Tokyo - Japan, Boston - United States".

    Returns:
        Unique city names, sorted alphabetically and joined with ", ";
        an empty string when no Japanese location is present.
    """
    # Capture everything between a comma (or start of string) and " - Japan".
    # Using [^,]+? instead of word-character tokens keeps hyphenated or
    # multi-word city names (e.g. "Chuo-ku") intact.
    pattern = r'([^,]+?) - Japan(?=,|$)'
    cities = [city.strip() for city in re.findall(pattern, text)]
    # Deduplicate, sort, and join into a single comma-separated string.
    return ', '.join(sorted(set(cities)))
def fetch_clinical_trials(cancer_name):
    """Fetch recruiting clinical trials in Japan for a cancer name (English columns).

    Queries the ClinicalTrials.gov v2 ``/studies`` endpoint, following
    cursor-based pagination via ``nextPageToken``, and collects one row
    per study.

    Args:
        cancer_name: Cancer name inserted into the title search expression.

    Returns:
        pandas.DataFrame with columns NCTID, Title, Primary Completion Date,
        Cancer, Summary, Japanes Locations, Eligibility Criteria. Empty
        DataFrame when the request fails or nothing matches.
    """
    search_expr = "%s SEARCH[Location](AREA[LocationCountry]Japan AND AREA[LocationStatus]Recruiting)" % (cancer_name)
    base_url = "https://clinicaltrials.gov/api/v2/studies"
    params = {
        "query.titles": search_expr,
        "pageSize": 100
    }
    data_list = []
    # Follow pagination until the API stops returning a nextPageToken.
    while True:
        # Debug trace of the effective request URL.
        print("Fetching data from:", base_url + '?' + '&'.join([f"{k}={v}" for k, v in params.items()]))
        response = requests.get(base_url, params=params)
        if response.status_code != 200:
            print("Failed to fetch data. Status code:", response.status_code)
            break
        data = response.json()
        for study in data.get('studies', []):
            # Any module may be absent for a given study, so every nested
            # access goes through .get() with a default instead of [].
            protocol = study.get('protocolSection', {})
            identification = protocol.get('identificationModule', {})
            status = protocol.get('statusModule', {})
            locations_list = protocol.get('contactsLocationsModule', {}).get('locations', [])
            locations = ', '.join(
                f"{location.get('city', 'No City')} - {location.get('country', 'No Country')}"
                for location in locations_list
            ) if locations_list else "No locations listed"
            data_list.append({
                "NCTID": identification.get('nctId', 'Unknown'),
                "Title": identification.get('briefTitle', 'no title'),
                "Primary Completion Date": status.get('primaryCompletionDateStruct', {}).get('date', 'Unknown Date'),
                "Cancer": ', '.join(protocol.get('conditionsModule', {}).get('conditions', ['No conditions listed'])),
                "Summary": protocol.get('descriptionModule', {}).get('briefSummary', 'no summary'),
                "Japanes Locations": extract_japan_cities(locations),
                "Eligibility Criteria": protocol.get('eligibilityModule', {}).get('eligibilityCriteria', 'Unknown')
            })
        # Advance the cursor, or stop when the last page has been consumed.
        nextPageToken = data.get('nextPageToken')
        if nextPageToken:
            params['pageToken'] = nextPageToken
        else:
            break
    return pd.DataFrame(data_list)
def fetch_clinical_trials_jp(cancer_name):
    """Fetch recruiting clinical trials in Japan for a cancer name (Japanese columns).

    Same query and pagination as ``fetch_clinical_trials`` (ClinicalTrials.gov
    v2 ``/studies`` endpoint, ``nextPageToken`` cursor) but the resulting
    DataFrame uses Japanese column headers.

    Args:
        cancer_name: Cancer name inserted into the title search expression.

    Returns:
        pandas.DataFrame with columns NCTID, γ‚Ώγ‚€γƒˆγƒ« (title), 対豑とγͺγ‚‹η™Œ
        (target cancer), γ‚΅γƒžγƒͺγƒΌ (summary), 場所 (locations), クラむテγƒͺγ‚’
        (eligibility criteria). Empty DataFrame when the request fails or
        nothing matches.
    """
    search_expr = "%s SEARCH[Location](AREA[LocationCountry]Japan AND AREA[LocationStatus]Recruiting)" % (cancer_name)
    base_url = "https://clinicaltrials.gov/api/v2/studies"
    params = {
        "query.titles": search_expr,
        "pageSize": 100
    }
    data_list = []
    # Follow pagination until the API stops returning a nextPageToken.
    while True:
        # Debug trace of the effective request URL.
        print("Fetching data from:", base_url + '?' + '&'.join([f"{k}={v}" for k, v in params.items()]))
        response = requests.get(base_url, params=params)
        if response.status_code != 200:
            print("Failed to fetch data. Status code:", response.status_code)
            break
        data = response.json()
        for study in data.get('studies', []):
            # Any module may be absent for a given study, so every nested
            # access goes through .get() with a default instead of [].
            protocol = study.get('protocolSection', {})
            identification = protocol.get('identificationModule', {})
            locations_list = protocol.get('contactsLocationsModule', {}).get('locations', [])
            locations = ', '.join(
                f"{location.get('city', 'No City')} - {location.get('country', 'No Country')}"
                for location in locations_list
            ) if locations_list else "No locations listed"
            data_list.append({
                "NCTID": identification.get('nctId', 'Unknown'),
                "γ‚Ώγ‚€γƒˆγƒ«": identification.get('briefTitle', 'no title'),
                "対豑とγͺγ‚‹η™Œ": ', '.join(protocol.get('conditionsModule', {}).get('conditions', ['No conditions listed'])),
                "γ‚΅γƒžγƒͺγƒΌ": protocol.get('descriptionModule', {}).get('briefSummary', 'no summary'),
                "場所": extract_japan_cities(locations),
                "クラむテγƒͺγ‚’": protocol.get('eligibilityModule', {}).get('eligibilityCriteria', 'Unknown')
            })
        # Advance the cursor, or stop when the last page has been consumed.
        nextPageToken = data.get('nextPageToken')
        if nextPageToken:
            params['pageToken'] = nextPageToken
        else:
            break
    return pd.DataFrame(data_list)