bioclip-demo

Sleeping

App Files Files Community

bioclip-demo / components /query.py

thompsonmj

Retrieve example TOL-10M image and representative EOL page for OE prediction

6277b48 4 months ago

raw

history blame

4.5 kB

	import io
	import boto3
	import requests
	import numpy as np
	import polars as pl
	from PIL import Image
	from botocore.config import Config
	import logging

	# Module-level logger named after this module.
	logger = logging.getLogger(__name__)

	# S3 for sample images: one client pinned to the us-east-1 region, created at
	# import time. get_sample() uses it to presign GET URLs for sample-image objects.
	my_config = Config(
	region_name='us-east-1'
	)
	s3_client = boto3.client('s3', config=my_config)

	# Set basepath for EOL pages for info; an EOL page ID is appended to form a link.
	EOL_URL = "https://eol.org/pages/"
	# Rank names ordered from kingdom down to species. The integer `rank` arguments
	# in this module index into this list, and each name is used as a DataFrame
	# column name when filtering.
	RANKS = ["kingdom", "phylum", "class", "order", "family", "genus", "species"]

	# (connect, read) timeouts in seconds for downloading the sample image, so a
	# stalled connection cannot block the caller forever.
	_SAMPLE_IMAGE_TIMEOUT = (5, 30)


	def get_sample(df, pred_taxon, rank):
	    '''
	    Function to retrieve a sample image of the predicted taxon and EOL page link for more info.

	    Failures are never raised to the caller: they are logged and reported through
	    the returned message, with None in place of the image.

	    Parameters:
	    -----------
	    df : DataFrame
	        DataFrame with all sample images listed and their filepaths (in "file_path" column).
	    pred_taxon : str
	        Predicted taxon of the uploaded image.
	    rank : int
	        Index of rank in RANKS chosen for prediction.

	    Returns:
	    --------
	    img : PIL.Image or None
	        Sample image of predicted taxon for display; None if no sample exists
	        or if it could not be retrieved.
	    eol_page : str
	        Message with the URL to the EOL page for the taxon (may be a lower rank,
	        e.g., species sample), or a message explaining why no image is available.
	    '''
	    logger.info("Getting sample for taxon: %s at rank: %s", pred_taxon, rank)
	    try:
	        filepath, eol_page_id, full_name, is_exact = get_sample_data(df, pred_taxon, rank)
	    except Exception as e:
	        # logger.exception keeps the traceback, which logger.error would drop.
	        logger.exception("Error retrieving sample data: %s", e)
	        return None, f"We encountered the following error trying to retrieve a sample image: {e}."
	    if filepath is None:
	        logger.warning("No sample image found for taxon: %s", pred_taxon)
	        return None, f"Sorry, our EOL images do not include {pred_taxon}."

	    # Get sample image of selected individual
	    try:
	        img_src = s3_client.generate_presigned_url(
	            'get_object',
	            Params={'Bucket': 'treeoflife-10m-sample-images', 'Key': filepath},
	        )
	        img_resp = requests.get(img_src, timeout=_SAMPLE_IMAGE_TIMEOUT)
	        # Surface HTTP failures (e.g. 403/404) as such, instead of letting PIL
	        # fail later on an error document with a misleading "cannot identify
	        # image file" message.
	        img_resp.raise_for_status()
	        img = Image.open(io.BytesIO(img_resp.content))
	    except Exception as e:
	        logger.exception("Error retrieving sample image: %s", e)
	        return None, f"We encountered the following error trying to retrieve a sample image: {e}."

	    # Building the message cannot fail, so it stays outside the try block.
	    if is_exact:
	        eol_page = f"Check out the EOL entry for {pred_taxon} to learn more: {EOL_URL}{eol_page_id}."
	    else:
	        eol_page = f"Check out an example EOL entry within {pred_taxon} to learn more: {full_name} {EOL_URL}{eol_page_id}."
	    logger.info("Successfully retrieved sample image and EOL page for %s", pred_taxon)
	    return img, eol_page

	def _join_taxon_names(names):
	    '''Join taxon names with single spaces, skipping null (None) and empty entries.'''
	    return " ".join(name for name in names if name)


	def get_sample_data(df, pred_taxon, rank):
	    '''
	    Function to randomly select a sample individual of the given taxon.

	    Rows are first narrowed to those matching the predicted taxon at every rank up
	    to and including `rank`. Rows whose lower ranks are all null or empty are
	    preferred (an "exact" match); otherwise any matching row is sampled.

	    Parameters:
	    -----------
	    df : DataFrame
	        DataFrame with all sample images listed and their filepaths (in "file_path" column).
	    pred_taxon : str
	        Predicted taxon of the uploaded image, as space-separated names ordered as in RANKS.
	    rank : int
	        Index of rank in RANKS chosen for prediction.

	    Returns:
	    --------
	    filepath : str or None
	        Filepath of selected sample image for predicted taxon; None if no row matches.
	    eol_page_id : str
	        EOL page ID associated with the selected sample for more information
	        (np.nan if no row matches).
	    full_name : str
	        Full taxonomic name of the selected sample ("" if no row matches).
	    is_exact : bool
	        Flag indicating if the match is exact (i.e., with empty lower ranks).

	    Raises:
	    -------
	    IndexError
	        If `pred_taxon` has fewer space-separated names than `rank + 1`, or
	        `rank` is beyond the last index of RANKS.
	    '''
	    # Split once rather than on every iteration; name i belongs to RANKS[i].
	    taxon_names = pred_taxon.split(" ")
	    for idx in range(rank + 1):
	        df = df.filter(pl.col(RANKS[idx]) == taxon_names[idx])

	    if df.height == 0:
	        return None, np.nan, "", False

	    # First, try to find entries with empty lower ranks
	    exact_df = df
	    for lower_rank in RANKS[rank + 1:]:
	        exact_df = exact_df.filter((pl.col(lower_rank).is_null()) | (pl.col(lower_rank) == ""))

	    # If no exact matches, fall back to any entry with the specified rank
	    is_exact = exact_df.height > 0
	    df_filtered = (exact_df if is_exact else df).sample()

	    # Lower ranks may be null or empty in either branch. Skipping them avoids a
	    # TypeError from str.join on None and stray spaces in the name; for exact
	    # matches this leaves just the names down to `rank`.
	    full_name = _join_taxon_names(df_filtered.select(RANKS).row(0))
	    return df_filtered["file_path"][0], df_filtered["eol_page_id"].cast(pl.String)[0], full_name, is_exact