# Author: Christopher Román Jaimes
# chore: decrease threshold.
# commit: abdb6f5
# Datetime
import datetime
# Manipulate
import re
import json
import pandas as pd
# App
import gradio as gr
# GLiNER Model
from gliner import GLiNER
# Transformers
from transformers import pipeline
# --- Model loading (runs once at import time) ---
# GLiNER NER model fine-tuned for real-estate entity extraction
model = GLiNER.from_pretrained("chris32/gliner_multi_pii_real_state-v2")
# Inference mode (disables dropout etc.)
model.eval()
# DistilBERT text classifier, run on CPU via the transformers pipeline
model_name = "chris32/distilbert-base-spanish-uncased-finetuned-text-intelligence"
pipe = pipeline(model = model_name, device = "cpu")
# Global Variables: used when post-cleaning inference results
YEAR_OF_REMODELING_LIMIT = 100  # max age (in years) a remodeling year may lie in the past
CURRENT_YEAR = int(datetime.date.today().year)
SCORE_LIMIT_SIMILARITY_NAMES = 70  # NOTE(review): appears unused in this file — confirm before removing
def clean_text(text):
    """Normalize raw listing text: strip HTML, unescape basic entities,
    collapse whitespace, and de-duplicate '..' / ',,' punctuation."""
    # <br> variants become " # " so line breaks survive tag removal as separators
    text = re.sub(r'<br\s*\/?>', " # ", text)
    # Strip any remaining HTML tags, then the two entities we care about
    text = re.sub(r'<[^>]*>', '', text)
    text = re.sub(r'&nbsp;', ' ', text)
    text = re.sub(r'&amp;', '&', text)
    # Collapse whitespace runs into single spaces and trim the ends
    text = re.sub(r'\s+', ' ', text).strip()
    # Collapse doubled punctuation left behind by the substitutions above
    return text.replace("..", ".").replace(",,", ",")
def format_gliner_predictions(prediction):
    """Collapse raw GLiNER spans into one flat dict keyed by
    pred_<LABEL> (text), prob_<LABEL> (score) and pos_<LABEL> ((start, end)).

    Only the highest-scoring span per label is kept. An empty prediction
    list yields an empty dict.
    """
    if len(prediction) == 0:
        return dict()
    # Keep only the best-scoring row for each entity label
    best_per_label = (
        pd.DataFrame(prediction)
        .sort_values("score", ascending = False)
        .drop_duplicates(subset = "label", keep = "first")
    )
    # (start, end) tuple for every surviving entity
    best_per_label["position"] = best_per_label.apply(lambda row: (row["start"], row["end"]), axis = 1)
    # Prefixed output keys for each of the three value families
    best_per_label["key_text"] = "pred_" + best_per_label["label"]
    best_per_label["key_prob"] = "prob_" + best_per_label["label"]
    best_per_label["key_pos"] = "pos_" + best_per_label["label"]
    texts = best_per_label.set_index("key_text")["text"].to_dict()
    scores = best_per_label.set_index("key_prob")["score"].to_dict()
    positions = best_per_label.set_index("key_pos")["position"].to_dict()
    return {**texts, **scores, **positions}
def clean_prediction(row, feature_name, threshols_dict, clean_functions_dict):
    """Apply the per-feature cleaning function to a raw prediction when its
    probability exceeds the feature's threshold; otherwise return None."""
    prediction = row[f"pred_{feature_name}"]
    prob = row[f"prob_{feature_name}"]
    # Below (or at) the threshold the prediction is discarded
    if prob <= threshols_dict[feature_name]:
        return None
    return clean_functions_dict[feature_name](prediction)
# --- Surface (area) extraction helpers ---
# Tokens that disqualify a candidate surface string (hectares, litres, "mil", ...)
surfaces_words_to_omit = ["ha", "hect", "lts", "litros", "mil"]
# NOTE(review): appears unused in this file — confirm before removing.
tower_name_key_words_to_keep = ["torr", "towe"]

def has_number(string):
    """Return True if the string contains at least one digit."""
    return bool(re.search(r'\d', string))

def contains_multiplication(string):
    """Return True if the string looks like a multiplication, e.g. "10 x 20"."""
    pattern = r'\b([\d,]+(?:\.\d+)?)\s*(?:\w+\s*)*[xX]\s*([\d,]+(?:\.\d+)?)\s*(?:\w+\s*)*\b'
    return re.search(pattern, string) is not None

def extract_first_number_from_string(text):
    """Return (number, start, end) for the first number in *text*.

    The number is truncated to int (e.g. "120.5" -> 120). Returns
    (None, None, None) for non-strings or when no number is found.
    """
    if not isinstance(text, str):
        return None, None, None
    match = re.search(r'\b\d*\.?\d+\b|\d*\.?\d+', text)
    if match is None:
        return None, None, None
    return int(float(match.group())), match.start(), match.end()

def get_character(string, index):
    """Return string[index], or None when the index is past the end."""
    return string[index] if len(string) > index else None

def find_valid_comma_separated_number(string):
    """Parse a leading "1,234"-style number (comma as thousands separator).

    Requires 1-3 digits, a comma, exactly 3 digits, and no digit/comma
    immediately after; returns the int value or None.
    """
    match = re.match(r'^(\d{1,3},\d{3})(?:[^0-9,]|$)', string)
    if match is None:
        return None
    return int(match.group(1).replace(",", ""))

def extract_surface_from_string(string: str) -> int:
    """Extract an integer surface value from a raw entity string.

    Returns None for non-strings, strings without digits, multiplications
    ("10 x 20"), strings containing disqualifying unit words, or invalid
    comma-separated numbers.
    """
    if not isinstance(string, str):
        return None
    # 1-3. Reject digit-less strings, multiplications, and omitted unit words
    if not has_number(string):
        return None
    if contains_multiplication(string):
        return None
    if any(word in string.lower() for word in surfaces_words_to_omit):
        return None
    # 4. First number and its span in the original string
    number, start_pos, end_pos = extract_first_number_from_string(string)
    if not isinstance(number, int):
        return None
    # 5. A comma right after the number means a possible thousands separator
    if get_character(string, end_pos) == ",":
        # BUG FIX: was string[start_pos:-1], which dropped the last character —
        # exact inputs like "1,500" failed to parse, and "1,5001" wrongly
        # truncate-parsed to 1500. Pass the full tail instead.
        return find_valid_comma_separated_number(string[start_pos:])
    return number
def clean_prediction(row, feature_name, threshols_dict, clean_functions_dict):
    """Return the cleaned prediction when its probability clears the
    feature's threshold, else None.

    NOTE(review): this definition duplicates an identical one earlier in
    this file; consider removing one of the two.
    """
    prediction = row[f"pred_{feature_name}"]
    prob = row[f"prob_{feature_name}"]
    if prob > threshols_dict[feature_name]:
        cleaner = clean_functions_dict[feature_name]
        return cleaner(prediction)
    return None
def extract_remodeling_year_from_string(string):
    """Extract a plausible remodeling year from free text.

    Accepts either an explicit 4-digit year ("remodeled in 2015") or a
    count of years back ("5 years ago" / "5 años"), which is subtracted
    from CURRENT_YEAR. Returns None when nothing is found or the year is
    in the future / older than YEAR_OF_REMODELING_LIMIT.
    """
    if not isinstance(string, str):
        return None
    # Preferred: an explicit 4-digit year
    match = re.search(r'\b\d{4}\b', string)
    if match:
        year_predicted = int(match.group())
    else:
        # Fallback: "<n> year(s)/anio/año/an/añ" counted back from today
        match = re.search(r'(\d+) (year|years|anio|año|an|añ)', string.lower(), re.IGNORECASE)
        if not match:
            return None
        year_predicted = CURRENT_YEAR - int(match.group(1))
    # Only accept years that are not in the future and not too far back
    is_valid_year = (year_predicted <= CURRENT_YEAR) and (YEAR_OF_REMODELING_LIMIT > CURRENT_YEAR - year_predicted)
    return year_predicted if is_valid_year else None
def extract_valid_string_left_dotted(string, text, pos):
    """Extend a digit-leading entity string leftwards across a dot.

    When the character just left of the span is "." preceded by a digit
    (e.g. the span "500 m2" inside "1.500 m2"), try to recover the full
    dotted number from a 5-character left context window. Returns the
    extended match, the original string when no extension applies, or
    None for invalid input / a failed extension.
    """
    if not isinstance(string, str):
        return None
    left_pos, rigth_pos = pos
    # Too close to the start of the text for a 5-char left window
    if left_pos < 5:
        return None
    # Only digit-leading spans are candidates for extension
    if not string[0].isdigit():
        return string
    # No dot immediately to the left: nothing to extend
    if text[left_pos - 1] != ".":
        return string
    # Dot present but not preceded by a digit: nothing to extend
    if not text[left_pos - 2].isdigit():
        return string
    # Look for "<1-3 digits>.<string>" inside the 5-char left context
    sub_text = text[left_pos - 5: rigth_pos]
    pattern = r'^(?![\d.,])\D*\d{1,3}\.' + re.escape(string)
    match = re.search(pattern, sub_text)
    return match.group(0) if match else None
# Cleaning: per-label post-processing applied to raw GLiNER span texts.
clean_functions_dict = {
    "SUPERFICIE_TERRAZA": extract_surface_from_string,
    "SUPERFICIE_JARDIN": extract_surface_from_string,
    "SUPERFICIE_TERRENO": extract_surface_from_string,
    "SUPERFICIE_HABITABLE": extract_surface_from_string,
    "SUPERFICIE_BALCON": extract_surface_from_string,
    "AÑO_REMODELACIÓN": extract_remodeling_year_from_string,
    # Name-like entities are kept verbatim
    "NOMBRE_COMPLETO_ARQUITECTO": lambda x: x,
    'NOMBRE_CLUB_GOLF': lambda x: x,
    'NOMBRE_TORRE': lambda x: x,
    'NOMBRE_CONDOMINIO': lambda x: x,
    'NOMBRE_DESARROLLO': lambda x: x,
}
# Per-label probability thresholds; predictions at or below are discarded.
# FIX: removed a dead duplicate dict (every label at 0.9) that was assigned
# immediately before this one and silently overwritten.
# The "- 0.0x" terms are manual relaxations of the originally tuned values.
threshols_dict = {
    "SUPERFICIE_BALCON": 0.7697697697697697,
    "SUPERFICIE_TERRAZA": 0.953953953953954,
    "SUPERFICIE_JARDIN": 0.9519519519519519,  # tuned value was uncertain ("idk" in original)
    "SUPERFICIE_TERRENO": 0.980980980980981 - 0.05,
    "SUPERFICIE_HABITABLE": 0.978978978978979 - 0.02,  # alternative tuned value: 0.988988988988989
    "AÑO_REMODELACIÓN": 0.996996996996997 - 0.01,
    "NOMBRE_COMPLETO_ARQUITECTO": 0.8878878878878879,
    "NOMBRE_CLUB_GOLF": 0.8708708708708709,  # alternative tuned value: 0.9729729729729729
    "NOMBRE_TORRE": 0.8458458458458459 - 0.04,
    "NOMBRE_CONDOMINIO": 0.965965965965966,
    "NOMBRE_DESARROLLO": 0.9229229229229229
}
# Map BERT classifier labels to output values; LABEL_0 maps to None
# (presumably "no level / not stated" — confirm against the training labels).
label_names_dict = {
    'LABEL_0': None,
    'LABEL_1': 1,
    'LABEL_2': 2,
    'LABEL_3': 3,
}
# Minimum classifier confidence required to keep a BERT prediction
BERT_SCORE_LIMIT = 0.980819808198082
def extract_max_label_score(probabilities):
    """Return (label, score) of the highest-scoring entry in a list of
    {'label': ..., 'score': ...} dicts."""
    best = max(probabilities, key=lambda item: item['score'])
    return best['label'], best['score']
def clean_prediction_bert(label, score):
    """Map a BERT label to its output value when the score clears
    BERT_SCORE_LIMIT; otherwise return None. Unknown labels map to None."""
    if score <= BERT_SCORE_LIMIT:
        return None
    return label_names_dict.get(label, None)
# BERT Inference Config: keyword arguments forwarded to the transformers pipeline.
pipe_config = {
    "batch_size": 8,  # texts per forward pass
    "truncation": True,  # cut inputs longer than max_length
    "max_length": 250,  # token cap per input
    "add_special_tokens": True,
    "return_all_scores": True,  # NOTE(review): deprecated in newer transformers (use top_k=None) — confirm installed version
    "padding": True,
}
def generate_answer(text):
    """Run the full extraction pipeline on a property-description text.

    Steps: clean the text, run GLiNER entity extraction, optionally extend
    surface spans leftwards across a dot, threshold-and-clean every entity,
    then run the BERT classifier for "NIVELES_CASA". Returns a string with
    both the cleaned and the raw results as JSON.
    """
    labels = [
        'SUPERFICIE_JARDIN',
        'NOMBRE_CLUB_GOLF',
        'SUPERFICIE_TERRENO',
        'SUPERFICIE_HABITABLE',
        'SUPERFICIE_TERRAZA',
        'NOMBRE_COMPLETO_ARQUITECTO',
        'SUPERFICIE_BALCON',
        'NOMBRE_DESARROLLO',
        'NOMBRE_TORRE',
        'NOMBRE_CONDOMINIO',
        'AÑO_REMODELACIÓN'
    ]
    # Clean Text
    text = clean_text(text)
    # GLiNER inference; low threshold here — final filtering uses threshols_dict
    entities = model.predict_entities(text, labels, threshold=0.4)
    # Format Prediction Entities
    entities_formatted = format_gliner_predictions(entities)
    # Extend surface spans leftwards across a dot when applicable
    feature_surfaces = ['SUPERFICIE_BALCON', 'SUPERFICIE_TERRAZA', 'SUPERFICIE_JARDIN', 'SUPERFICIE_TERRENO', 'SUPERFICIE_HABITABLE']
    for feature_name in feature_surfaces:
        # idiom fix: was "!= None"
        if entities_formatted.get(f"pred_{feature_name}") is not None:
            entities_formatted[f"pred_{feature_name}"] = extract_valid_string_left_dotted(
                entities_formatted[f"pred_{feature_name}"], text, entities_formatted[f"pos_{feature_name}"]
            )
    # Recover entity names by stripping the pred_/prob_/pos_ prefixes
    entities_names = list({c.replace("pred_", "").replace("prob_", "").replace("pos_", "") for c in entities_formatted})
    entities_cleaned = dict()
    for feature_name in entities_names:
        entity_prediction_cleaned = clean_prediction(entities_formatted, feature_name, threshols_dict, clean_functions_dict)
        if isinstance(entity_prediction_cleaned, (str, int)):
            entities_cleaned[feature_name] = entity_prediction_cleaned
    # BERT inference for "NIVELES_CASA"
    predictions = pipe([text], **pipe_config)
    label, score = extract_max_label_score(predictions[0])
    entities_formatted["NIVELES_CASA"] = label
    entities_formatted["prob_NIVELES_CASA"] = score
    prediction_cleaned = clean_prediction_bert(label, score)
    if isinstance(prediction_cleaned, int):
        entities_cleaned["NIVELES_CASA"] = prediction_cleaned
    result_json = json.dumps(entities_cleaned, indent = 4, ensure_ascii = False)
    return "Clean Result:" + result_json + "\n \n" + "Raw Result:" + json.dumps(entities_formatted, indent = 4, ensure_ascii = False)
# TODO: switch to a multi-line text input
#text_input = gr.inputs.Textbox(lines=15, label="Input Text")
iface = gr.Interface(
    fn=generate_answer,
    inputs="text",
    outputs="text",
    title="Text Intelligence for Real State",
    description="Input text describing the property."
)
# Launch the Gradio app (blocking call)
iface.launch()