Spaces:
Runtime error
Runtime error
from nltk.corpus import wordnet | |
import re | |
from nltk.stem import WordNetLemmatizer | |
stop_words = ['i', | |
'me', | |
'my', | |
'myself', | |
'we', | |
'our', | |
'ours', | |
'ourselves', | |
'you', | |
"you're", | |
"you've", | |
"you'll", | |
"you'd", | |
'your', | |
'yours', | |
'yourself', | |
'yourselves', | |
'he', | |
'him', | |
'his', | |
'himself', | |
'she', | |
"she's", | |
'her', | |
'hers', | |
'herself', | |
'it', | |
"it's", | |
'its', | |
'itself', | |
'they', | |
'them', | |
'their', | |
'theirs', | |
'themselves', | |
'what', | |
'which', | |
'who', | |
'whom', | |
'this', | |
'that', | |
"that'll", | |
'these', | |
'those', | |
'am', | |
'is', | |
'are', | |
'was', | |
'were', | |
'be', | |
'been', | |
'being', | |
'have', | |
'has', | |
'had', | |
'having', | |
'do', | |
'does', | |
'did', | |
'doing', | |
'a', | |
'an', | |
'the', | |
'and', | |
'but', | |
'if', | |
'or', | |
'because', | |
'as', | |
'until', | |
'while', | |
'of', | |
'at', | |
'by', | |
'for', | |
'with', | |
'about', | |
'against', | |
'between', | |
'into', | |
'through', | |
'during', | |
'before', | |
'after', | |
'above', | |
'below', | |
'to', | |
'from', | |
'up', | |
'down', | |
'in', | |
'out', | |
'on', | |
'off', | |
'over', | |
'under', | |
'again', | |
'further', | |
'then', | |
'once', | |
'here', | |
'there', | |
'when', | |
'where', | |
'why', | |
'how', | |
'all', | |
'any', | |
'both', | |
'each', | |
'few', | |
'more', | |
'most', | |
'other', | |
'some', | |
'such', | |
'no', | |
'nor', | |
'not', | |
'only', | |
'own', | |
'same', | |
'so', | |
'than', | |
'too', | |
'very', | |
's', | |
't', | |
'can', | |
'will', | |
'just', | |
'don', | |
"don't", | |
'should', | |
"should've", | |
'now', | |
'd', | |
'll', | |
'm', | |
'o', | |
're', | |
've', | |
'y', | |
'ain', | |
'aren', | |
"aren't", | |
'couldn', | |
"couldn't", | |
'didn', | |
"didn't", | |
'doesn', | |
"doesn't", | |
'hadn', | |
"hadn't", | |
'hasn', | |
"hasn't", | |
'haven', | |
"haven't", | |
'isn', | |
"isn't", | |
'ma', | |
'mightn', | |
"mightn't", | |
'mustn', | |
"mustn't", | |
'needn', | |
"needn't", | |
'shan', | |
"shan't", | |
'shouldn', | |
"shouldn't", | |
'wasn', | |
"wasn't", | |
'weren', | |
"weren't", | |
'won', | |
"won't", | |
'wouldn', | |
"wouldn't", | |
"its", | |
"whats", | |
"im", | |
"youre", | |
"hes", | |
"shes", | |
"were", | |
"theyre", | |
"cant", | |
"dont", | |
"wont", | |
"isnt", | |
"arent", | |
"wasnt", | |
"werent", | |
"couldnt", | |
"shouldnt", | |
"wouldnt", | |
"ive", | |
"youve", | |
"weve", | |
"theyve", | |
"id", | |
"youd", | |
"lets", | |
"thats", | |
"theres", | |
"heres", | |
"ill", | |
"hell", | |
"shell", | |
"mustnt", | |
"mightnt", | |
"shant", | |
"neednt", | |
"oclock", | |
"cause", | |
"gimme", | |
"wanna", | |
"gonna", | |
"kinda", | |
"sorta", | |
"lemme", | |
"aint", | |
"dunno", | |
"gotta", | |
"yall"] | |
# Create a lemmatizer object | |
lemmatizer = WordNetLemmatizer() | |
#from english_words import get_english_words_set | |
#web2lowerset = get_english_words_set(['web2'], lower=True) | |
# Define the Unicode range for Hindi letters | |
HINDI_UNICODE_RANGE = (0x0900, 0x097F) | |
# Function to check if a given character is a Hindi letter | |
def is_hindi_letter(c): | |
return ord(c) >= HINDI_UNICODE_RANGE[0] and ord(c) <= HINDI_UNICODE_RANGE[1] | |
# In[8]: | |
def en_hi_detection(text): | |
text = re.sub(r'[^\w\s]', ' ', text) | |
words = text.lower().strip().split() | |
count_en = 0 | |
# Lemmatize words for all POS | |
for word in words: | |
for pos in [wordnet.NOUN, wordnet.VERB, wordnet.ADJ, wordnet.ADV]: | |
# print(f"{word} ({pos}): {lemmatizer.lemmatize(word, pos)}") | |
lem_word = lemmatizer.lemmatize(word, pos) | |
if lem_word in wordnet.words(): | |
print("wordnet :",lem_word) | |
count_en+=1 | |
break | |
elif lem_word in stop_words: | |
print("stop_words :",lem_word) | |
count_en+=1 | |
break | |
#print("total english words found :", count_en) | |
#print("length of sentence :", len(words)) | |
#print(count_en/len(words)*100, "% english words found") | |
count = 0 | |
# Check each word for Hindi letters and print the results | |
for word in words: | |
hindi_letters = [] | |
for c in word: | |
if is_hindi_letter(c): | |
hindi_letters.append(c) | |
if hindi_letters: | |
#print(f"Word '{word}' contains Hindi letters: {' '.join(hindi_letters)}") | |
count+=1 | |
else: | |
pass | |
#print(f"Word '{word}' does not contain any Hindi letters.") | |
#print(count/len(words)*100, "% Hindi words found") | |
if count_en/len(words)*100>70: | |
return "eng" | |
elif count/len(words)*100>75: | |
return "hi" | |
else : | |
return "unknown" | |