# Copyright 2020 The HuggingFace Datasets Authors and the current
# dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Get the co-occurance count for two words in each sentece in a dataset.
"""
import evaluate
import datasets
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import stanza
_DESCRIPTION = """\
Returns the co-occurrence count of two words in the input.
"""
_CITATION = ""
_KWARGS_DESCRIPTION = """
Calculates the co-occurrence count of two words in each sentence.
Args:
    `data`: a list of `str` containing the sentences of a dataset.
    `words`: a list of `[word1, word2]` pairs to check for co-occurrence.
Returns:
    The input `words` list with two values appended to each pair: the total number
    of sentences and the number of sentences in which both words occur.
Examples:
    >>> data = ["hello sun", "hello moon", "hello sun"]
    >>> c_count = evaluate.load("prb977/cooccurrence_count")
    >>> results = c_count.compute(data=data, words=[['hello', 'sun']])
    >>> print(results)
    [['hello', 'sun', 3, 2]]
"""
def check_count(x):
    """Return 1 if every count in the row is positive, i.e. both words occur in the sentence."""
    if np.all(np.asarray(x) > 0):
        return 1
    return 0
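# Illustrative check (values chosen for clarity): a row of per-word counts maps to 1
# only when both counts are positive, e.g. check_count(np.array([2, 1])) -> 1 and
# check_count(np.array([2, 0])) -> 0.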
# Stanza English tokenization pipeline used by `stanza_tokenizer` below.
nlp = stanza.Pipeline(lang='en', processors='tokenize')
def stanza_tokenizer(text):
    """Tokenize `text` with Stanza and return a flat list of token strings."""
    doc = nlp(text)
    tokens = []
    for sentence in doc.sentences:
        for token in sentence.tokens:
            tokens.append(token.text)
    return tokens
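# Illustrative output (assumes the Stanza English tokenize model is available):
#   stanza_tokenizer("Hello New York")  ->  ['Hello', 'New', 'York']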
@evaluate.utils.file_utils.add_start_docstrings(
_DESCRIPTION,
_KWARGS_DESCRIPTION
)
class CooccurrenceCount(evaluate.Measurement):
"""This measurement returns the co-occurrence count of two words."""
def _info(self):
return evaluate.MeasurementInfo(
module_type="measurement",
description=_DESCRIPTION,
citation=_CITATION,
inputs_description=_KWARGS_DESCRIPTION,
features=datasets.Features({
'data': datasets.Value('string')
}),
)
def _download_and_prepare(self, dl_manager):
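        """Download the Stanza English tokenization model used by `stanza_tokenizer`."""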
stanza.download('en', processors='tokenize')
def _compute(self, data, words):
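        """For each `[word1, word2]` pair in `words`, append the total number of
        sentences in `data` and the number of sentences in which both words occur."""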
        for each in words:
            word1 = each[0]
            word2 = each[1]
            # The n-gram range spans from the shorter to the longer of the two
            # (possibly multi-token) words, measured in Stanza tokens.
            len1 = len(stanza_tokenizer(word1))
            len2 = len(stanza_tokenizer(word2))
            lgram = min(len1, len2)
            ugram = max(len1, len2)
            # Build an analyzer only to normalize each word into the lowercased,
            # space-joined n-gram string that CountVectorizer uses as a vocabulary key
            # (illustratively, analyzer("New York")[-1] == "new york" when ugram >= 2).
            v = CountVectorizer(
                ngram_range=(lgram, ugram),
                tokenizer=stanza_tokenizer,
                lowercase=True
            )
            analyzer = v.build_analyzer()
            # Count occurrences of exactly those two n-grams in every sentence.
            vectorizer = CountVectorizer(
                ngram_range=(lgram, ugram),
                vocabulary={
                    analyzer(word1)[-1]: 0,
                    analyzer(word2)[-1]: 1
                },
                tokenizer=stanza_tokenizer,
                lowercase=True
            )
            co_occurrences = vectorizer.fit_transform(data)
            dense_mat = np.asarray(co_occurrences.todense())
count = len(data)
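            # A sentence contributes to the co-occurrence count only if both words appear in it.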
co_occurrence_count = np.sum(
np.apply_along_axis(check_count, axis=1, arr=dense_mat)
)
each.append(count)
each.append(co_occurrence_count)
return words
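# A minimal local usage sketch (hypothetical, not part of the measurement itself);
# it assumes network access so that `evaluate.load` can fetch this module from the
# Hub and Stanza can download its English tokenize model.
if __name__ == "__main__":
    c_count = evaluate.load("prb977/cooccurrence_count")
    example_data = ["hello sun", "hello moon", "hello sun"]
    print(c_count.compute(data=example_data, words=[["hello", "sun"]]))
    # Expected output, per the docstring example above: [['hello', 'sun', 3, 2]]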