# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Get the co-occurrence count for two words in each sentence in a dataset."""

import datasets
import evaluate
import numpy as np
import stanza
from sklearn.feature_extraction.text import CountVectorizer

_DESCRIPTION = """\
Returns the co-occurrence count of two words in the input.
"""

_CITATION = ""

_KWARGS_DESCRIPTION = """
Calculates the co-occurrence of two words in each sentence.
Args:
    `data`: a list of `str` containing the sentences of a dataset.
    `words`: a list of word pairs (each a list of two `str`) to check for.
Returns:
    The input `words` list, where each pair is extended with the total
    number of sentences and the number of sentences in which both words
    occur.
Examples:
    >>> data = ["hello sun", "hello moon", "hello sun"]
    >>> c_count = evaluate.load("prb977/cooccurrence_count")
    >>> results = c_count.compute(data=data, words=[['hello', 'sun']])
    >>> print(results)
    [['hello', 'sun', 3, 2]]
"""


def check_count(row):
    """Return 1 if every word of the pair occurs in the sentence, else 0."""
    if np.all(row > 0):
        return 1
    return 0


_NLP = None


def _get_pipeline():
    """Build the Stanza tokenization pipeline lazily, so the model is only
    loaded after `_download_and_prepare` has had a chance to fetch it."""
    global _NLP
    if _NLP is None:
        _NLP = stanza.Pipeline(lang="en", processors="tokenize")
    return _NLP


def stanza_tokenizer(text):
    """Tokenize `text` with Stanza and return a flat list of token strings."""
    doc = _get_pipeline()(text)
    return [token.text for sentence in doc.sentences for token in sentence.tokens]


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class CooccurrenceCount(evaluate.Measurement):
    """This measurement returns the co-occurrence count of two words."""

    def _info(self):
        return evaluate.MeasurementInfo(
            module_type="measurement",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features({"data": datasets.Value("string")}),
        )

    def _download_and_prepare(self, dl_manager):
        stanza.download("en", processors="tokenize")

    def _compute(self, data, words):
        for each in words:
            word1, word2 = each[0], each[1]

            # A "word" may tokenize into several tokens, so the n-gram range
            # has to span the token lengths of both phrases.
            len1 = len(stanza_tokenizer(word1))
            len2 = len(stanza_tokenizer(word2))
            lgram, ugram = min(len1, len2), max(len1, len2)

            # Use the vectorizer's own analyzer to normalize each phrase
            # (lowercasing + tokenization); its last n-gram is the full
            # phrase, which becomes the vocabulary entry.
            v = CountVectorizer(
                ngram_range=(lgram, ugram),
                tokenizer=stanza_tokenizer,
                lowercase=True,
            )
            analyzer = v.build_analyzer()
            vectorizer = CountVectorizer(
                ngram_range=(lgram, ugram),
                vocabulary={analyzer(word1)[-1]: 0, analyzer(word2)[-1]: 1},
                tokenizer=stanza_tokenizer,
                lowercase=True,
            )

            # One row per sentence, one column per word; a sentence counts as
            # a co-occurrence when both columns are non-zero.
            co_occurrences = vectorizer.fit_transform(data)
            dense_mat = co_occurrences.toarray()
            count = len(data)
            co_occurrence_count = np.sum(
                np.apply_along_axis(check_count, axis=1, arr=dense_mat)
            )
            each.append(count)
            each.append(int(co_occurrence_count))
        return words
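

# A minimal local smoke test (an illustrative sketch, not part of the
# published module): the usual entry point is
# `evaluate.load("prb977/cooccurrence_count")`, but the class can also be
# exercised directly, assuming the Stanza English model is fetched first.
if __name__ == "__main__":
    stanza.download("en", processors="tokenize")  # same model `_download_and_prepare` fetches
    measurement = CooccurrenceCount()
    sample = ["hello sun", "hello moon", "hello sun"]
    # Expected output: [['hello', 'sun', 3, 2]]
    # (3 sentences in total, 2 of them containing both words).
    print(measurement.compute(data=sample, words=[["hello", "sun"]]))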