# Copyright 2020 The HuggingFace Datasets Authors and the current
# dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Get the co-occurance count for two words in each sentece in a dataset.
"""
import evaluate
import datasets
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import stanza
_DESCRIPTION = """\
Returns the co-occurrence count of two words in the input.
"""
_CITATION = ""
_KWARGS_DESCRIPTION = """
Calculates the co-occurence of two words in each sentence.
Args:
`data`: a list of `str` which containes a dataset.
`words`: list of list of two words that we want to check for
Returns:
Examples:
>>> data = ["hello sun","hello moon", "hello sun"]
>>> c_count = evaluate.load("prb977/cooccurrence_count")
>>> results = c_count.compute(data=data, words=[['hello','sun']\)
>>> print(results)
[['hello','sun',3,2]]
"""
def check_count(x):
    """Return 1 if every count in row `x` is positive, else 0.

    `x` is one row of the dense document-term count matrix built in
    `CooccurrenceCount._compute` (one column per vocabulary word), so a
    return of 1 means every tracked word occurs in that sentence.

    The original test `x[0].all() <= 0` happened to work for np.matrix
    rows (where x[0] is the whole row) but inspected only the FIRST
    element when given a plain 1-D ndarray row. Flattening via
    np.asarray(...).ravel() handles both representations identically.
    """
    return int(np.all(np.asarray(x).ravel() > 0))
# The stanza pipeline is created lazily on first use rather than at import
# time: the original module-level `stanza.Pipeline(...)` ran BEFORE
# `_download_and_prepare` had a chance to fetch the English models, so a
# fresh environment failed at import. `nlp` stays a module-level name for
# backward compatibility.
nlp = None


def stanza_tokenizer(sen):
    """Tokenize the string `sen` with the stanza English tokenizer.

    Returns a flat list of token strings across all sentences that
    stanza detects in `sen`.
    """
    global nlp
    if nlp is None:
        nlp = stanza.Pipeline(lang='en', processors='tokenize')
    doc = nlp(sen)
    # Flatten tokens from every detected sentence into a single list.
    return [token.text for sentence in doc.sentences for token in sentence.tokens]
@evaluate.utils.file_utils.add_start_docstrings(
    _DESCRIPTION,
    _KWARGS_DESCRIPTION
)
class CooccurrenceCount(evaluate.Measurement):
    """This measurement returns the co-occurrence count of two words.

    For each word pair it reports the number of input sentences and the
    number of those sentences in which both words (which may themselves be
    multi-token phrases) occur.
    """

    def _info(self):
        return evaluate.MeasurementInfo(
            module_type="measurement",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features({
                'data': datasets.Value('string')
            }),
        )

    def _download_and_prepare(self, dl_manager):
        # Fetch the English tokenization models required by stanza_tokenizer.
        stanza.download('en', processors='tokenize')

    def _compute(self, data, words):
        """Count, for each pair in `words`, how many entries of `data`
        contain both words.

        Args:
            data: list of sentence strings to scan.
            words: list of [word1, word2] pairs; each pair may be a
                multi-token phrase. Each inner list is mutated in place:
                len(data) and the co-occurrence count are appended.
        Returns:
            `words`, with the two counts appended to every pair.
        """
        for pair in words:
            word1 = pair[0]
            word2 = pair[1]
            # The n-gram range must cover the token lengths of both
            # phrases so each one maps to a single vocabulary term.
            len1 = len(stanza_tokenizer(word1))
            len2 = len(stanza_tokenizer(word2))
            lgram = min(len1, len2)
            ugram = max(len1, len2)
            # Build an analyzer only to obtain the canonical (lowercased,
            # tokenized) n-gram term for each phrase; [-1] picks the
            # longest n-gram, i.e. the full phrase.
            v = CountVectorizer(
                ngram_range=(lgram, ugram),
                tokenizer=stanza_tokenizer,
                lowercase=True
            )
            analyzer = v.build_analyzer()
            # NOTE(review): if both phrases normalize to the same term the
            # vocabulary collapses to one entry and sklearn will reject
            # it — assumes word1 != word2; confirm with callers.
            vectorizer = CountVectorizer(
                ngram_range=(lgram, ugram),
                vocabulary={
                    analyzer(word1)[-1]: 0,
                    analyzer(word2)[-1]: 1
                },
                tokenizer=stanza_tokenizer,
                lowercase=True
            )
            co_occurrences = vectorizer.fit_transform(data)
            dense_mat = co_occurrences.todense()
            # A row counts as a co-occurrence when both columns are > 0.
            co_occurrence_count = np.sum(
                np.apply_along_axis(check_count, axis=1, arr=dense_mat)
            )
            pair.append(len(data))
            pair.append(co_occurrence_count)
        return words