# Copyright 2020 The HuggingFace Datasets Authors and the current
# dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Get the co-occurrence count for two words in each sentence of a dataset.
"""


import evaluate
import datasets
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import stanza


_DESCRIPTION = """\
Returns the co-occurrence count of two words in the input.
"""

_CITATION = ""

_KWARGS_DESCRIPTION = """
Calculates the co-occurrence of two words in each sentence of a dataset.
Args:
    `data`: a list of `str` containing the sentences of a dataset.
    `words`: a list of `[word1, word2]` pairs to check for co-occurrence.
Returns:
    The `words` list with two counts appended to each pair, i.e.
    `[word1, word2, total, co_occurrences]`, where `total` is the number of
    input sentences and `co_occurrences` is the number of sentences in which
    both words occur.
Examples:
    >>> data = ["hello sun", "hello moon", "hello sun"]
    >>> c_count = evaluate.load("prb977/cooccurrence_count")
    >>> results = c_count.compute(data=data, words=[['hello', 'sun']])
    >>> print(results)
    [['hello', 'sun', 3, 2]]
"""


def check_count(x):
    """Return 1 if both words occur in the sentence (all counts > 0), else 0."""
    return int(np.all(np.asarray(x) > 0))
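
# For example, given per-sentence counts for (word1, word2):
#   check_count(np.array([2, 0])) -> 0   (only the first word occurs)
#   check_count(np.array([1, 3])) -> 1   (both words occur)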


# Build the tokenizer pipeline lazily so the module can be imported before
# the English model has been downloaded (see _download_and_prepare below).
nlp = None


def stanza_tokenizer(sen):
    global nlp
    if nlp is None:
        nlp = stanza.Pipeline(lang='en', processors='tokenize')
    doc = nlp(sen)
    tokens = []
    for sentence in doc.sentences:
        for token in sentence.tokens:
            tokens.append(token.text)
    return tokens
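
# Illustrative only (actual output depends on the stanza English model):
# a "word" can itself be a phrase that splits into multiple tokens, which
# is what drives the n-gram sizing in _compute below:
#   stanza_tokenizer("New York")  ->  ['New', 'York']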


@evaluate.utils.file_utils.add_start_docstrings(
    _DESCRIPTION,
    _KWARGS_DESCRIPTION
)
class CooccurrenceCount(evaluate.Measurement):
    """This measurement returns the co-occurrence count of two words."""

    def _info(self):
        return evaluate.MeasurementInfo(
            module_type="measurement",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features({
                'data': datasets.Value('string')
            }),
        )

    def _download_and_prepare(self, dl_manager):
        stanza.download('en', processors='tokenize')

    def _compute(self, data, words):
        for each in words:
            word1 = each[0]
            word2 = each[1]
            # A "word" may itself tokenize into several tokens (e.g. a
            # phrase like "New York"), so size the n-gram range to cover
            # the longer of the two phrases.
            len1 = len(stanza_tokenizer(word1))
            len2 = len(stanza_tokenizer(word2))
            lgram = min(len1, len2)
            ugram = max(len1, len2)

            # Build a throwaway vectorizer only to get an analyzer that
            # normalizes each word exactly as the counting vectorizer will
            # (tokenization, lowercasing, n-gram extraction).
            v = CountVectorizer(
                ngram_range=(lgram, ugram),
                tokenizer=stanza_tokenizer,
                lowercase=True
            )
            analyzer = v.build_analyzer()
            # The analyzer emits shorter n-grams first, so the last entry
            # for each word is its full (lowercased) phrase; the two phrases
            # form a fixed two-entry vocabulary.
            vectorizer = CountVectorizer(
                ngram_range=(lgram, ugram),
                vocabulary={
                    analyzer(word1)[-1]: 0,
                    analyzer(word2)[-1]: 1
                },
                tokenizer=stanza_tokenizer,
                lowercase=True
            )
            co_occurrences = vectorizer.fit_transform(data)
            # Use a plain ndarray so check_count receives 1-D rows.
            dense_mat = co_occurrences.toarray()
            count = len(data)
            # Number of sentences in which both words appear at least once.
            co_occurrence_count = np.sum(
                np.apply_along_axis(check_count, axis=1, arr=dense_mat)
            )
            each.append(count)
            each.append(int(co_occurrence_count))
        return words
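

# Minimal usage sketch (mirrors the docstring example; assumes this module is
# published on the Hub as "prb977/cooccurrence_count"):
if __name__ == "__main__":
    c_count = evaluate.load("prb977/cooccurrence_count")
    results = c_count.compute(
        data=["hello sun", "hello moon", "hello sun"],
        words=[["hello", "sun"]],
    )
    print(results)  # [['hello', 'sun', 3, 2]]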