Prabin Bhandari committed on
Commit
d303927
1 Parent(s): 4d57961

Update the module

Files changed (4)
  1. app.py +1 -1
  2. cooccurrence_count.py +54 -50
  3. requirements.txt +3 -1
  4. tests.py +5 -14
app.py CHANGED
@@ -3,4 +3,4 @@ from evaluate.utils import launch_gradio_widget
 
 
 module = evaluate.load("prb977/cooccurrence_count")
-launch_gradio_widget(module)
+launch_gradio_widget(module)
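
For reference, the resulting app.py is the standard four-line launcher for an evaluate Space. A minimal sketch of the full file after this change (the leading `import evaluate` sits outside the hunk and is assumed from the usual Space template):

import evaluate
from evaluate.utils import launch_gradio_widget

module = evaluate.load("prb977/cooccurrence_count")
launch_gradio_widget(module)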
cooccurrence_count.py CHANGED
@@ -1,4 +1,5 @@
-# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+# Copyright 2020 The HuggingFace Datasets Authors and the current
+# dataset script contributor.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,85 +12,88 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""TODO: Add a description here."""
+"""
+Get the co-occurrence count for two words in each sentence in a dataset.
+"""
+
 
 import evaluate
 import datasets
+from sklearn.feature_extraction.text import CountVectorizer
+import numpy as np
 
 
-# TODO: Add BibTeX citation
-_CITATION = """\
-@InProceedings{huggingface:module,
-title = {A great new module},
-authors={huggingface, Inc.},
-year={2020}
-}
-"""
-
-# TODO: Add description of the module here
 _DESCRIPTION = """\
-This new module is designed to solve this great ML task and is crafted with a lot of care.
+Returns the co-occurrence count of two words in the input.
 """
 
+_CITATION = ""
 
-# TODO: Add description of the arguments of the module here
 _KWARGS_DESCRIPTION = """
-Calculates how good are predictions given some references, using certain scores
+Calculates the co-occurrence of two words in each sentence.
 Args:
-    predictions: list of predictions to score. Each predictions
-        should be a string with tokens separated by spaces.
-    references: list of reference for each prediction. Each
-        reference should be a string with tokens separated by spaces.
+    `data`: a list of `str` containing the sentences of a dataset.
+    `word1`: The first word.
+    `word2`: The second word.
 Returns:
-    accuracy: description of the first score,
-    another_score: description of the second score,
+    count: The number of sentences in data.
+    co_occurrence_count: The number of sentences in which word1 and word2 co-occur.
 Examples:
-    Examples should be written in doctest format, and should illustrate how
-    to use the function.
-
-    >>> my_new_module = evaluate.load("my_new_module")
-    >>> results = my_new_module.compute(references=[0, 1], predictions=[0, 1])
+    >>> data = ["hello sun", "hello moon", "hello sun"]
+    >>> c_count = evaluate.load("prb977/cooccurrence_count")
+    >>> results = c_count.compute(data=data, word1='hello', word2='sun')
    >>> print(results)
-    {'accuracy': 1.0}
+    {'count': 3, 'co_occurrence_count': 2}
 """
 
-# TODO: Define external resources urls if needed
-BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
+
+def check_count(x):
+    # 1 when both word columns of the row are positive, i.e. the words co-occur.
+    if (x > 0).all():
+        return 1
+    return 0
 
 
-@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
+@evaluate.utils.file_utils.add_start_docstrings(
+    _DESCRIPTION,
+    _KWARGS_DESCRIPTION
+)
 class CooccurrenceCount(evaluate.Measurement):
-    """TODO: Short description of my evaluation module."""
+    """This measurement returns the co-occurrence count of two words."""
 
     def _info(self):
-        # TODO: Specifies the evaluate.EvaluationModuleInfo object
         return evaluate.MeasurementInfo(
-            # This is the description that will appear on the modules page.
             module_type="measurement",
             description=_DESCRIPTION,
             citation=_CITATION,
             inputs_description=_KWARGS_DESCRIPTION,
             # This defines the format of each prediction and reference
             features=datasets.Features({
-                'predictions': datasets.Value('int64'),
-                'references': datasets.Value('int64'),
+                'data': datasets.Value('string'),
+                'word1': datasets.Value('string'),
+                'word2': datasets.Value('string'),
             }),
-            # Homepage of the module for documentation
-            homepage="http://module.homepage",
-            # Additional links to the codebase or references
-            codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
-            reference_urls=["http://path.to.reference.url/new_module"]
         )
 
-    def _download_and_prepare(self, dl_manager):
-        """Optional: download external resources useful to compute the scores"""
-        # TODO: Download external resources if needed
-        pass
-
-    def _compute(self, predictions, references):
+    def _compute(self, data, word1, word2):
         """Returns the scores"""
-        # TODO: Compute the different scores of the module
-        accuracy = sum(i == j for i, j in zip(predictions, references)) / len(predictions)
+        # Use the longer of the two phrases to pick the n-gram size.
+        len1 = len(word1.split(' '))
+        len2 = len(word2.split(' '))
+        gram = len1 if len1 > len2 else len2
+        v = CountVectorizer(ngram_range=(gram, gram))
+        analyzer = v.build_analyzer()
+        # Fixed vocabulary: column 0 counts word1, column 1 counts word2.
+        vectorizer = CountVectorizer(
+            ngram_range=(gram, gram),
+            vocabulary={
+                analyzer(word1)[0]: 0,
+                analyzer(word2)[0]: 1
+            }
+        )
+        co_occurrences = vectorizer.fit_transform(data)
+        dense_mat = np.asarray(co_occurrences.todense())
+        count = len(dense_mat)
+        co_occurrence_count = np.sum(
+            np.apply_along_axis(check_count, axis=1, arr=dense_mat)
+        )
         return {
-            "accuracy": accuracy,
-        }
+            "count": count,
+            "co_occurrence_count": co_occurrence_count,
+        }
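
The counting trick in `_compute` is easiest to see outside the evaluate wrapper: a CountVectorizer built with a fixed two-entry vocabulary yields one count column per word, and a sentence contributes to the co-occurrence count exactly when both columns are positive. A minimal sketch of the same idea in plain scikit-learn and NumPy (variable names here are illustrative, not from the repo):

from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

data = ["hello sun", "hello moon", "hello sun"]

# Column 0 counts "hello", column 1 counts "sun" in each sentence.
vectorizer = CountVectorizer(ngram_range=(1, 1), vocabulary={"hello": 0, "sun": 1})
counts = np.asarray(vectorizer.fit_transform(data).todense())  # shape (3, 2)

# A sentence co-occurs when both word counts are positive.
co_occurrence_count = int(np.sum(np.all(counts > 0, axis=1)))
print({"count": len(data), "co_occurrence_count": co_occurrence_count})
# {'count': 3, 'co_occurrence_count': 2}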
requirements.txt CHANGED
@@ -1 +1,3 @@
-git+https://github.com/huggingface/evaluate@main
+git+https://github.com/huggingface/evaluate@main
+scikit-learn
+numpy
tests.py CHANGED
@@ -1,17 +1,8 @@
 test_cases = [
     {
-        "predictions": [0, 0],
-        "references": [1, 1],
-        "result": {"metric_score": 0}
+        "data": ["hello sun", "hello moon", "hello sun"],
+        "word1": "hello",
+        "word2": "sun",
+        "result": {"count": 3, "co_occurrence_count": 2}
     },
-    {
-        "predictions": [1, 1],
-        "references": [1, 1],
-        "result": {"metric_score": 1}
-    },
-    {
-        "predictions": [1, 0],
-        "references": [1, 1],
-        "result": {"metric_score": 0.5}
-    }
-]
+]
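
The rewritten test case mirrors the docstring example in cooccurrence_count.py. A sketch of how these cases might be exercised locally (this runner is an assumption, not a file in the repo):

import evaluate

module = evaluate.load("prb977/cooccurrence_count")
for case in test_cases:
    # Assumed harness: compare the module output against the expected result.
    results = module.compute(data=case["data"], word1=case["word1"], word2=case["word2"])
    assert results == case["result"], f"expected {case['result']}, got {results}"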