#  Copyright 2021-present, the Recognai S.L. team.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
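"""Load the somos-alpaca-es dataset into a running Argilla instance.

Expects two positional arguments: an Argilla API key and a LOAD_DATASETS flag
(pass "none" to skip loading datasets).
"""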

import sys
import time

import argilla as rg
import requests
from datasets import load_dataset


class LoadDatasets:
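    """Helper that connects to an Argilla workspace and loads datasets into it."""
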
    def __init__(self, api_key, workspace="team"):
        # Connect to the Argilla instance with the given API key and workspace
        rg.init(api_key=api_key, workspace=workspace)

    @staticmethod
    def load_somos():
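        """Read the somosnlp/somos-alpaca-es dataset from the Hub, log it to
        Argilla, and configure its label schema."""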
        print("Loading somos dataset")
        # Leer el dataset del Hub
        dataset = load_dataset("somosnlp/somos-alpaca-es", split="train")
        dataset = dataset.remove_columns("metrics") # si falla se puede comentar esta linea
        records = rg.DatasetForTextClassification.from_datasets(dataset)

        # Log the dataset
        rg.log(
            records,
            name="somos-alpaca-es",
            tags={"description": "SomosNLP Hackathon dataset"},
        )
        # Configure the label schema used for annotation
        settings = rg.TextClassificationSettings(
            label_schema=["BAD INSTRUCTION", "BAD INPUT", "BAD OUTPUT", "INAPPROPRIATE", "BIASED", "ALL GOOD"]
        )
        rg.configure_dataset(name="somos-alpaca-es", settings=settings, workspace="team")


if __name__ == "__main__":
    API_KEY = sys.argv[1]
    LOAD_DATASETS = sys.argv[2]

    if LOAD_DATASETS.lower() == "none":
        print("No datasets being loaded")
    else:
        # Poll until the Argilla server answers, then load the dataset
        while True:
            try:
                response = requests.get("http://0.0.0.0:6900/")
                if response.status_code == 200:
                    ld = LoadDatasets(API_KEY)
                    ld.load_somos()
                    break

            except requests.exceptions.ConnectionError:
                # Server not reachable yet; retry after the sleep below
                pass
            except Exception as e:
                print(e)
                time.sleep(10)

            time.sleep(5)