File size: 2,867 Bytes
ac6138f
 
 
 
 
 
 
 
 
afff708
ac6138f
afff708
 
ac6138f
 
afff708
ac6138f
 
 
 
 
 
 
 
 
 
500010a
 
 
ac6138f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
afff708
 
 
 
 
 
 
 
 
 
 
 
 
 
ac6138f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import os
import json

from sentence_transformers import SentenceTransformer

# Sentence-embedding model shared by the whole script. It is instantiated at
# module level, so it is loaded as soon as this file is imported or run; the
# __main__ section below calls model.encode() on it.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Values that are never added to any filter set. Built once at import time
# instead of on every loop iteration; a frozenset gives O(1) membership tests.
_EXCLUDED_FILTER_VALUES = frozenset({
    'Boys Love', 'Erotica', 'Girls Love', 'Hentai', 'Ecchi', 'Gore',
    'Crossdressing', 'Magical Sex Shift', 'Rx - Hentai', 'R+ - Mild Nudity',
})


def update_filters(filters, data):
    """Add the filterable values of one record to the running filter sets.

    Args:
        filters: Mapping of field name to the set of values collected so far.
            The sets are mutated in place.
        data: Mapping holding one record. The ``'rating'`` field is treated
            as a single value; every other field is treated as an iterable
            of values (a bare string counts as one value).

    Returns:
        The same ``filters`` mapping that was passed in.
    """
    for key, collected in filters.items():
        raw = data.get(key)
        # A missing, null or empty field contributes nothing. Previously a
        # null field (including a null 'rating', which fell through to the
        # iteration branch) raised TypeError, and a missing one KeyError.
        if not raw:
            continue
        # 'rating' is a single value. A bare string in any other field is
        # likewise one value, not a sequence of characters to iterate over.
        if key == 'rating' or isinstance(raw, str):
            candidates = (raw,)
        else:
            candidates = raw
        for candidate in candidates:
            if candidate and candidate not in _EXCLUDED_FILTER_VALUES:
                collected.add(candidate)
    return filters

def clean_filters(filters):
    """Finalize the filter sets so they can be serialized.

    Each set gains the catch-all ``'ALL'`` entry (the set object itself is
    mutated) and is then replaced in ``filters`` by a list of its members.
    The list order follows set iteration order and is therefore unspecified.

    Returns:
        The same ``filters`` mapping, now holding lists instead of sets.
    """
    # Iterate over a snapshot of the keys while rebinding the values.
    for field in list(filters):
        options = filters[field]
        options.add('ALL')
        filters[field] = [*options]
    return filters

if __name__ == '__main__':
    # Script entry point: read every JSON record in ./anime, embed its
    # metadata and each of its reviews with the module-level `model`, collect
    # the filter values seen, and write everything to ./embeddings/data.json.

    print('Embedding Started')

    # Distinct values seen per filterable field, accumulated across all files.
    filters = {
        'genres': set(),
        'themes': set(),
        'rating': set(),
    }

    embeddings = {}
    # Sorted so the entries are written in the same order on every run.
    for filename in sorted(os.listdir('./anime')):
        # Skip anything that is not a JSON record (hidden OS files,
        # sub-directories, ...); json.load would fail on those.
        if not filename.endswith('.json'):
            continue

        # Explicit UTF-8 so reading does not depend on the platform's
        # default encoding.
        with open(f"./anime/{filename}", 'r', encoding='utf-8') as file:
            data = json.load(file)

        if not data:
            continue

        filters = update_filters(filters, data)

        # Record id is the file name without its extension; splitext only
        # strips the trailing '.json', not an occurrence inside the name.
        anime_id = os.path.splitext(filename)[0]

        data['image'] = f"./images/{anime_id}.jpg"

        # NOTE(review): every ' '.join(...) below assumes the field is a list
        # of strings; if a field such as 'source' is a plain string, join
        # will space-separate its characters - confirm against the data.
        text = f'''
                This anime has {data['episodes']} Episodes |
                This anime premiered on {data['premiered']} |
                This anime was broadcasted on: {data['broadcast']} |
                This anime was produced by {' '.join(data['producers'])} |
                This anime was licensed by Licensors: {' '.join(data['licensors'])} |
                The studios in charge of this anime was {' '.join(data['studios'])} | 
                The source of this anime was {' '.join(data['source'])} |  
                The genres of this anime are {' '.join(data['genres'])} | 
                The themes of this anime are {' '.join(data['themes'])} | 
                The demographic of this anime is {data['demographic']} | 
                The duration of this anime is {data['duration']} | 
                The rating of this anime is {data['rating']} | 
                The description of this anime is {data['description']}'''

        # `data` is a fresh dict for every file, so it is stored directly.
        embeddings[anime_id] = data

        # The metadata vector is wrapped in a one-element list.
        data['objective_embedding'] = [model.encode(text).tolist()]
        # One vector per review, in the same order as data['reviews'].
        data['subjective_embeddings'] = [
            model.encode(review['text']).tolist()
            for review in data['reviews']
        ]

    filters = clean_filters(filters)

    # Make sure the output directory exists before writing into it.
    os.makedirs('./embeddings', exist_ok=True)
    with open('./embeddings/data.json', 'w', encoding='utf-8') as f:
        json.dump({'embeddings':embeddings, 'filters': filters}, f)

    print('Embedding Complete')