How to use this model? Download the pytorch_model.bin file and execute the following:

import pandas as pd
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, RobertaTokenizer, BertModel, BertTokenizer

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Length (in tokens) that every tokenized sentence is padded/truncated to.
MAX_LEN = 128
# Number of sentences passed to the model per forward call in do_predict.
BATCH_SIZE = 20
# Name of the dataframe column that holds the text to classify.
text_col_name = 'sentence'
# NOTE(review): not referenced anywhere in the visible script; presumably the
# label column used at training time -- confirm before removing.
category_col = 'label_text'

# Input: a single dataframe (test_df) with one text column named 'sentence'.
# Call reset_index() first if its index is not the default 0..n-1.
test_df = pd.DataFrame({"sentence":['We are striving to reduce the amount of waste we produce, and to reduce water as well as paper consumption.']})

def scoring_data_prep(dataset, max_len=None, target_device=None):
    """Stack every item's ids and attention mask into two 2-D long tensors.

    Args:
        dataset: Sized, indexable collection whose items are dicts holding
            tensors under the keys 'ids' and 'mask'.
        max_len: Row width each tensor is reshaped to. Defaults to the
            module-level ``MAX_LEN``.
        target_device: Device the stacked tensors are moved to. Defaults to
            the module-level ``device``.

    Returns:
        Tuple ``(ids, mask)`` of ``torch.long`` tensors on ``target_device``,
        each with ``max_len`` columns. For an empty dataset both have zero
        rows.
    """
    if max_len is None:
        max_len = MAX_LEN
    if target_device is None:
        target_device = device

    # Index explicitly instead of iterating: a map-style dataset is only
    # required to provide __len__ and __getitem__. Each item is fetched once.
    records = [dataset[i] for i in range(len(dataset))]

    if not records:
        # Nothing to stack; return empty tensors rather than failing on an
        # unbound local.
        empty_ids = torch.empty((0, max_len), dtype=torch.long, device=target_device)
        empty_mask = torch.empty((0, max_len), dtype=torch.long, device=target_device)
        return empty_ids, empty_mask

    # Concatenate and transfer once, after all rows have been collected
    # (doing this inside the loop re-copies the data on every iteration).
    out_stack = torch.cat([rec['ids'].reshape(-1, max_len) for rec in records], dim=0)
    mask_stack = torch.cat([rec['mask'].reshape(-1, max_len) for rec in records], dim=0)
    out_stack = out_stack.to(target_device, dtype=torch.long)
    mask_stack = mask_stack.to(target_device, dtype=torch.long)

    return out_stack, mask_stack


class Triage(Dataset):
    """
    Map-style torch Dataset that tokenizes one text column of a dataframe.

    Each item is a dict with two ``torch.long`` tensors: "ids" (the token ids)
    and "mask" (the attention mask), both taken from the tokenizer output.
    No target/label is produced.
    """

    def __init__(self, dataframe, tokenizer, max_len, text_col_name):
        """
        Args:
            dataframe: pandas DataFrame containing the text column.
            tokenizer: object exposing ``encode_plus`` (e.g. a Hugging Face
                tokenizer).
            max_len: length every sequence is padded/truncated to.
            text_col_name: name of the column holding the text.
        """
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.text_col_name = text_col_name

    def __getitem__(self, index):
        """Tokenize the row at position ``index`` and return its ids and mask."""
        # Positional lookup so the dataset works for any dataframe index,
        # not only the default 0..n-1 range.
        title = str(self.data[self.text_col_name].iloc[index])
        # Collapse all runs of whitespace into single spaces.
        title = " ".join(title.split())
        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # Replaces the deprecated pad_to_max_length=True.
            padding="max_length",
            return_token_type_ids=True,
            truncation=True,
        )
        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]

        return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
        }

    def __len__(self):
        """Return the number of rows in the dataframe given at construction."""
        return self.len
        
class BERTClass(torch.nn.Module):
    """Pretrained "roberta-base" encoder topped with a small classification head.

    The head is Linear(768, 768) -> ReLU -> Dropout(0.3) -> Linear(768, num_class),
    applied to the encoder's hidden state at sequence position 0.
    """

    def __init__(self, num_class):
        """Build the encoder and the head; ``num_class`` is the output width."""
        super().__init__()
        self.num_class = num_class
        self.l1 = RobertaModel.from_pretrained("roberta-base")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, self.num_class)
        self.history = {}

    def forward(self, input_ids, attention_mask):
        """Return the classifier output for a batch of token ids and masks."""
        encoder_out = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        # Element 0 of the encoder output is the hidden-state tensor; keep
        # only the vector at the first sequence position of every row.
        first_token = encoder_out[0][:, 0]
        features = torch.relu(self.pre_classifier(first_token))
        return self.classifier(self.dropout(features))
        
def do_predict(model, tokenizer, test_df):
    """Predict a class index for every row of ``test_df``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns one row of class scores per input row.
        tokenizer: Tokenizer handed to ``Triage`` to encode the text column.
        test_df: DataFrame holding the text in the ``text_col_name`` column.

    Returns:
        List of ints, one per row in input order: the index of the
        highest-scoring class. Empty list when ``test_df`` has no rows.
    """
    test_set = Triage(test_df, tokenizer, MAX_LEN, text_col_name)
    # shuffle=False keeps predictions aligned with the dataframe's row order.
    test_loader = DataLoader(test_set, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

    batch_scores = []
    model.eval()
    with torch.no_grad():
        # Tokenize and move one batch at a time instead of materialising the
        # whole dataset on the device up front.
        for batch in test_loader:
            input_ids = batch["ids"].to(device, dtype=torch.long)
            attention_mask = batch["mask"].to(device, dtype=torch.long)
            batch_scores.append(model(input_ids, attention_mask))

    if not batch_scores:
        # torch.cat rejects an empty list; no rows means no predictions.
        return []

    scores = torch.cat(batch_scores, dim=0)
    # Only the top class is needed, so argmax replaces a full argsort.
    return torch.argmax(scores, dim=1).cpu().tolist()
  
# Build the 2-class model and restore the fine-tuned weights stored under the
# 'model_state_dict' key of the downloaded checkpoint.
model_sustain = BERTClass(2)
model_sustain.to(device)
# NOTE: torch.load unpickles the file -- only load checkpoints from a trusted source.
model_sustain.load_state_dict(torch.load('pytorch_model.bin', map_location=device)['model_state_dict'])

# The encoder is roberta-base, so the matching RoBERTa tokenizer is required
# (a BERT tokenizer expects a different vocabulary format).
tokenizer_sus = RobertaTokenizer.from_pretrained('roberta-base')
actual_predictions_sus = do_predict(model_sustain, tokenizer_sus, test_df)

# Class index 0 is reported as 'sustainable', any other index as 'unsustainable'.
test_df['sustainability'] = ['sustainable' if i==0 else 'unsustainable' for i in actual_predictions_sus]

Our work can be cited as follows:

@inproceedings{ghosh-2022-finsim-esg,
    title = "Ranking Environment, Social And Governance Related Concepts And Assessing Sustainability Aspect Of Financial Texts",
    author={Ghosh, Sohom and Naskar, Sudip Kumar},
    booktitle = "Proceedings of the Fourth Workshop on Financial Technology and Natural Language Processing (FinNLP@IJCAI-ECAI 2022)",
    month = "July" ,
    year = "2022",
    address = "Vienna, Austria",
    publisher = "-",
    url = "https://mx.nthu.edu.tw/~chungchichen/FinNLP2022_IJCAI/14.pdf",
    pages = "87--92",
}
Downloads last month

-

Downloads are not tracked for this model. How to track
Inference Providers NEW
This model is not currently available via any of the supported Inference Providers.
The model cannot be deployed to the HF Inference API: The model has no library tag.