File size: 10,069 Bytes
2756ab2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5ded1dc
2756ab2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
import re
import ast
import psycopg
from datetime import datetime
import pytz
import streamlit as st
# Pattern for "quoted blocks": a line starting with a double quote followed by
# digits (e.g. "123), anything (including newlines, via DOTALL), up to a line
# ending with a closing double quote. Used with re.MULTILINE | re.DOTALL below.
regex_pattern = r'^(\"\d+)[\s\S]*?(\"$)'

# PostgreSQL credentials pulled from Streamlit's secrets store at import time.
DB_HOST = st.secrets["DB_HOST"]
DB_NAME = st.secrets["DB_NAME"]
DB_USER = st.secrets["DB_USER"]
DB_PASS = st.secrets["DB_PASS"]

def create_connection():
    """Open a PostgreSQL connection using credentials from Streamlit secrets.

    :return: an open psycopg connection, or None if the connection failed
    """
    conn = None
    try:
        conn = psycopg.connect(
            dbname=DB_NAME,
            user=DB_USER,
            password=DB_PASS,
            host=DB_HOST,
            sslmode="require",
        )
    # Bug fix: the original had a bare `except:` that did `print(e)` with `e`
    # undefined, so any connection failure raised NameError instead of being
    # reported. Catch the driver's error type and bind the exception.
    except psycopg.Error as e:
        print(f"Database connection failed: {e}")

    return conn


def check_for_already_present(dict_of_docs):
    """Return the document ids from *dict_of_docs* not yet in stored_results.

    :param dict_of_docs: mapping keyed by document id
    :return: list of ids absent from stored_results, or None on database error
    """
    # sslmode added for consistency with create_connection(), which requires SSL.
    conn = psycopg.connect(dbname=DB_NAME, user=DB_USER, password=DB_PASS,
                           host=DB_HOST, sslmode="require")
    cur = conn.cursor()

    list_of_docs = list(dict_of_docs.keys())

    try:
        found = []
        sql_for_check = "SELECT Doc_Id FROM stored_results WHERE Doc_Id = %s"

        for docid in list_of_docs:
            cur.execute(sql_for_check, (docid,))
            fetch_result = cur.fetchone()
            if fetch_result:
                found.append(fetch_result[0])  # Doc_Id exists in stored_results

        # Set difference: requested ids minus the ones already stored.
        return list(set(list_of_docs) - set(found))

    except psycopg.DatabaseError as error:
        print(f"Database operation failed: {error}")
        return None

    finally:
        # Always release the connection, even on error.
        cur.close()
        conn.close()


def create_task(conn, task):
    """Upsert one document row into both 'tasks' and 'stored_results'.

    On a Doc_ID conflict the existing row is overwritten with the new values.

    :param conn: open psycopg database connection
    :param task: tuple (Doc_ID, Title, Doc_Text, Doc_Blockquotes, Doc_Size)
    :return: None
    """
    sql_task = '''
    INSERT INTO tasks(Doc_ID, Title, Doc_Text, Doc_Blockquotes, Doc_Size)
    VALUES (%s, %s, %s, %s, %s)
    ON CONFLICT (Doc_ID) DO UPDATE SET
        Title = EXCLUDED.Title,
        Doc_Text = EXCLUDED.Doc_Text,
        Doc_Blockquotes = EXCLUDED.Doc_Blockquotes,
        Doc_Size = EXCLUDED.Doc_Size;
    '''

    sql_results = '''
    INSERT INTO stored_results(Doc_ID, Title, Doc_Text, Doc_Blockquotes, Doc_Size)
    VALUES (%s, %s, %s, %s, %s)
    ON CONFLICT (Doc_ID) DO UPDATE SET
        Title = EXCLUDED.Title,
        Doc_Text = EXCLUDED.Doc_Text,
        Doc_Blockquotes = EXCLUDED.Doc_Blockquotes,
        Doc_Size = EXCLUDED.Doc_Size;
    '''

    try:
        # Context manager closes the cursor automatically.
        with conn.cursor() as cur:
            for statement in (sql_task, sql_results):
                cur.execute(statement, task)
            conn.commit()
    except Exception as exc:
        print(f"An error occurred: {exc}")
        conn.rollback()  # undo the partial upsert on failure
    finally:
        print("Database operation at create_task() completed.")


def retrieve_text(conn, query):
    """Scan every row of 'tasks' and collect rows that contain quoted text or blockquotes.

    A row is kept when any string column matches the module-level
    ``regex_pattern`` (quoted-number blocks) or when its Doc_Blockquotes
    column (index 3, a stringified Python list) is non-empty. The result is
    also dumped to "Matching_rows_Format.txt" as a debug artifact.

    :param conn: open psycopg database connection
    :param query: search query string (currently unused; kept for interface
        compatibility with the commented-out per-query matching path)
    :return: list of dicts with keys Title, DocID, matching_columns, matching_indents
    """
    sql_query = "SELECT * FROM tasks"

    matching_rows_json = []

    print("Checking for matching rows")

    # Bug fix: the cursor was previously created bare and only closed on the
    # success path; a context manager guarantees cleanup on any exception.
    with conn.cursor() as cursor:
        cursor.execute(sql_query)
        rows = cursor.fetchall()

        for row in rows:
            has_matching_text = False
            has_matching_indent = False
            title = None
            doc_id = None
            matching_text = []
            matching_indents = []

            # (index, cell) pairs; column positions follow the tasks schema:
            # 0 = Doc_Id, 1 = Title, 3 = Doc_Blockquotes.
            for i, column_value in enumerate(row):
                # Any string column matching the quoted-block pattern counts;
                # if several columns match, the last one's matches win (as before).
                if isinstance(column_value, str) and re.search(
                        regex_pattern, column_value, re.MULTILINE | re.DOTALL):
                    has_matching_text = True
                    matching_text = find_matching_text(column_value)

                if i == 1:
                    title = column_value
                elif i == 0:
                    doc_id = column_value
                elif i == 3:
                    # Doc_Blockquotes is stored as the repr of a Python list.
                    matching_indent_list = ast.literal_eval(column_value)
                    if len(matching_indent_list) == 0:
                        has_matching_indent = False
                    else:
                        has_matching_indent = True
                        matching_indents = list(matching_indent_list)

            if has_matching_text or has_matching_indent:
                matching_rows_json.append({
                    "Title": title,
                    "DocID": doc_id,
                    "matching_columns": matching_text,
                    "matching_indents": matching_indents,
                })

    data_dict = matching_rows_json

    # Debug artifact: persist the collected rows for manual inspection.
    with open("Matching_rows_Format.txt", "w") as file:
        file.write(str(data_dict))

    print("Generated matching rows and file with matching rows")
    return data_dict


def add_stored_results(conn, lst):
    """Copy selected documents from 'stored_results' into 'tasks'.

    :param conn: open psycopg database connection
    :param lst: iterable of document ids to transfer
    :return: total number of rows inserted, or None if an error occurred
    """
    sql = '''INSERT INTO tasks (Doc_Id, Title, Doc_Text, Doc_Blockquotes, Doc_Size)
             SELECT Doc_Id, Title, Doc_Text, Doc_Blockquotes, Doc_Size
             FROM stored_results
             WHERE Doc_Id = %s;'''

    try:
        inserted = 0
        with conn.cursor() as cur:
            for docid in lst:
                cur.execute(sql, (docid,))
                # Bug fix: the original returned cur.rowcount after the loop,
                # which reflects only the LAST execute — not the documented
                # total. Accumulate per-statement counts instead.
                inserted += cur.rowcount
            conn.commit()
            print("Stored data transferred to tasks.")
            return inserted
    except Exception as e:
        print(f"An error occurred: {e}")
        conn.rollback()  # Rollback on error
        return None
    finally:
        print("Operation - adding stored data completed.")


# Example Usage
# Assume 'conn' is a psycopg2 connection object
# document_ids = [123, 456, 789]
# result = add_stored_results(conn, document_ids)
# if result is not None:
#     print(f"Transferred {result} records.")


def find_matching_text_with_query(column_value, query):
    """Return one copy of *query* for each regex match in *column_value* that contains it."""
    matching_text_with_query = [
        query
        for hit in re.finditer(regex_pattern, column_value, re.MULTILINE | re.DOTALL)
        if query in hit.group()
    ]
    print("Matching text with query is: ", matching_text_with_query)
    return matching_text_with_query

def find_matching_text(column_value):
    """Return every full match of the module-level regex found in *column_value*."""
    found = [
        hit.group()
        for hit in re.finditer(regex_pattern, column_value, re.MULTILINE | re.DOTALL)
    ]
    print("Matching text is: ", found)
    return found

def delete_sql_records(conn):
    """Delete all rows from the 'tasks' table.

    Deliberately does NOT commit: callers (see main()) run this inside a
    ``with conn:`` transaction block, so the delete commits or rolls back
    together with the subsequent inserts.

    :param conn: open psycopg database connection
    """
    delete_records = "DELETE FROM tasks"

    # Fix: the cursor was previously created bare and never closed.
    with conn.cursor() as cur:
        cur.execute(delete_records)

    print("Deleted records")

def add_classified_results(dict_of_results, searchquery):
    """Persist classified results and log the search query with an IST timestamp.

    Inserts one row per result into 'classified_index', then records the
    query and the current Asia/Kolkata time into 'search_queries', all in
    one transaction.

    :param dict_of_results: iterable of result dicts with keys DocID, Title,
        matching_indents, matching_columns, and the *_after_classification pair
    :param searchquery: the query string that produced these results
    """
    conn = create_connection()

    # SQL query for classified_index table
    sql_query_classified = '''INSERT INTO classified_index(Doc_Id, Title, searchquery, matching_indents, matching_columns, matching_columns_after_classification, matching_indents_after_classification)
                   VALUES (%s, %s, %s, %s, %s, %s, %s)'''

    # SQL query for the search_queries log table
    sql_query_search_queries = '''INSERT INTO search_queries(searchquery, dateandtime)
                                 VALUES (%s, %s)'''

    # Current date and time in IST, formatted for the dateandtime column.
    ist = pytz.timezone('Asia/Kolkata')
    current_datetime_ist = datetime.now(ist).strftime('%Y-%m-%d %H:%M:%S')

    with conn:
        # Fix: the cursor was previously created bare and never closed.
        with conn.cursor() as cur:
            for result in dict_of_results:
                cur.execute(sql_query_classified, (
                    result['DocID'], result['Title'], searchquery,
                    result['matching_indents'], result['matching_columns'],
                    result['matching_columns_after_classification'],
                    result['matching_indents_after_classification']))

            # Log the query itself alongside the classified rows.
            cur.execute(sql_query_search_queries, (searchquery, current_datetime_ist))

            conn.commit()


def main(list_of_docs_already_present, lst_new_data, query):
    """Rebuild the 'tasks' table from stored and new documents, then retrieve matches.

    Clears 'tasks', copies the already-stored documents back in, upserts the
    new documents, and finally runs the text retrieval — all in one
    ``with conn:`` transaction.

    :param list_of_docs_already_present: ids to copy over from stored_results
    :param lst_new_data: mapping of doc id -> field dict for new documents
    :param query: search query forwarded to retrieve_text
    :return: list of matching-row dicts from retrieve_text
    """
    conn = create_connection()
    with conn:
        delete_sql_records(conn)
        # Each subdict's values form one (Doc_ID, Title, ...) task tuple.
        rows_to_insert = [list(entry.values()) for entry in lst_new_data.values()]
        add_stored_results(conn, list_of_docs_already_present)
        for new_task in rows_to_insert:
            create_task(conn, new_task)

        results = retrieve_text(conn, query)

    return results
        


if __name__ == '__main__':
    # Bug fix: main() requires (list_of_docs_already_present, lst_new_data,
    # query), so the previous bare main() call always raised TypeError.
    # This module is driven by the Streamlit app; fail with a clear message.
    raise SystemExit("This module is meant to be imported; main() requires "
                     "(list_of_docs_already_present, lst_new_data, query).")