hkhwilwh committed b00edcf
Parent(s): fcefbe5

Add files with Git LFS

Files changed:
- .devcontainer/devcontainer.json +33 -0
- .gitignore +8 -0
- __pycache__/agents.cpython-310.pyc +0 -0
- __pycache__/agents.cpython-312.pyc +0 -0
- __pycache__/config.cpython-310.pyc +0 -0
- __pycache__/config.cpython-312.pyc +0 -0
- __pycache__/document_exporter.cpython-310.pyc +0 -0
- __pycache__/pdf_processor.cpython-310.pyc +0 -0
- __pycache__/translator.cpython-310.pyc +0 -0
- __pycache__/utils.cpython-310.pyc +0 -0
- __pycache__/utils.cpython-312.pyc +0 -0
- agents.py +93 -0
- app.py +329 -0
- changes.tar.gz +3 -0
- config.py +31 -0
- document_exporter.py +125 -0
- fonts/NotoNaskhArabic-Regular.ttf +0 -0
- fonts/NotoNaskhArabic.zip +3 -0
- hassan_github +49 -0
- hassan_github.pub +1 -0
- pdf_processor.py +318 -0
- requirements.txt +27 -0
- style.css +51 -0
- translator.py +222 -0
- utils.py +51 -0
.devcontainer/devcontainer.json
ADDED
@@ -0,0 +1,33 @@
{
  "name": "Python 3",
  // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
  "image": "mcr.microsoft.com/devcontainers/python:1-3.11-bullseye",
  "customizations": {
    "codespaces": {
      "openFiles": [
        "README.md",
        "app.py"
      ]
    },
    "vscode": {
      "settings": {},
      "extensions": [
        "ms-python.python",
        "ms-python.vscode-pylance"
      ]
    }
  },
  "updateContentCommand": "[ -f packages.txt ] && sudo apt update && sudo apt upgrade -y && sudo xargs apt install -y <packages.txt; [ -f requirements.txt ] && pip3 install --user -r requirements.txt; pip3 install --user streamlit; echo '✅ Packages installed and Requirements met'",
  "postAttachCommand": {
    "server": "streamlit run app.py --server.enableCORS false --server.enableXsrfProtection false"
  },
  "portsAttributes": {
    "8501": {
      "label": "Application",
      "onAutoForward": "openPreview"
    }
  },
  "forwardPorts": [
    8501
  ]
}
.gitignore
ADDED
@@ -0,0 +1,8 @@
# Environment files
.env

# Python virtual environment
venv/

# Logs
*.log
__pycache__/agents.cpython-310.pyc
ADDED
Binary file (4.21 kB)

__pycache__/agents.cpython-312.pyc
ADDED
Binary file (4.62 kB)

__pycache__/config.cpython-310.pyc
ADDED
Binary file (939 Bytes)

__pycache__/config.cpython-312.pyc
ADDED
Binary file (1.01 kB)

__pycache__/document_exporter.cpython-310.pyc
ADDED
Binary file (2.94 kB)

__pycache__/pdf_processor.cpython-310.pyc
ADDED
Binary file (5.41 kB)

__pycache__/translator.cpython-310.pyc
ADDED
Binary file (3.32 kB)

__pycache__/utils.cpython-310.pyc
ADDED
Binary file (2.29 kB)

__pycache__/utils.cpython-312.pyc
ADDED
Binary file (2.85 kB)
agents.py
ADDED
@@ -0,0 +1,93 @@
from crewai import Agent
from langchain.tools import Tool
from utils import create_uae_legal_tools, is_arabic
from config import LEGAL_CATEGORIES
from dotenv import load_dotenv
import os

# Load environment variables
load_dotenv()

# Validate API key
if not os.getenv('OPENAI_API_KEY'):
    raise ValueError("OpenAI API key not found. Please set OPENAI_API_KEY in your environment variables.")

# Common LLM configuration
BASE_LLM_CONFIG = {
    "config_list": [
        {
            "model": "gpt-4-1106-preview",  # Using the latest GPT-4 Turbo model
            "api_key": os.getenv('OPENAI_API_KEY'),
            "temperature": 0.3,  # Lower temperature for more consistent outputs
            "max_tokens": 4000,
            "presence_penalty": 0.0,
            "frequency_penalty": 0.0,
            "response_format": {"type": "text"}
        }
    ]
}

# Configuration for summarization tasks
SUMMARY_LLM_CONFIG = {
    "config_list": [
        {
            "model": "gpt-4-1106-preview",
            "api_key": os.getenv('OPENAI_API_KEY'),
            "temperature": 0.2,  # Even lower temperature for summaries
            "max_tokens": 4000,
            "presence_penalty": 0.0,
            "frequency_penalty": 0.3,  # Reduce repetition in summaries
            "response_format": {"type": "text"}
        }
    ]
}

def create_judge_agent():
    return Agent(
        role='قاضي قانوني إماراتي',
        goal='تقديم أحكام وتفسيرات قانونية دقيقة بناءً على القانون الإماراتي',
        backstory="""
        أنت قاضٍ متمرس في النظام القانوني الإماراتي مع خبرة تزيد عن 20 عاماً
        ومعرفة عميقة بالقوانين واللوائح والسوابق القانونية الإماراتية.
        دورك هو تحليل القضايا وتقديم أحكام عادلة ومسببة بناءً على القانون الإماراتي،
        مع التركيز على تطبيق أحدث التشريعات والأحكام القضائية.
        يمنع الرد على اي استفسار غير قانوني او خاص بغير المواضيع القانونية في دولة الامارات العربية المتحدة.
        يرجى التأكد من أن جميع الردود على أسئلتي تستند إلى مصادر موثوقة، مع تضمين الاستشهادات والروابط المناسبة لتلك المصادر. أفضل الإجابات التفصيلية والمنظمة جيدًا والتي لا تعالج استفساري فحسب، بل توفر أيضًا سياقًا أو رؤى إضافية عند الاقتضاء. كن واضحًا وموجزًا، وإذا كان موضوع معين لا يؤثر بشكل مباشر على أهدافي أو دراستي، فيرجى إبلاغي بذلك. اذكر أيضًا المراجع في نهاية المقال
        """,
        verbose=True,
        allow_delegation=False,
        llm_config=BASE_LLM_CONFIG,
        tools=create_uae_legal_tools()
    )

def create_advocate_agent():
    return Agent(
        role='محامي إماراتي',
        goal='تقديم التمثيل القانوني والمشورة المتخصصة بناءً على القانون الإماراتي',
        backstory="""
        أنت محامٍ ماهر في الإمارات العربية المتحدة مع خبرة 15 عاماً في مختلف
        مجالات القانون الإماراتي. تخصصت في قضايا المحاكم الاتحادية والمحلية،
        ولديك سجل حافل في تمثيل العملاء بنجاح. دورك هو تقديم المشورة القانونية
        الدقيقة وضمان حماية حقوق العملاء وفقاً للقانون الإماراتي.
        """,
        verbose=True,
        allow_delegation=False,
        llm_config=BASE_LLM_CONFIG,
        tools=create_uae_legal_tools()
    )

def create_consultant_agent():
    return Agent(
        role='مستشار قضائي إماراتي',
        goal='تقديم الاستشارات والتوجيه القانوني المتخصص في القانون الإماراتي',
        backstory="""
        أنت مستشار قضائي متمرس مع خبرة 18 عاماً ومعرفة شاملة بالنظام القانوني
        والإجراءات القضائية في الإمارات العربية المتحدة. تخصصت في تقديم الاستشارات
        للمؤسسات والأفراد، مع التركيز على الحلول العملية والوقائية. دورك هو تقديم
        التوجيه الاستراتيجي والمشورة المتخصصة في المسائل القانونية المعقدة.
        """,
        verbose=True,
        allow_delegation=False,
        llm_config=BASE_LLM_CONFIG,
        tools=create_uae_legal_tools()
    )
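Editor's usage sketch (not part of this commit): the factory functions above are meant to be paired with a crewAI Task and Crew, as app.py and pdf_processor.py do. The sample question, the expected_output text, and the availability of OPENAI_API_KEY and the crewai package are assumptions here.

# Usage sketch: run one question through the judge agent (assumes OPENAI_API_KEY is set)
from crewai import Task, Crew
from agents import create_judge_agent

judge = create_judge_agent()
task = Task(
    description="ما هي مدة الإشعار القانونية لإنهاء عقد عمل غير محدد المدة؟",  # sample question (hypothetical)
    agent=judge,
    expected_output="رأي قانوني مستند إلى القانون الإماراتي",
)
crew = Crew(agents=[judge], tasks=[task])
print(crew.kickoff())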
app.py
ADDED
@@ -0,0 +1,329 @@
import streamlit as st
from agents import create_judge_agent, create_advocate_agent, create_consultant_agent
from crewai import Task, Crew
from utils import is_arabic, format_legal_response
from config import LEGAL_CATEGORIES, DEFAULT_LANGUAGE

st.set_page_config(page_title="المساعد القانوني الإماراتي", layout="wide")

# Load custom CSS
with open('style.css') as f:
    st.markdown(f'<style>{f.read()}</style>', unsafe_allow_html=True)

st.title("المساعد القانوني الإماراتي")
st.write("احصل على المساعدة القانونية من خبراء قانونيين إماراتيين مدعومين بالذكاء الاصطناعي")

# Add imports
from pdf_processor import PDFProcessor
from document_exporter import DocumentExporter
from translator import Translator

# Initialize components
if 'pdf_processor' not in st.session_state:
    st.session_state.pdf_processor = PDFProcessor()
if 'document_exporter' not in st.session_state:
    st.session_state.document_exporter = DocumentExporter()
if 'translator' not in st.session_state:
    st.session_state.translator = Translator()

# Create a new tab for PDF upload
tab1, tab2, tab3, tab4 = st.tabs(["تحليل المستندات", "القاضي", "المحامي", "المستشار"])

# PDF Upload Tab
with tab1:
    st.header("تحليل المستندات القانونية")

    # Add service selection toggle
    service_type = st.radio(
        "اختر نوع الخدمة / Select Service",
        ["تلخيص وتحليل المستند", "ترجمة المستند"],
        horizontal=True
    )

    if service_type == "ترجمة المستند":
        target_language = st.selectbox(
            "اختر لغة الترجمة / Select Target Language",
            ["العربية", "English", "中文", "हिंदी", "اردو"],
            index=1
        )

    uploaded_file = st.file_uploader("قم بتحميل ملف PDF للتحليل", type=['pdf'])

    if uploaded_file is not None:
        # Check file size
        file_size = len(uploaded_file.getvalue()) / (1024 * 1024)  # Convert to MB
        if file_size > 20:  # 20MB limit
            st.error("حجم الملف كبير جداً. الحد الأقصى المسموح به هو 20 ميجابايت.")
            st.stop()

        if service_type == "تلخيص وتحليل المستند":
            # Create progress bar
            progress_bar = st.progress(0)
            status_text = st.empty()

            def update_progress(message, progress):
                status_text.text(message)
                progress_bar.progress(progress)

            st.session_state.pdf_processor.set_progress_callback(update_progress)

            try:
                # Process the uploaded PDF
                results = st.session_state.pdf_processor.process_document(uploaded_file.read())

                # Display results in collapsible sections
                with st.expander("ملخص المستند", expanded=True):
                    st.write(results["summary"])

                with st.expander("تحليل المخالفات القانونية", expanded=True):
                    st.markdown(results["legal_analysis"], unsafe_allow_html=True)

                with st.expander("الخريطة التشريعية", expanded=True):
                    st.markdown(results["legislation_mapping"], unsafe_allow_html=True)

                # Add export buttons in a container
                st.markdown("### تحميل التحليل")
                export_container = st.container()

                col1, col2 = export_container.columns(2)
                with col1:
                    pdf_button = st.download_button(
                        label="تحميل كملف PDF",
                        data=st.session_state.document_exporter.export_to_pdf(results),
                        file_name="legal_analysis.pdf",
                        mime="application/pdf",
                        key="pdf_download"
                    )

                with col2:
                    word_button = st.download_button(
                        label="تحميل كملف Word",
                        data=st.session_state.document_exporter.export_to_word(results),
                        file_name="legal_analysis.docx",
                        mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                        key="word_download"
                    )

            except ValueError as ve:
                st.error(f"خطأ في المدخلات: {str(ve)}")
            except Exception as e:
                st.error(f"حدث خطأ غير متوقع: {str(e)}")
                st.error("يرجى المحاولة مرة أخرى أو الاتصال بالدعم الفني")
            finally:
                # Clear progress bar and status
                progress_bar.empty()
                status_text.empty()

        else:  # Translation service
            with st.spinner("جاري تحليل المستند..."):
                try:
                    # Extract text from PDF
                    text = st.session_state.pdf_processor.extract_text_from_pdf(uploaded_file.read())

                    if not text.strip():
                        st.error("لم يتم العثور على نص قابل للقراءة في المستند")
                        st.stop()

                    # Detect source language
                    source_lang = st.session_state.translator.detect_language(text)
                    st.info(f"تم اكتشاف لغة المستند: {st.session_state.translator.get_language_name(source_lang)}")

                    # Map language names to codes
                    lang_map = {
                        "العربية": "arabic",
                        "English": "english",
                        "中文": "chinese",
                        "हिंदी": "hindi",
                        "اردو": "urdu"
                    }

                    target_lang = lang_map[target_language]

                    # Check if source and target are the same
                    if source_lang == target_lang:
                        st.warning("لغة المصدر ولغة الهدف متطابقتان. يرجى اختيار لغة مختلفة للترجمة.")
                        st.stop()

                    with st.spinner("جاري الترجمة..."):
                        # Preprocess and translate the text
                        processed_text = st.session_state.translator.preprocess_text(text)
                        translated_text = st.session_state.translator.translate(
                            processed_text,
                            source_lang,
                            target_lang
                        )

                        # Display results
                        col1, col2 = st.columns(2)
                        with col1:
                            st.subheader("النص الأصلي / Original Text")
                            st.text_area("", value=text, height=300, key="original_text")

                        with col2:
                            st.subheader("النص المترجم / Translated Text")
                            st.text_area("", value=translated_text, height=300, key="translated_text")

                        # Add download buttons
                        st.markdown("### تحميل الترجمة")
                        download_col1, download_col2 = st.columns(2)

                        with download_col1:
                            st.download_button(
                                label="تحميل النص المترجم",
                                data=translated_text.encode(),
                                file_name=f"translated_document.txt",
                                mime="text/plain",
                                key="translation_download"
                            )

                        with download_col2:
                            # Create a simple HTML file with both texts
                            html_content = f"""
                            <html dir="auto">
                            <head>
                                <meta charset="UTF-8">
                                <style>
                                    body {{ font-family: Arial, sans-serif; margin: 20px; }}
                                    .text-container {{ margin-bottom: 20px; }}
                                    h2 {{ color: #2c3e50; }}
                                </style>
                            </head>
                            <body>
                                <div class="text-container">
                                    <h2>Original Text</h2>
                                    <p>{text}</p>
                                </div>
                                <div class="text-container">
                                    <h2>Translated Text</h2>
                                    <p>{translated_text}</p>
                                </div>
                            </body>
                            </html>
                            """

                            st.download_button(
                                label="تحميل النصين معاً (HTML)",
                                data=html_content.encode(),
                                file_name="translation_with_original.html",
                                mime="text/html",
                                key="html_download"
                            )

                except ValueError as ve:
                    st.error(f"خطأ في المدخلات: {str(ve)}")
                except Exception as e:
                    st.error(f"حدث خطأ غير متوقع: {str(e)}")
                    st.error("يرجى المحاولة مرة أخرى أو الاتصال بالدعم الفني")

# Language selector
language = st.sidebar.selectbox(
    "اختر اللغة / Select Language",
    ["العربية", "English"],
    index=0
)

# Legal category selector
selected_category = st.sidebar.selectbox(
    "اختر الفئة القانونية / Select Legal Category",
    list(LEGAL_CATEGORIES.values()),
    index=0
)

# Initialize session state for chat history
if 'chat_history' not in st.session_state:
    st.session_state.chat_history = []

# Create tabs for different agents
tab1, tab2, tab3 = st.tabs(["القاضي", "المحامي", "المستشار"])

def get_agent_response(agent, query, category):
    # Prepare the task with context
    task_description = f"""
    تحليل والرد على الاستفسار التالي في مجال {category}:
    {query}

    يجب أن يكون الرد:
    1. مستنداً إلى القانون الإماراتي
    2. مدعوماً بالمراجع القانونية
    3. واضحاً ومفهوماً
    4. متوافقاً مع أحدث التشريعات
    """

    task = Task(
        description=task_description,
        agent=agent,
        expected_output="تحليل قانوني ورد بناءً على القانون الإماراتي"
    )

    crew = Crew(
        agents=[agent],
        tasks=[task]
    )

    result = crew.kickoff()
    return format_legal_response(result, 'ar' if is_arabic(query) else 'en')

# Judge Tab
with tab2:
    st.header("استشارة القاضي الإماراتي")
    judge_query = st.text_area("اكتب سؤالك القانوني للقاضي:", key="judge_input", placeholder="أدخل النص هنا...")
    st.markdown(
        """
        <style>
        .element-container textarea {
            direction: rtl;
            text-align: right;
        }
        </style>
        """,
        unsafe_allow_html=True
    )
    if st.button("الحصول على رأي القاضي", key="judge_button"):
        if judge_query:
            with st.spinner("القاضي يحلل قضيتك..."):
                judge_agent = create_judge_agent()
                response = get_agent_response(judge_agent, judge_query, selected_category)
                st.session_state.chat_history.append(("القاضي", judge_query, response))
                st.write("رد القاضي:")
                st.markdown(response, unsafe_allow_html=True)

# Advocate Tab
with tab3:
    st.header("استشارة المحامي الإماراتي")
    advocate_query = st.text_area("اكتب سؤالك القانوني للمحامي:", key="advocate_input", placeholder="أدخل النص هنا...")
    st.markdown(
        """
        <style>
        .element-container textarea {
            direction: rtl;
            text-align: right;
        }
        </style>
        """,
        unsafe_allow_html=True
    )
    if st.button("الحصول على رأي المحامي", key="advocate_button"):
        if advocate_query:
            with st.spinner("المحامي يحلل قضيتك..."):
                advocate_agent = create_advocate_agent()
                response = get_agent_response(advocate_agent, advocate_query, selected_category)
                st.session_state.chat_history.append(("المحامي", advocate_query, response))
                st.write("رد المحامي:")
                st.markdown(response, unsafe_allow_html=True)

# Consultant Tab
with tab4:
    st.header("استشارة المستشار القضائي الإماراتي")
    consultant_query = st.text_area("اكتب سؤالك القانوني للمستشار:", key="consultant_input", placeholder="أدخل النص هنا...")
    st.markdown(
        """
        <style>
        .element-container textarea {
            direction: rtl;
            text-align: right;
        }
        </style>
        """,
        unsafe_allow_html=True
    )
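Editor's note and sketch (not part of this commit): the committed app.py stops after the consultant tab's styling block, so that tab has no button handler in this revision. A hypothetical continuation, mirroring the judge and advocate tabs above, would look like this:

# Hypothetical continuation of the consultant tab (not in the committed file)
    if st.button("الحصول على رأي المستشار", key="consultant_button"):
        if consultant_query:
            with st.spinner("المستشار يحلل قضيتك..."):
                consultant_agent = create_consultant_agent()
                response = get_agent_response(consultant_agent, consultant_query, selected_category)
                st.session_state.chat_history.append(("المستشار", consultant_query, response))
                st.write("رد المستشار:")
                st.markdown(response, unsafe_allow_html=True)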
changes.tar.gz
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:acc5c62af6ddd4cd86812758b4f2f6c999eddbfb0e3a6e3d2ff2412ad9c991ab
size 5186
config.py
ADDED
@@ -0,0 +1,31 @@
import os
from dotenv import load_dotenv

load_dotenv()

# OpenAI Configuration
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

# Language Settings
DEFAULT_LANGUAGE = 'ar'  # Arabic by default
SUPPORTED_LANGUAGES = ['ar', 'en']

# UAE Legal Resources
UAE_LEGAL_DOMAINS = [
    'https://elaws.moj.gov.ae',
    'https://www.mohre.gov.ae',
    'https://www.dm.gov.ae',
    'https://www.adjd.gov.ae',
    'https://www.dc.gov.ae'
]

# Legal Categories
LEGAL_CATEGORIES = {
    'civil': 'القانون المدني',
    'criminal': 'القانون الجنائي',
    'commercial': 'القانون التجاري',
    'labor': 'قانون العمل',
    'family': 'قانون الأسرة',
    'property': 'قانون العقارات'
}
document_exporter.py
ADDED
@@ -0,0 +1,125 @@
try:
    from docx import Document
except ImportError:
    from docx.api import Document
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
import io
import arabic_reshaper
from bidi.algorithm import get_display

class DocumentExporter:
    def __init__(self):
        # Register Arabic font for PDF
        try:
            pdfmetrics.registerFont(TTFont('Arabic', 'fonts/NotoNaskhArabic-Regular.ttf'))
        except:
            pass  # Fall back to default font if Arabic font is not available

    def export_to_pdf(self, content: dict) -> bytes:
        """Export the analysis results to PDF format."""
        buffer = io.BytesIO()
        c = canvas.Canvas(buffer, pagesize=letter)

        # Set font for Arabic text
        try:
            c.setFont('Arabic', 14)
        except:
            c.setFont('Helvetica', 14)

        y = 750  # Starting y position

        # Add title
        title = "تحليل المستند القانوني"
        title = get_display(arabic_reshaper.reshape(title))
        c.drawString(500, y, title)
        y -= 30

        # Add summary
        summary_title = "ملخص المستند"
        summary_title = get_display(arabic_reshaper.reshape(summary_title))
        c.drawString(500, y, summary_title)
        y -= 20

        summary_text = get_display(arabic_reshaper.reshape(content['summary']))
        # Wrap text to fit page width
        words = summary_text.split()
        line = ""
        for word in words:
            if c.stringWidth(line + word, 'Arabic', 12) < 500:
                line += word + " "
            else:
                c.drawString(50, y, line)
                y -= 15
                line = word + " "
        if line:
            c.drawString(50, y, line)
            y -= 30

        # Add legal analysis
        analysis_title = "تحليل المخالفات القانونية"
        analysis_title = get_display(arabic_reshaper.reshape(analysis_title))
        c.drawString(500, y, analysis_title)
        y -= 20

        analysis_text = get_display(arabic_reshaper.reshape(content['legal_analysis']))
        words = analysis_text.split()
        line = ""
        for word in words:
            if c.stringWidth(line + word, 'Arabic', 12) < 500:
                line += word + " "
            else:
                c.drawString(50, y, line)
                y -= 15
                line = word + " "
        if line:
            c.drawString(50, y, line)
            y -= 30

        # Add legislation mapping
        mapping_title = "الخريطة التشريعية"
        mapping_title = get_display(arabic_reshaper.reshape(mapping_title))
        c.drawString(500, y, mapping_title)
        y -= 20

        mapping_text = get_display(arabic_reshaper.reshape(content['legislation_mapping']))
        words = mapping_text.split()
        line = ""
        for word in words:
            if c.stringWidth(line + word, 'Arabic', 12) < 500:
                line += word + " "
            else:
                c.drawString(50, y, line)
                y -= 15
                line = word + " "
        if line:
            c.drawString(50, y, line)

        c.save()
        return buffer.getvalue()

    def export_to_word(self, content: dict) -> bytes:
        """Export the analysis results to Word format."""
        doc = Document()

        # Add title
        doc.add_heading("تحليل المستند القانوني", 0)

        # Add summary section
        doc.add_heading("ملخص المستند", level=1)
        doc.add_paragraph(content['summary'])

        # Add legal analysis section
        doc.add_heading("تحليل المخالفات القانونية", level=1)
        doc.add_paragraph(content['legal_analysis'])

        # Add legislation mapping section
        doc.add_heading("الخريطة التشريعية", level=1)
        doc.add_paragraph(content['legislation_mapping'])

        # Save to bytes
        buffer = io.BytesIO()
        doc.save(buffer)
        return buffer.getvalue()
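Editor's usage sketch (not part of this commit): DocumentExporter expects a dict with the three keys that PDFProcessor.process_document produces. The content strings and output file names below are placeholders.

# Usage sketch: export placeholder analysis results to PDF and Word files
from document_exporter import DocumentExporter

exporter = DocumentExporter()
results = {  # placeholder content; normally produced by PDFProcessor.process_document
    "summary": "ملخص تجريبي للمستند",
    "legal_analysis": "تحليل تجريبي للمخالفات",
    "legislation_mapping": "خريطة تشريعية تجريبية",
}

with open("legal_analysis.pdf", "wb") as f:
    f.write(exporter.export_to_pdf(results))
with open("legal_analysis.docx", "wb") as f:
    f.write(exporter.export_to_word(results))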
fonts/NotoNaskhArabic-Regular.ttf
ADDED
Binary file (178 kB)
fonts/NotoNaskhArabic.zip
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0e9aa38292c3aef5a8e3bd28b30945638d49bbc028517c71c645d3bd90a957b0
size 21347
hassan_github
ADDED
@@ -0,0 +1,49 @@
-----BEGIN OPENSSH PRIVATE KEY-----
b3BlbnNzaC1rZXktdjEAAAAABG5vbmUAAAAEbm9uZQAAAAAAAAABAAACFwAAAAdzc2gtcn
NhAAAAAwEAAQAAAgEAsFKJwQWnqkffXdE3djKsicWXO2zUwKsdyMCcpQNpHsjm8tp8Gzne
EaES6wrDV98UEA4KZyl3PsLcIrJE2srKaVpoaYe1UpJlVhTmo+oILmH0kt4321q1w6YQMT
2SB6yfc2XnhhwRtHCzPhGFygI8tdPcwP4HPJJwQUZSt8wVGROLphrXBSrvnZNF3yAJa+4o
/J+bG6qLFvXqZnjdiOe1wY596bRs5KeuHM41cyRbmPtUlSPzaEy3Non7rTp0s+sL6pmIPY
gUQ5fdMy8svlJndjIJg348nHfW7DZUHSaMfQgCOXOhwTbY/mEFwfM0nz5IWKPr+kjliMkO
GHKizPhqBTwlCVPFw2sifV7305ON+yUptYczksjpv1dlw6SlUxxyTKkTbnV7n7YiNxwbTa
LkHfD283zFH5ziMkU1fuiGh9NPzPE/sa80dR6EwnB0XLBojyJGVD39jJ4X4wb2G+MJik7U
fgP22najOSSoK5xuFbW50rRlUp6n30bgrH4f/kiytQUAUhtB7YIt1YU9VUf/Z6S4ab1SL5
w+BLVekZVnigr/lYCTzTo+tIvkEVCbbiGAaof6EO5Zs4q7D22zXPm0E9eZb6d2V8/vPWPA
+gIjaTPseqEsnxj+2b6ewScPbYkHnI0oomTQwWIRAfrxpi+iyKc23mf4q6nAjyvRKXSVWD
UAAAdQVunz01bp89MAAAAHc3NoLXJzYQAAAgEAsFKJwQWnqkffXdE3djKsicWXO2zUwKsd
yMCcpQNpHsjm8tp8GzneEaES6wrDV98UEA4KZyl3PsLcIrJE2srKaVpoaYe1UpJlVhTmo+
oILmH0kt4321q1w6YQMT2SB6yfc2XnhhwRtHCzPhGFygI8tdPcwP4HPJJwQUZSt8wVGROL
phrXBSrvnZNF3yAJa+4o/J+bG6qLFvXqZnjdiOe1wY596bRs5KeuHM41cyRbmPtUlSPzaE
y3Non7rTp0s+sL6pmIPYgUQ5fdMy8svlJndjIJg348nHfW7DZUHSaMfQgCOXOhwTbY/mEF
wfM0nz5IWKPr+kjliMkOGHKizPhqBTwlCVPFw2sifV7305ON+yUptYczksjpv1dlw6SlUx
xyTKkTbnV7n7YiNxwbTaLkHfD283zFH5ziMkU1fuiGh9NPzPE/sa80dR6EwnB0XLBojyJG
VD39jJ4X4wb2G+MJik7UfgP22najOSSoK5xuFbW50rRlUp6n30bgrH4f/kiytQUAUhtB7Y
It1YU9VUf/Z6S4ab1SL5w+BLVekZVnigr/lYCTzTo+tIvkEVCbbiGAaof6EO5Zs4q7D22z
XPm0E9eZb6d2V8/vPWPA+gIjaTPseqEsnxj+2b6ewScPbYkHnI0oomTQwWIRAfrxpi+iyK
c23mf4q6nAjyvRKXSVWDUAAAADAQABAAACAAu9AvS5tqbMcB9jzUhuKTRm1iGbpjJJcgsq
X4NQzc/B2jYyu25olNMhoQvKxKR18nT7KlAh35FrEZKxwYm7VGxdG3RjF3wuyNZJP+2LqA
3GcazRZHCTAmTLCmrsyWr/YAIjt50jAz66/gPU0M5ZBcepGhozDzJGIXkHAHzB9mmb9oER
al0qtZcM9erbzCTGTy46Re46lVXq+zblNwJlQqFnJhTH8TPrwdijFcXblsmBeekP4qV/7f
aQixPGP3y22i09GWfXRloVoyFEM7tb6w1gWWfKoKheBg3ltAXYyiOMw/ElNJCTYTDWLrbx
xQAQ+moER7J65eUMVYblItd4dj5UpCoprRsLjujXETT92N0FNm1H5TH8tw3Yuiwvkisetc
yo+kbe3JXD7UWNcW3y09zZ6cga/wc8KK6yiK+hd4zw4+rMB1LB/gtT8PmBHXUA4MvvQTvv
rlLFR6NYDIR8+V0UmOfXKyQu14pUyReHwv6WOEtM5SfzIZonXV/uICcle+k+OjNs5btO5V
oaDNWV9rqA+6ec0nPC/vMUlxlTg+kM2KX9lhwP7li8OzwberUcu7s1CxKuBJfFKXyrV+Yo
ZRHurlHSrlGkJH5KlA2TvixZjiF2CoRwJ6opFJMNyTNJkT7G7iFHBXX0rEsiwg2HqPzEOe
x9AnLUPWmiIkW6tyjlAAABADgR9qHep88KTbiyj+YgCVapkrCGQ7K9MAqjUf32mZhcWwv+
iDZn6AF5UTq4BuKgOt8snwShWnUHjC6HyxdTfAyC2U2Xjjc/hkL6st1V5b1B2iCweaKG4j
rnGUMqqvDrIA9H3lDfxNeKpfW9VympD4gtWkbXMQ9WqrSLihG5UzJOWvdXAVW3+sDDRXJY
kAqhBqA7pDYSFHOgNVJhEkKR/qnHI1CZn4Ts+OqSMwBssMTuwHvcfKWCgYlUBBQgD06vu6
+q9SSl8k15+b6vfu5Nk38j0E+Z3oqW4ZrBu+a1B5BelzGYud+b+X5yii19yXVoPAfvOcZb
wqqhnwDV3H6CHowAAAEBAODjL55u3FZsrmO0WakDDXpkBRTJF0LQnm6sl/D+TRuSTtURu3
KKm/LAT673heS90c3PP5Ja6aFwwZ+BTtqlX51CI/8kmqOWm25I03NXUfexbajm7tNEDHXt
SR2Db4HlzBK4Rv5f0ORQj9rrBOvaF4xkUl2P4l+Cm3McErVRNAB9E6Kvhpq6/OrzUOGudy
owYeC37CdjfGdVE73lP305hdu1bPdsHQkupB6PzG8QtfzvqGc2LPWS1BsgGWiHD/w8dqRO
WnY8xycZE5M8dIXPuPN6VNQGkVVUmREb0UcKwXnCaDwfSGxed1F2N60rggtK4BlZcvZSK8
mQK2xaRQGB9lsAAAEBAMi3VGhXgXmiXRghB8opmA4zUwRfOUjlwTj9akO87Wz1nvag8+Xt
3j1dtb7CECIGobLuUTyrcbPdPB7JeibsVxEYposQqIqZ9YBWXrN6ciVzyFfq+SqEyKTizg
ir2aXbZLLeo5EP40H0PiDCPMKeiK5bkrGE0+tEK4mhUdG4tlqSS2quG+GgGXNW2799wX6p
vH9Ir1B1CDNmvIm9n6CG70+FIvKlr0AeSHyWJDc389ktTqsa3QrzPmMIookKpoLhnvmo0P
P1qLB02rdTZrFxUGRxvGkb0O/1dK3SBfl35+wIsKOZF/O1iE+5fRWNU6CHSv+ng5A5UHwE
J+0Afmn80K8AAAAYaGFzc2FuLmtod2lsZWhAZ21haWwuY29tAQID
-----END OPENSSH PRIVATE KEY-----
hassan_github.pub
ADDED
@@ -0,0 +1 @@
ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQCwUonBBaeqR99d0Td2MqyJxZc7bNTAqx3IwJylA2keyOby2nwbOd4RoRLrCsNX3xQQDgpnKXc+wtwiskTaysppWmhph7VSkmVWFOaj6gguYfSS3jfbWrXDphAxPZIHrJ9zZeeGHBG0cLM+EYXKAjy109zA/gc8knBBRlK3zBUZE4umGtcFKu+dk0XfIAlr7ij8n5sbqosW9epmeN2I57XBjn3ptGzkp64czjVzJFuY+1SVI/NoTLc2ifutOnSz6wvqmYg9iBRDl90zLyy+Umd2MgmDfjycd9bsNlQdJox9CAI5c6HBNtj+YQXB8zSfPkhYo+v6SOWIyQ4YcqLM+GoFPCUJU8XDayJ9XvfTk437JSm1hzOSyOm/V2XDpKVTHHJMqRNudXuftiI3HBtNouQd8PbzfMUfnOIyRTV+6IaH00/M8T+xrzR1HoTCcHRcsGiPIkZUPf2MnhfjBvYb4wmKTtR+A/badqM5JKgrnG4VtbnStGVSnqffRuCsfh/+SLK1BQBSG0Htgi3VhT1VR/9npLhpvVIvnD4EtV6RlWeKCv+VgJPNOj60i+QRUJtuIYBqh/oQ7lmzirsPbbNc+bQT15lvp3ZXz+89Y8D6AiNpM+x6oSyfGP7Zvp7BJw9tiQecjSiiZNDBYhEB+vGmL6LIpzbeZ/irqcCPK9EpdJVYNQ== [email protected]
pdf_processor.py
ADDED
@@ -0,0 +1,318 @@
import PyPDF2
import pytesseract
from pdf2image import convert_from_bytes
import arabic_reshaper
from bidi.algorithm import get_display
from transformers import pipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter
import io
import os
import re     # used by _clean_text and the summary helpers (missing in the committed file)
import torch  # used for the device/memory handling below (missing in the committed file)
from typing import List, Dict
from agents import create_judge_agent, create_advocate_agent
from crewai import Task, Crew

class PDFProcessor:
    def __init__(self):
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,  # Reduced chunk size for better memory management
            chunk_overlap=50,
            length_function=len,
            separators=["\n\n", "\n", " ", ""]
        )
        # Initialize models with better memory management
        self.summarizer = pipeline(
            "summarization",
            model="facebook/bart-large-cnn",
            device_map="auto",  # Automatically choose best device
            torch_dtype=torch.float32,  # Use float32 for better memory efficiency
            batch_size=1  # Process one chunk at a time
        )
        self.progress_callback = None

        # Configure torch for memory efficiency
        if torch.backends.mps.is_available():  # For Mac M1/M2
            torch.backends.mps.set_per_process_memory_fraction(0.7)  # Use only 70% of available memory
        elif torch.cuda.is_available():  # For CUDA devices
            torch.cuda.empty_cache()
            torch.cuda.set_per_process_memory_fraction(0.7)

    def set_progress_callback(self, callback):
        """Set a callback function to report progress."""
        self.progress_callback = callback

    def update_progress(self, message: str, progress: float):
        """Update progress through callback if available."""
        if self.progress_callback:
            self.progress_callback(message, progress)

    def extract_text_from_pdf(self, pdf_bytes: bytes) -> str:
        """Extract text from PDF, handling both searchable and scanned PDFs with improved accuracy."""
        text = ""
        try:
            # Try to extract text directly first using PyPDF2
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
            extracted_text = []

            for page in pdf_reader.pages:
                page_text = page.extract_text()
                if page_text.strip():
                    extracted_text.append(page_text)

            # If direct extraction yielded results, process it
            if extracted_text:
                text = "\n\n".join(extracted_text)
            else:
                # If no text was extracted, use OCR with improved settings
                images = convert_from_bytes(pdf_bytes, dpi=300)  # Higher DPI for better quality
                for image in images:
                    # Configure tesseract for better Arabic text recognition
                    custom_config = r'--oem 1 --psm 3 -l ara+eng'
                    page_text = pytesseract.image_to_string(
                        image,
                        config=custom_config,
                        lang='ara+eng'
                    )
                    if page_text.strip():
                        extracted_text.append(page_text)

                text = "\n\n".join(extracted_text)

            # Clean up the text
            text = self._clean_text(text)

            # Handle Arabic text with improved reshaping
            text = self._process_arabic_text(text)

        except Exception as e:
            raise Exception(f"Error processing PDF: {str(e)}")

        return text

    def _clean_text(self, text: str) -> str:
        """Clean and normalize extracted text."""
        # Remove control characters
        text = "".join(char for char in text if char.isprintable() or char in "\n\r\t")

        # Normalize whitespace
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'\n\s*\n', '\n\n', text)

        # Fix common OCR issues
        text = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', text)
        text = re.sub(r'([.!?])\s*(?=[A-Z])', r'\1\n', text)

        # Remove empty lines and extra whitespace
        lines = [line.strip() for line in text.split('\n')]
        text = '\n'.join(line for line in lines if line)

        return text.strip()

    def _process_arabic_text(self, text: str) -> str:
        """Process Arabic text with improved handling."""
        try:
            # Configure arabic-reshaper for better text handling
            configuration = {
                'delete_harakat': False,
                'support_ligatures': True,
                'RIAL SIGN': True
            }

            # Reshape Arabic text
            reshaped_text = arabic_reshaper.reshape(text, configuration=configuration)

            # Apply bidirectional algorithm
            text = get_display(reshaped_text)

            # Fix common Arabic text issues
            text = re.sub(r'([ء-ي])\s+([ء-ي])', r'\1\2', text)  # Remove spaces between Arabic letters
            text = re.sub(r'[\u200B-\u200F\u202A-\u202E]', '', text)  # Remove Unicode control characters

            return text
        except Exception as e:
            print(f"Warning: Error in Arabic text processing: {str(e)}")
            return text  # Return original text if processing fails

    def summarize_document(self, text: str) -> str:
        """Generate a summary of the document with improved memory management."""
        try:
            # Split text into smaller chunks
            chunks = self.text_splitter.split_text(text)
            summaries = []

            # Process chunks in batches to manage memory
            batch_size = 3  # Process 3 chunks at a time
            for i in range(0, len(chunks), batch_size):
                # Clear GPU/MPS memory before processing new batch
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                elif torch.backends.mps.is_available():
                    # Force garbage collection for MPS
                    import gc
                    gc.collect()

                batch = chunks[i:i + batch_size]
                for chunk in batch:
                    try:
                        # Generate summary with controlled length and parameters
                        summary = self.summarizer(
                            chunk,
                            max_length=130,
                            min_length=30,
                            do_sample=False,
                            num_beams=2,  # Reduced beam search for memory efficiency
                            early_stopping=True
                        )
                        summaries.append(summary[0]['summary_text'])
                    except Exception as e:
                        print(f"Warning: Error summarizing chunk: {str(e)}")
                        # If summarization fails, include a portion of the original text
                        summaries.append(chunk[:200] + "...")

                # Update progress
                self.update_progress(
                    "جاري تلخيص المستند...",
                    min(0.3 + (i / len(chunks)) * 0.4, 0.7)
                )

            # Combine summaries intelligently
            final_summary = " ".join(summaries)

            # Clean up the final summary
            final_summary = self._clean_text(final_summary)
            final_summary = self._process_arabic_text(final_summary)

            return final_summary

        except Exception as e:
            print(f"Error in summarization: {str(e)}")
            # Fallback to a simple extractive summary
            return self._create_extractive_summary(text)

    def _create_extractive_summary(self, text: str, sentences_count: int = 5) -> str:
        """Create a simple extractive summary as a fallback method."""
        try:
            # Split text into sentences
            sentences = re.split(r'[.!?]\s+', text)

            # Remove very short sentences and clean
            sentences = [s.strip() for s in sentences if len(s.strip()) > 30]

            if not sentences:
                return text[:500] + "..."  # Return truncated text if no good sentences

            # Score sentences based on position and length
            scored_sentences = []
            for i, sentence in enumerate(sentences):
                score = 0
                # Prefer sentences from the beginning and end of the document
                if i < len(sentences) * 0.3:  # First 30%
                    score += 2
                elif i > len(sentences) * 0.7:  # Last 30%
                    score += 1

                # Prefer medium-length sentences
                if 50 <= len(sentence) <= 200:
                    score += 1

                scored_sentences.append((score, sentence))

            # Sort by score and select top sentences
            scored_sentences.sort(reverse=True)
            selected_sentences = [s[1] for s in scored_sentences[:sentences_count]]

            # Sort sentences by their original order
            selected_sentences.sort(key=lambda s: sentences.index(s))

            # Join sentences and clean
            summary = ". ".join(selected_sentences)
            summary = self._clean_text(summary)
            summary = self._process_arabic_text(summary)

            return summary

        except Exception as e:
            print(f"Error in extractive summary: {str(e)}")
            return text[:500] + "..."  # Return truncated text as last resort

    def analyze_legal_issues(self, text: str) -> Dict:
        """Analyze legal issues in the document using the Judge agent."""
        judge_agent = create_judge_agent()

        task_description = f"""
        تحليل المستند التالي وتحديد المخالفات القانونية المحتملة وفقاً للقوانين الإماراتية:
        {text}

        يجب أن يتضمن التحليل:
        1. المخالفات القانونية المحتملة
        2. المواد القانونية ذات الصلة
        3. التوصيات للتصحيح
        """

        task = Task(
            description=task_description,
            agent=judge_agent,
            expected_output="تحليل قانوني شامل للمخالفات والتوصيات"
        )

        crew = Crew(agents=[judge_agent], tasks=[task])
        result = crew.kickoff()
        return {"legal_analysis": result}

    def map_to_uae_legislation(self, text: str) -> Dict:
        """Map document content to relevant UAE laws and regulations."""
        advocate_agent = create_advocate_agent()

        task_description = f"""
        تحليل المستند التالي وربطه بالقوانين والتشريعات الإماراتية ذات الصلة:
        {text}

        يجب أن يتضمن التحليل:
        1. القوانين الإماراتية ذات الصلة
        2. المواد القانونية المحددة
        3. التفسير القانوني للعلاقة
        """

        task = Task(
            description=task_description,
            agent=advocate_agent,
            expected_output="خريطة تفصيلية للقوانين والتشريعات ذات الصلة"
        )

        crew = Crew(agents=[advocate_agent], tasks=[task])
        result = crew.kickoff()
        return {"legislation_mapping": result}

    def process_document(self, pdf_bytes: bytes) -> Dict:
        """Process the document through all steps with progress tracking."""
        try:
            # Extract text from PDF
            self.update_progress("استخراج النص من المستند...", 0.1)
            text = self.extract_text_from_pdf(pdf_bytes)

            if not text.strip():
                raise ValueError("لم يتم العثور على نص قابل للقراءة في المستند")

            # Generate summary
            self.update_progress("إنشاء ملخص للمستند...", 0.3)
            summary = self.summarize_document(text)

            # Analyze legal issues
            self.update_progress("تحليل القضايا القانونية...", 0.5)
            legal_analysis = self.analyze_legal_issues(text)

            # Map to UAE legislation
            self.update_progress("ربط المستند بالتشريعات الإماراتية...", 0.7)
            legislation_mapping = self.map_to_uae_legislation(text)

            self.update_progress("اكتمل التحليل!", 1.0)

            return {
                "summary": summary,
                "legal_analysis": legal_analysis["legal_analysis"],
                "legislation_mapping": legislation_mapping["legislation_mapping"],
                "raw_text": text  # Include raw text for translation if needed
            }

        except Exception as e:
            self.update_progress(f"حدث خطأ: {str(e)}", 0)
            raise
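Editor's usage sketch (not part of this commit): running PDFProcessor outside Streamlit. The sample.pdf path is a placeholder, and the sketch assumes the Tesseract and poppler binaries for the OCR fallback plus enough memory for the BART summarizer and the crewAI agents.

# Usage sketch: run the full pipeline on a local PDF and print progress to stdout
from pdf_processor import PDFProcessor

processor = PDFProcessor()
processor.set_progress_callback(lambda message, progress: print(f"{progress:.0%} {message}"))

with open("sample.pdf", "rb") as f:  # sample.pdf is a placeholder path
    results = processor.process_document(f.read())

print(results["summary"])
print(results["legal_analysis"])
print(results["legislation_mapping"])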
requirements.txt
ADDED
@@ -0,0 +1,27 @@
# Core app dependencies
streamlit>=1.20.0
fastapi>=0.100.0,<1.0.0
langchain>=0.94.0,<0.95.0
openai>=0.27.0
chromadb==0.4.24

# PDF and OCR processing
PyPDF2>=3.0.0
pytesseract
pdf2image
reportlab

# NLP and Transformers
transformers>=4.30.0,<5.0.0
torch>=2.0.0,<3.0.0
sentencepiece
sacremoses
langdetect

# Arabic text handling
arabic-reshaper
python-bidi
python-docx

# SQLite utilities
sqlite-utils
style.css
ADDED
@@ -0,0 +1,51 @@
/* Arabic font and RTL support */
@import url('https://fonts.googleapis.com/css2?family=Cairo:wght@400;700&display=swap');

.rtl-text {
    direction: rtl;
    text-align: right;
    font-family: 'Cairo', sans-serif;
}

/* Custom styling for the app */
.stApp {
    font-family: 'Cairo', sans-serif;
}

.stTextArea {
    direction: rtl;
}

.stButton button {
    font-family: 'Cairo', sans-serif;
    direction: rtl;
}

.stTab {
    font-family: 'Cairo', sans-serif;
}

/* Legal response formatting */
.legal-response {
    background-color: #f8f9fa;
    border-radius: 5px;
    padding: 15px;
    margin: 10px 0;
    border-right: 4px solid #2e7d32;
}

.legal-reference {
    color: #1976d2;
    font-weight: bold;
}

/* Category badges */
.category-badge {
    background-color: #e3f2fd;
    color: #1976d2;
    padding: 5px 10px;
    border-radius: 15px;
    font-size: 0.8em;
    margin: 5px;
    display: inline-block;
}
translator.py
ADDED
@@ -0,0 +1,222 @@
from transformers import MarianMTModel, MarianTokenizer, pipeline
import torch
from langdetect import detect
import re

class Translator:
    def __init__(self):
        self.models = {}
        self.tokenizers = {}
        self.language_codes = {
            'arabic': 'ar',
            'english': 'en',
            'chinese': 'zh',
            'hindi': 'hi',
            'urdu': 'ur'
        }

        # Initialize models for each language pair
        self._load_model('en', 'ar')  # English to Arabic
        self._load_model('ar', 'en')  # Arabic to English
        # Add other language pairs as needed

    def _load_model(self, src_lang, tgt_lang):
        """Load translation model for a specific language pair."""
        model_name = f'Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}'
        key = f'{src_lang}-{tgt_lang}'

        if key not in self.models:
            try:
                self.tokenizers[key] = MarianTokenizer.from_pretrained(model_name)
                self.models[key] = MarianMTModel.from_pretrained(model_name)
            except Exception as e:
                print(f"Error loading model for {key}: {str(e)}")

    def translate(self, text: str, source_lang: str, target_lang: str) -> str:
        """Translate text from source language to target language with improved handling."""
        src_code = self.language_codes.get(source_lang.lower())
        tgt_code = self.language_codes.get(target_lang.lower())

        if not src_code or not tgt_code:
            raise ValueError("Unsupported language")

        key = f'{src_code}-{tgt_code}'

        if key not in self.models:
            self._load_model(src_code, tgt_code)

        if key not in self.models:
            raise ValueError(f"Translation model not available for {source_lang} to {target_lang}")

        tokenizer = self.tokenizers[key]
        model = self.models[key]

        try:
            # Preprocess text
            text = self.preprocess_text(text)

            # Split text into manageable chunks
            chunks = self._split_text_into_chunks(text)
            translated_chunks = []

            for chunk in chunks:
                # Clear GPU memory
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

                # Tokenize with improved settings
                inputs = tokenizer(
                    chunk,
                    return_tensors="pt",
                    padding=True,
                    truncation=True,
                    max_length=512,
                    add_special_tokens=True
                )

                # Generate translation with improved settings
                with torch.no_grad():
                    translated = model.generate(
                        **inputs,
                        num_beams=2,  # Reduced for memory efficiency
                        length_penalty=0.6,
                        max_length=512,
                        min_length=0,
                        early_stopping=True
                    )

                # Decode the translation
                result = tokenizer.batch_decode(translated, skip_special_tokens=True)[0]
                translated_chunks.append(result)

            # Combine chunks
            final_translation = ' '.join(translated_chunks)

            # Post-process translation
            final_translation = self._post_process_translation(final_translation, target_lang)

            return final_translation

        except Exception as e:
            print(f"Translation error: {str(e)}")
            return text  # Return original text if translation fails

    def detect_language(self, text: str) -> str:
        """Detect the language of the input text."""
        try:
            # Clean text for better detection
            cleaned_text = re.sub(r'[^\w\s]', '', text)
            detected = detect(cleaned_text)

            # Map detected language code to our supported languages
            lang_code_map = {
                'ar': 'arabic',
                'en': 'english',
                'zh': 'chinese',
                'hi': 'hindi',
                'ur': 'urdu'
            }

            return lang_code_map.get(detected, 'english')  # Default to English if unknown
        except:
            return 'english'  # Default to English if detection fails

    def preprocess_text(self, text: str) -> str:
        """Preprocess text before translation."""
        # Remove excessive whitespace
        text = re.sub(r'\s+', ' ', text).strip()

        # Remove special characters that might interfere with translation
        text = re.sub(r'[^\w\s\.,!?-]', '', text)

        return text

    def get_supported_languages(self):
        """Return list of supported languages."""
        return list(self.language_codes.keys())

    def _split_text_into_chunks(self, text: str, max_chunk_size: int = 450) -> list:
        """Split text into manageable chunks for translation."""
        # First try to split by paragraphs
        paragraphs = text.split('\n\n')
        chunks = []
        current_chunk = []
        current_length = 0

        for para in paragraphs:
            # If a single paragraph is too long, split it by sentences
            if len(para) > max_chunk_size:
                sentences = re.split(r'([.!?])\s+', para)
                i = 0
                while i < len(sentences):
                    sentence = sentences[i]
                    if i + 1 < len(sentences):
                        sentence += sentences[i + 1]  # Add back the punctuation
                        i += 2
                    else:
                        i += 1

                    if current_length + len(sentence) > max_chunk_size:
                        if current_chunk:
                            chunks.append(' '.join(current_chunk))
                            current_chunk = []
                            current_length = 0

                    current_chunk.append(sentence)
                    current_length += len(sentence)
            else:
                if current_length + len(para) > max_chunk_size:
                    chunks.append(' '.join(current_chunk))
                    current_chunk = []
                    current_length = 0

                current_chunk.append(para)
                current_length += len(para)

        if current_chunk:
            chunks.append(' '.join(current_chunk))

        return chunks

    def _post_process_translation(self, text: str, target_lang: str) -> str:
        """Post-process translated text based on target language."""
        if target_lang.lower() in ['arabic', 'ar']:
            # Fix Arabic-specific issues
            text = re.sub(r'([ء-ي])\s+([ء-ي])', r'\1\2', text)  # Remove spaces between Arabic letters
            text = re.sub(r'[\u200B-\u200F\u202A-\u202E]', '', text)  # Remove Unicode control characters

            # Fix common Arabic punctuation issues
            text = text.replace('،,', '،')
            text = text.replace('.,', '.')
            text = text.replace('؟?', '؟')
            text = text.replace('!!', '!')

            # Ensure proper spacing around numbers and Latin text
            text = re.sub(r'([0-9])([ء-ي])', r'\1 \2', text)
            text = re.sub(r'([ء-ي])([0-9])', r'\1 \2', text)
            text = re.sub(r'([a-zA-Z])([ء-ي])', r'\1 \2', text)
            text = re.sub(r'([ء-ي])([a-zA-Z])', r'\1 \2', text)

        elif target_lang.lower() in ['english', 'en']:
            # Fix English-specific issues
            text = re.sub(r'\s+([.,!?])', r'\1', text)  # Fix spacing before punctuation
            text = re.sub(r'([.,!?])(?=[^\s])', r'\1 ', text)  # Fix spacing after punctuation
            text = re.sub(r'\s+', ' ', text)  # Normalize spaces
            text = text.replace(' ,', ',')
            text = text.replace(' .', '.')

            # Capitalize first letter of sentences
            text = '. '.join(s.capitalize() for s in text.split('. '))

        return text.strip()

    def get_language_name(self, code: str) -> str:
        """Get the display name for a language code."""
        names = {
            'ar': 'العربية',
            'en': 'English',
            'zh': '中文',
            'hi': 'हिंदी',
            'ur': 'اردو'
        }
        return names.get(code, code)
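Editor's usage sketch (not part of this commit): Translator downloads the Helsinki-NLP Marian models on first use, and __init__ only preloads the en-ar and ar-en pairs. The sample sentence below is hypothetical.

# Usage sketch: detect the language of a snippet and translate it to Arabic
from translator import Translator

translator = Translator()
text = "The employment contract must specify the notice period."  # sample text (hypothetical)
source = translator.detect_language(text)            # expected to return 'english'
translated = translator.translate(text, source, "arabic")
print(translator.get_language_name("ar"), translated)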
utils.py
ADDED
@@ -0,0 +1,51 @@
from typing import List, Dict
import re
from langchain.tools import Tool
from config import UAE_LEGAL_DOMAINS

def is_arabic(text: str) -> bool:
    """Check if the text contains Arabic characters."""
    arabic_pattern = re.compile('[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF]+')
    return bool(arabic_pattern.search(text))

def create_uae_legal_tools() -> List[Tool]:
    """Create tools for UAE legal research."""
    tools = [
        Tool(
            name="UAE Legal Database Search",
            func=lambda q: search_uae_legal_database(q),
            description="Search UAE legal databases for laws, regulations, and precedents"
        ),
        Tool(
            name="Arabic Legal Term Translation",
            func=lambda q: translate_legal_term(q),
            description="Translate legal terms between Arabic and English"
        ),
        Tool(
            name="UAE Case Law Search",
            func=lambda q: search_uae_case_law(q),
            description="Search UAE case law and legal precedents"
        )
    ]
    return tools

def search_uae_legal_database(query: str) -> str:
    """Simulate searching UAE legal databases."""
    # In a real implementation, this would connect to actual UAE legal databases
    return f"Found relevant UAE legal information for: {query}"

def translate_legal_term(term: str) -> str:
    """Simulate legal term translation."""
    # In a real implementation, this would use a legal terms dictionary
    return f"Translation for: {term}"

def search_uae_case_law(query: str) -> str:
    """Simulate searching UAE case law."""
    # In a real implementation, this would search actual UAE case law databases
    return f"Found relevant UAE case law for: {query}"

def format_legal_response(response: str, language: str = 'ar') -> str:
    """Format legal responses with proper styling and language direction."""
    if language == 'ar':
        return f'<div dir="rtl">{response}</div>'
    return response
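Editor's usage sketch (not part of this commit): the helpers above in action. Note that the three search/translation tools are stubs that only echo the query back; the sample query is hypothetical.

# Usage sketch: wrap a response for RTL display and exercise the stub tools
from utils import create_uae_legal_tools, is_arabic, format_legal_response

query = "ما هي شروط فسخ عقد الإيجار؟"  # sample query (hypothetical)
lang = "ar" if is_arabic(query) else "en"
print(format_legal_response("هذا رد تجريبي", lang))  # wraps the text in <div dir="rtl">...</div>

for tool in create_uae_legal_tools():
    print(tool.name, "->", tool.func(query))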