from markdownify import markdownify as md
from bs4 import BeautifulSoup as BS
from urllib.parse import urljoin
from newspaper import Article
import re
import markdown


def clean(s):
    """Escape tabs and newlines so a merged section stays on a single line."""
    s = s.replace("\t", "\\t")
    s = s.replace("\n", "\\n")
    return s


class DocTree:
    """Wraps the nested-list section tree produced by split_by_heading()."""

    def __init__(self, content):
        self.content = content
        self.max_depth = 6  # HTML defines heading levels <h1> through <h6>

    def get_sections(self, *location_ids):
        """Walk down the tree, one index per heading level."""
        out = self.content
        for id_ in location_ids:
            out = out[id_]
        return out

    def merge_sections(self, elems):
        """Recursively flatten a subtree into a single text block."""
        if not isinstance(elems[0], list):
            return '\n\n '.join(elems)
        out = []
        for e in elems:
            out.append(self.merge_sections(e))
        return '\n\n '.join(map(clean, out))

    def get_merged_sections(self, *location_ids):
        return [self.merge_sections(s) for s in self.get_sections(*location_ids)]

    def as_markdown(self, content):
        return md(content)

    def get_sections_by_depth(self, depth):
        return self._get_sections_by_depth(self.content, depth)

    @staticmethod
    def _get_sections_by_depth(content, depth):
        """Returns a list of the subtrees found at a specific depth"""
        if depth == 0:
            return content
        out = []
        for elem in content:
            out += DocTree._get_sections_by_depth(elem, depth - 1)
        return out


def fix_relative_links(url, article_content):
    """Rewrite root-relative markdown links in article_content as absolute URLs."""
    if url.startswith('http'):
        base_url = '/'.join(url.split('/')[:3])  # scheme://host
    else:
        base_url = url.split('/')[0]  # bare host; original passed a list here, which urljoin rejects
    pat = re.compile(r'\[(.*?)\]\((.*?)\)', flags=re.IGNORECASE)
    res = pat.findall(article_content)
    if res:
        for g in res:
            abs_url = urljoin(base_url, g[1]) if g[1].startswith('/') else g[1]
            article_content = article_content.replace(f'[{g[0]}]({g[1]})',
                                                      f'[{g[0]}]({abs_url})')
    else:
        print('not found')
    return article_content


def extract_article(url):
    article = Article(url)
    article.download()
    article.parse()
    return article


def select_content(html_code, elem_class, class_name):
    """Extract one element (by '.class' or '#id' selector) and convert it to markdown."""
    print(f'Calling select_content with {elem_class}, {class_name}')
    kwargs = {}
    if class_name.startswith('.'):
        class_name = class_name[1:]
        kwargs = {'class_': class_name}
    elif class_name.startswith('#'):
        kwargs = {'id': class_name[1:]}
    return md(str(BS(html_code, features="lxml").find(**kwargs)))


def split_by_heading(html_content, _i):
    """Recursively split HTML on <h1>..<h6> tags into nested lists of fragments."""
    if _i >= 7:
        return html_content
    elems = []
    for idx, elem in enumerate(html_content.split(f'<h{_i}')):
        if idx > 0 or elem.startswith('>'):
            # the split consumed the '<h{_i}' prefix; restore it
            elem = f'<h{_i}{elem}'
        # assumption: the (truncated) remainder of this loop recursed into finer
        # heading levels, yielding the nested lists that DocTree consumes
        elems.append(split_by_heading(elem, _i + 1))
    return elems
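

# Usage sketch (illustrative only, not part of the original module): the URL
# below is a placeholder, and the depth value assumes the fetched page actually
# contains nested <h1>/<h2> headings.
if __name__ == '__main__':
    page_url = 'https://example.com/posts/some-article'  # placeholder URL
    article = extract_article(page_url)
    # Build the section tree from the raw HTML, starting at <h1>.
    tree = DocTree(split_by_heading(article.html, 1))
    # Merge every level-2 subtree into a cleaned, single-line text block.
    for section in tree.get_sections_by_depth(2):
        print(tree.merge_sections(section))
    # Convert the whole page to markdown and absolutize its root-relative links.
    print(fix_relative_links(page_url, md(article.html))[:500])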