Ypy is deprecated and unmaintained, and we have problems parsing existing documents with it. We replace it with pycrdt, an actively maintained library that does not have the issues we had with Ypy.
77 lines
2.3 KiB
Python
77 lines
2.3 KiB
Python
"""Utils for the core app."""
|
|
|
|
import base64
|
|
import re
|
|
|
|
import pycrdt
|
|
from bs4 import BeautifulSoup
|
|
|
|
from core import enums
|
|
|
|
|
|
def filter_descendants(paths, root_paths, skip_sorting=False):
    """
    Filter paths to keep only those that are descendants of any path in root_paths.

    A path is considered a descendant of a root path if it starts with the root path
    (so a path equal to a root path is kept as well).

    The algorithm walks both sequences in lexicographic order with a single cursor
    over `root_paths`. Unless `skip_sorting` is True, sorted copies of both inputs
    are built first; the objects passed in by the caller are never modified.

    Args:
        paths (iterable of str): Paths to be filtered.
        root_paths (iterable of str): Paths to check as potential prefixes.
        skip_sorting (bool): If True, assumes both `paths` and `root_paths` are
            already in lexicographic order.

    Returns:
        list of str: The paths that are descendants of any path in `root_paths`,
        in lexicographic order.
    """
    if skip_sorting:
        ordered_paths = paths
        # Materialize so that len() and indexing work for any iterable.
        ordered_roots = list(root_paths)
    else:
        # sorted() returns new lists: the caller's arguments are left untouched
        # and any iterable (tuple, set, generator...) is accepted.
        ordered_paths = sorted(paths)
        ordered_roots = sorted(root_paths)

    results = []
    cursor = 0
    root_count = len(ordered_roots)

    for path in ordered_paths:
        # Try to find a matching prefix among the sorted root paths
        while cursor < root_count:
            root = ordered_roots[cursor]
            if path.startswith(root):
                results.append(path)
                break
            if root < path:
                # This root sorts before `path` without being a prefix of it, so
                # it cannot be a prefix of any later (greater) path either.
                cursor += 1
            else:
                # The root sorts after `path`: no remaining root can match it.
                break
    return results
|
|
|
|
|
|
def base64_yjs_to_xml(base64_string):
    """
    Extract xml from base64 yjs document.

    The base64 payload is decoded to bytes and applied as an update to a fresh
    pycrdt document; the "document-store" XML fragment of that document is then
    returned in its string form.
    """
    update = base64.b64decode(base64_string)

    ydoc = pycrdt.Doc()
    ydoc.apply_update(update)

    fragment = ydoc.get("document-store", type=pycrdt.XmlFragment)
    return str(fragment)
|
|
|
|
|
|
def base64_yjs_to_text(base64_string):
    """
    Extract text from base64 yjs document.

    The document is first converted to its XML string with `base64_yjs_to_xml`,
    then parsed by BeautifulSoup using the "lxml-xml" parser. The text content is
    returned with each piece stripped and joined by a single space.
    """
    return BeautifulSoup(
        base64_yjs_to_xml(base64_string),
        "lxml-xml",
    ).get_text(separator=" ", strip=True)
|
|
|
|
|
|
def extract_attachments(content):
    """
    Helper method to extract media paths from a document's content.

    Returns an empty list when `content` is falsy. Otherwise the content is
    converted to XML with `base64_yjs_to_xml` and every match of
    `enums.MEDIA_STORAGE_URL_EXTRACT` found in it is returned.
    """
    if content:
        return re.findall(
            enums.MEDIA_STORAGE_URL_EXTRACT,
            base64_yjs_to_xml(content),
        )
    return []
|