# Source code for UniProtMapper.utils

"""Module with utility functions for the package."""

import json
import re
import zlib
from pathlib import Path
from typing import Optional

import pandas as pd
import requests


def get_resource_file(filename: str) -> str:
    """Resolve *filename* to a path inside the installed ``UniProtMapper`` package.

    Prefers the modern :mod:`importlib.resources` API; falls back to the
    deprecated ``pkg_resources`` (setuptools) API when ``resources.files``
    is unavailable (Python < 3.9) or the importlib lookup fails.

    Args:
        filename: path of the resource, relative to the package root.

    Returns:
        str: the filesystem path to the resource.
    """
    try:
        from importlib import resources

        # resources.files exists on Python >= 3.9; AttributeError covers
        # older interpreters where only the legacy API is present.
        return str(resources.files("UniProtMapper") / filename)
    except (ImportError, AttributeError):
        import pkg_resources

        return pkg_resources.resource_filename("UniProtMapper", filename)
def get_resources_root() -> Path:
    """Return the path to the package's ``resources`` directory as a Path."""
    resources_dir = get_resource_file("resources")
    return Path(resources_dir)
def fetch_cross_referenced_db_details(
    output_path: Optional[str] = None, save: bool = True
) -> dict:
    """Downloads the latest details on UniProt cross references and stores it.

    This list of cross references can be found here:
    https://www.uniprot.org/database?query=*

    Args:
        output_path: the path to save the downloaded file with the cross
            references details. If left as None, will update the file stored
            in the package. Defaults to None.
        save: whether to save or not the retrieved json. Defaults to True.

    Returns:
        dict: the json with the cross references details.
    """
    if output_path is None:
        output_path = get_resources_root() / "uniprot_crossref_details.json"
    else:
        output_path = Path(output_path)
    url = "https://rest.uniprot.org/database/stream?&download=true&format=json&query=%28*%29"
    response = requests.get(url)
    if response.status_code == 200:
        # Parse the payload once and reuse it for both saving and returning
        # (the original parsed the response body twice).
        details = response.json()
        if save:
            with output_path.open("w") as f:
                json.dump(details, f, indent=4)
        print("File downloaded successfully.")
        return details
    print(f"Failed to download the file. Status code: {response.status_code}")
    # NOTE(review): on a non-200 response this may raise if the error body is
    # not JSON — same behavior as the original implementation.
    return response.json()
def read_fields_table():
    """Load the UniProt return-fields table bundled with the package.

    Returns a DataFrame with one row per piece of information available in
    UniProt, and the following columns:

    - `label`: the label of the information once retrieved from the API.
    - `returned_field`: the name used in the API to return that information.
    - `field_type`: the type of information, e.g.: sequence-related, function...
    - `has_full_version`: whether the annotated field contains the full version
      of the dataset or not (in case of cross-references).
    - `type`: the type of data. Either "cross_reference" or "uniprot_field".
    """
    fields_csv = get_resource_file("resources/uniprot_return_fields.csv")
    return pd.read_csv(fields_csv)
def supported_mapping_dbs():
    """Return a sorted list of the supported datasets as UniProt cross references.

    This list is used to validate the arguments `to_db` and `from_db` in the
    `FieldRetriever.get()` method.

    Returns:
        list: all supported database names, sorted alphabetically.
    """
    _mapping_dbs_path = get_resource_file("resources/uniprot_mapping_dbs.json")
    with open(_mapping_dbs_path, "r") as f:
        dbs_dict = json.load(f)
    # Flatten every per-category list of database names into one sorted list
    # (replaces the original index-based range(len(...)) iteration).
    return sorted(name for names in dbs_dict.values() for name in names)
def decode_results(response, file_format, compressed):
    """Decode a UniProt API response into Python objects.

    Depending on *file_format*, returns parsed JSON, a list of non-empty TSV
    lines, raw bytes wrapped in a list (xlsx), decoded XML text wrapped in a
    list, or plain text. When *compressed* is true the body is first
    gunzipped with zlib.
    """
    if compressed:
        raw = zlib.decompress(response.content, 16 + zlib.MAX_WBITS)
        if file_format == "json":
            return json.loads(raw.decode("utf-8"))
        if file_format == "tsv":
            return [line for line in raw.decode("utf-8").split("\n") if line]
        if file_format == "xlsx":
            return [raw]
        if file_format == "xml":
            return [raw.decode("utf-8")]
        return raw.decode("utf-8")
    if file_format == "json":
        return response.json()
    if file_format == "tsv":
        return [line for line in response.text.split("\n") if line]
    if file_format == "xlsx":
        return [response.content]
    if file_format == "xml":
        return [response.text]
    return response.text
def get_xml_namespace(element):
    """Return the ``{...}``-style namespace of an XML element's tag, or ""."""
    match = re.match(r"\{(.*)\}", element.tag)
    if match:
        return match.group(1)
    return ""
def divide_batches(ids, batch_size: int = 500):
    """Divide a list of UniProt IDs into consecutive batches.

    Args:
        ids: sequence of identifiers to split.
        batch_size: maximum number of IDs per batch. Defaults to 500 (the
            size used by the UniProt ID-mapping API), generalized from the
            previously hard-coded constant.

    Returns:
        list: list of slices of *ids*, each of length <= batch_size; an
        empty *ids* yields an empty list.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    return [ids[i : i + batch_size] for i in range(0, len(ids), batch_size)]