From 08bfc679ff51b319f4241f02069b44495153fd7a Mon Sep 17 00:00:00 2001 From: JasonJoosteCSIRO <120607685+JasonJoosteCSIRO@users.noreply.github.com> Date: Sat, 6 May 2023 05:23:32 +1000 Subject: [PATCH 1/3] Sequential cell ids (#184) * Add command line argument keep-id, which maintains randomly generated cell ids. Otherwise cell ids are assigned incrementally (after the removal of cells), which should keep them consistent across runs in version control * Modify test_cell and test_exception in test_keep_output_tags.py to use the new strip_output signature * Fix failed test_end_to_end_nbstripout with test_max_size by passing --keep-id for keeping the existing ids * Add tests for notebooks with and without the --keep-id flag. A new extension expected_id was added for expected output with ordered ids * Modify the readme to include the --keep-id flag * Add keyword arguments for None inputs in test_keep_output_tags.py * Rename expected output files to make desired sequential ids more explicit Co-authored-by: Florian Rathgeber --- README.rst | 4 + nbstripout/_nbstripout.py | 8 +- nbstripout/_utils.py | 8 +- ...test_max_size.ipynb.expected_sequential_id | 90 ++++++++++++++++++ tests/e2e_notebooks/test_nbformat45.ipynb | 93 +++++++++++++++++++ .../test_nbformat45.ipynb.expected | 61 ++++++++++++ ...st_nbformat45.ipynb.expected_sequential_id | 61 ++++++++++++ tests/test_end_to_end.py | 5 +- tests/test_keep_output_tags.py | 4 +- 9 files changed, 325 insertions(+), 9 deletions(-) create mode 100644 tests/e2e_notebooks/test_max_size.ipynb.expected_sequential_id create mode 100644 tests/e2e_notebooks/test_nbformat45.ipynb create mode 100644 tests/e2e_notebooks/test_nbformat45.ipynb.expected create mode 100644 tests/e2e_notebooks/test_nbformat45.ipynb.expected_sequential_id diff --git a/README.rst b/README.rst index a161c65..3d97290 100644 --- a/README.rst +++ b/README.rst @@ -274,6 +274,10 @@ Do not strip the output :: nbstripout --keep-output +Do not reassign the cell ids to 
be sequential :: + + nbstripout --keep-id + To mark special cells so that the output is not stripped, you can either: 1. Set the ``keep_output`` tag on the cell. To do this, enable the tags diff --git a/nbstripout/_nbstripout.py b/nbstripout/_nbstripout.py index 3957395..50f096a 100644 --- a/nbstripout/_nbstripout.py +++ b/nbstripout/_nbstripout.py @@ -373,6 +373,9 @@ def main(): help='Do not strip the execution count/prompt number') parser.add_argument('--keep-output', action='store_true', help='Do not strip output', default=None) + parser.add_argument('--keep-id', action='store_true', + help='Keep the randomly generated cell ids, ' + 'which will be different after each execution.') parser.add_argument('--extra-keys', default='', help='Space separated list of extra keys to strip ' 'from metadata, e.g. metadata.foo cell.metadata.bar') @@ -409,7 +412,6 @@ def main(): parser.add_argument('files', nargs='*', help='Files to strip output from') args = parser.parse_args() - git_config = ['git', 'config'] if args._system: @@ -487,7 +489,7 @@ def main(): warnings.simplefilter("ignore", category=UserWarning) nb = read(f, as_version=NO_CONVERT) - nb = strip_output(nb, args.keep_output, args.keep_count, extra_keys, args.drop_empty_cells, + nb = strip_output(nb, args.keep_output, args.keep_count, args.keep_id, extra_keys, args.drop_empty_cells, args.drop_tagged_cells.split(), args.strip_init_cells, _parse_size(args.max_size)) if args.dry_run: @@ -533,7 +535,7 @@ def main(): warnings.simplefilter("ignore", category=UserWarning) nb = read(input_stream, as_version=NO_CONVERT) - nb = strip_output(nb, args.keep_output, args.keep_count, extra_keys, args.drop_empty_cells, + nb = strip_output(nb, args.keep_output, args.keep_count, args.keep_id, extra_keys, args.drop_empty_cells, args.drop_tagged_cells.split(), args.strip_init_cells, _parse_size(args.max_size)) if args.dry_run: diff --git a/nbstripout/_utils.py b/nbstripout/_utils.py index d54ac91..322edbd 100644 --- 
a/nbstripout/_utils.py +++ b/nbstripout/_utils.py @@ -94,7 +94,7 @@ def strip_zeppelin_output(nb): return nb -def strip_output(nb, keep_output, keep_count, extra_keys=[], drop_empty_cells=False, drop_tagged_cells=[], +def strip_output(nb, keep_output, keep_count, keep_id, extra_keys=[], drop_empty_cells=False, drop_tagged_cells=[], strip_init_cells=False, max_size=0): """ Strip the outputs, execution count/prompt number and miscellaneous @@ -124,7 +124,7 @@ def strip_output(nb, keep_output, keep_count, extra_keys=[], drop_empty_cells=Fa for tag_to_drop in drop_tagged_cells: conditionals.append(lambda c: tag_to_drop not in c.get("metadata", {}).get("tags", [])) - for cell in _cells(nb, conditionals): + for i, cell in enumerate(_cells(nb, conditionals)): keep_output_this_cell = determine_keep_output(cell, keep_output, strip_init_cells) # Remove the outputs, unless directed otherwise @@ -148,7 +148,9 @@ def strip_output(nb, keep_output, keep_count, extra_keys=[], drop_empty_cells=Fa cell['prompt_number'] = None if 'execution_count' in cell and not keep_count: cell['execution_count'] = None - + # Replace the cell id with an incremental value that will be consistent across runs + if 'id' in cell and not keep_id: + cell['id'] = str(i) for field in keys['cell']: pop_recursive(cell, field) return nb diff --git a/tests/e2e_notebooks/test_max_size.ipynb.expected_sequential_id b/tests/e2e_notebooks/test_max_size.ipynb.expected_sequential_id new file mode 100644 index 0000000..44cddfb --- /dev/null +++ b/tests/e2e_notebooks/test_max_size.ipynb.expected_sequential_id @@ -0,0 +1,90 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "0", + "metadata": {}, + "source": [ + "This notebook tests that outputs can be cleared based on size" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "aaaaaaaaaa\n" + ] + } + ], + "source": [ + "print(\"a\"*10)" + ] + 
}, + { + "cell_type": "code", + "execution_count": null, + "id": "2", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"a\"*100)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.4" + }, + "varInspector": { + "cols": { + "lenName": 16, + "lenType": 16, + "lenVar": 40 + }, + "kernels_config": { + "python": { + "delete_cmd_postfix": "", + "delete_cmd_prefix": "del ", + "library": "var_list.py", + "varRefreshCmd": "print(var_dic_list())" + }, + "r": { + "delete_cmd_postfix": ") ", + "delete_cmd_prefix": "rm(", + "library": "var_list.r", + "varRefreshCmd": "cat(var_dic_list()) " + } + }, + "types_to_exclude": [ + "module", + "function", + "builtin_function_or_method", + "instance", + "_Feature" + ], + "window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tests/e2e_notebooks/test_nbformat45.ipynb b/tests/e2e_notebooks/test_nbformat45.ipynb new file mode 100644 index 0000000..f6d3f16 --- /dev/null +++ b/tests/e2e_notebooks/test_nbformat45.ipynb @@ -0,0 +1,93 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "5c42035d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'This is the new Jupyter notebook'" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\"This is the new Jupyter notebook\"" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "886205fa", + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'text2'" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\"text2\"" + ] + }, + { + 
"cell_type": "code", + "execution_count": 3, + "id": "a183d4e9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "f(3) = 4\n" + ] + } + ], + "source": [ + "def f(x):\n", + " \"\"\"My function\n", + " x : parameter\"\"\"\n", + " \n", + " return x+1\n", + "\n", + "print(\"f(3) = \", f(3))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tests/e2e_notebooks/test_nbformat45.ipynb.expected b/tests/e2e_notebooks/test_nbformat45.ipynb.expected new file mode 100644 index 0000000..fb78c3f --- /dev/null +++ b/tests/e2e_notebooks/test_nbformat45.ipynb.expected @@ -0,0 +1,61 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "5c42035d", + "metadata": {}, + "outputs": [], + "source": [ + "\"This is the new Jupyter notebook\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "886205fa", + "metadata": {}, + "outputs": [], + "source": [ + "\"text2\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a183d4e9", + "metadata": {}, + "outputs": [], + "source": [ + "def f(x):\n", + " \"\"\"My function\n", + " x : parameter\"\"\"\n", + " \n", + " return x+1\n", + "\n", + "print(\"f(3) = \", f(3))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + 
"version": "3.10.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tests/e2e_notebooks/test_nbformat45.ipynb.expected_sequential_id b/tests/e2e_notebooks/test_nbformat45.ipynb.expected_sequential_id new file mode 100644 index 0000000..8c499da --- /dev/null +++ b/tests/e2e_notebooks/test_nbformat45.ipynb.expected_sequential_id @@ -0,0 +1,61 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "0", + "metadata": {}, + "outputs": [], + "source": [ + "\"This is the new Jupyter notebook\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1", + "metadata": {}, + "outputs": [], + "source": [ + "\"text2\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2", + "metadata": {}, + "outputs": [], + "source": [ + "def f(x):\n", + " \"\"\"My function\n", + " x : parameter\"\"\"\n", + " \n", + " return x+1\n", + "\n", + "print(\"f(3) = \", f(3))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tests/test_end_to_end.py b/tests/test_end_to_end.py index 0b96923..575dd9b 100644 --- a/tests/test_end_to_end.py +++ b/tests/test_end_to_end.py @@ -15,7 +15,8 @@ ("test_drop_tagged_cells.ipynb", "test_drop_tagged_cells_dontdrop.ipynb.expected", []), ("test_drop_tagged_cells.ipynb", "test_drop_tagged_cells.ipynb.expected", ['--drop-tagged-cells=test']), ("test_execution_timing.ipynb", "test_execution_timing.ipynb.expected", []), - ("test_max_size.ipynb", "test_max_size.ipynb.expected", ["--max-size", "50"]), + ("test_max_size.ipynb", "test_max_size.ipynb.expected", ["--max-size", "50", "--keep-id"]), + 
("test_max_size.ipynb", "test_max_size.ipynb.expected_sequential_id", ["--max-size", "50"]), ("test_metadata.ipynb", "test_metadata.ipynb.expected", []), ("test_metadata.ipynb", "test_metadata_extra_keys.ipynb.expected", ["--extra-keys", "metadata.kernelspec metadata.language_info"]), ("test_metadata.ipynb", "test_metadata_keep_count.ipynb.expected", ["--keep-count"]), @@ -26,6 +27,8 @@ ("test_metadata_period.ipynb", "test_metadata_period.ipynb.expected", ["--extra-keys", "cell.metadata.application/vnd.databricks.v1+cell metadata.application/vnd.databricks.v1+notebook"]), ("test_strip_init_cells.ipynb", "test_strip_init_cells.ipynb.expected", ["--strip-init-cells"]), ("test_nbformat2.ipynb", "test_nbformat2.ipynb.expected", []), + ("test_nbformat45.ipynb", "test_nbformat45.ipynb.expected", ["--keep-id"]), + ("test_nbformat45.ipynb", "test_nbformat45.ipynb.expected_sequential_id", []), ("test_unicode.ipynb", "test_unicode.ipynb.expected", []), ("test_widgets.ipynb", "test_widgets.ipynb.expected", []), ("test_zeppelin.zpln", "test_zeppelin.zpln.expected", ["--mode", "zeppelin"]), diff --git a/tests/test_keep_output_tags.py b/tests/test_keep_output_tags.py index 7ec8567..4243a82 100644 --- a/tests/test_keep_output_tags.py +++ b/tests/test_keep_output_tags.py @@ -24,7 +24,7 @@ def nb_with_exception(): def test_cells(orig_nb): nb_stripped = deepcopy(orig_nb) - nb_stripped = strip_output(nb_stripped, None, None) + nb_stripped = strip_output(nb_stripped, keep_output=None, keep_count=None, keep_id=None) for i, cell in enumerate(nb_stripped.cells): if cell.cell_type == 'code' and cell.source: match = re.match(r"\s*#\s*(output|no_output)", cell.source) @@ -41,4 +41,4 @@ def test_cells(orig_nb): def test_exception(nb_with_exception): with pytest.raises(MetadataError): - strip_output(nb_with_exception, None, None) + strip_output(nb_with_exception, keep_output=None, keep_count=None, keep_id=None) From 749f431366811f26114c15aefcb5149e014b9e97 Mon Sep 17 00:00:00 2001 From: 
wpbonelli Date: Sun, 22 Oct 2023 02:40:32 -0400 Subject: [PATCH 2/3] Add python3.11 classifier to setup.py, remove python3.6 (#186) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index dede8c5..5982ec0 100644 --- a/setup.py +++ b/setup.py @@ -37,10 +37,10 @@ "Intended Audience :: Developers", "Programming Language :: Python", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", "Topic :: Software Development :: Version Control", ]) From 6f6303a0a6e8c3fda570612731ef20c265358379 Mon Sep 17 00:00:00 2001 From: "J. Sebastian Paez" Date: Tue, 13 Feb 2024 16:30:15 -0800 Subject: [PATCH 3/3] (wip) first commit --- nbstripout/_nbstripout.py | 55 ++++++++++++++++++++++++++++++++++----- 1 file changed, 48 insertions(+), 7 deletions(-) diff --git a/nbstripout/_nbstripout.py b/nbstripout/_nbstripout.py index 50f096a..6f4c748 100644 --- a/nbstripout/_nbstripout.py +++ b/nbstripout/_nbstripout.py @@ -354,6 +354,9 @@ def status(git_config, install_location=INSTALL_LOCATION_LOCAL, verbose=False): def main(): parser = ArgumentParser(epilog=__doc__, formatter_class=RawDescriptionHelpFormatter) task = parser.add_mutually_exclusive_group() + task.add_argument('--verify', action='store_true', + help='Print which notebooks would have been stripped, ' + 'Like dry-run but returns an error if a file would have been changed') task.add_argument('--dry-run', action='store_true', help='Print which notebooks would have been stripped') task.add_argument('--install', action='store_true', @@ -469,6 +472,7 @@ def main(): input_stream = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') if sys.stdin else None output_stream = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', newline='') + to_change_files = [] 
for filename in args.files: if not (args.force or filename.endswith('.ipynb') or filename.endswith('.zpln')): continue @@ -480,10 +484,17 @@ def main(): output_stream.write(f'Dry run: would have stripped {filename}\n') continue nb = json.load(f, object_pairs_hook=collections.OrderedDict) + pre_hash = hash(json.dumps(nb)) nb_stripped = strip_zeppelin_output(nb) - with open(filename, 'w') as f: - json.dump(nb_stripped, f, indent=2) + if not args.verify: + with open(filename, 'w') as f: + json.dump(nb_stripped, f, indent=2) + else: + post_hash = hash(json.dumps(nb_stripped)) + + if pre_hash != post_hash: + to_change_files.append(filename) continue with warnings.catch_warnings(): warnings.simplefilter("ignore", category=UserWarning) @@ -491,10 +502,10 @@ def main(): nb = strip_output(nb, args.keep_output, args.keep_count, args.keep_id, extra_keys, args.drop_empty_cells, args.drop_tagged_cells.split(), args.strip_init_cells, _parse_size(args.max_size)) + post_hash = hash(json.dumps(nb)) if args.dry_run: output_stream.write(f'Dry run: would have stripped {filename}\n') - continue if args.textconv: @@ -504,10 +515,15 @@ def main(): output_stream.flush() else: - with io.open(filename, 'w', encoding='utf8', newline='') as f: - with warnings.catch_warnings(): - warnings.simplefilter("ignore", category=UserWarning) - write(nb, f) + if args.verify: + if pre_hash != post_hash: + output_stream.write(f'Verify: would have stripped {filename}\n') + to_change_files.append(filename) + else: + with io.open(filename, 'w', encoding='utf8', newline='') as f: + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=UserWarning) + write(nb, f) except NotJSONError: print(f"'{filename}' is not a valid notebook", file=sys.stderr) raise SystemExit(1) @@ -519,28 +535,53 @@ def main(): print(f"Could not strip '{filename}'", file=sys.stderr) raise + if to_change_files: + raise SystemExit(1) + if not args.files and input_stream: try: if args.mode == 'zeppelin': if args.dry_run: 
output_stream.write('Dry run: would have stripped input from stdin\n') raise SystemExit(0) + nb = json.load(input_stream, object_pairs_hook=collections.OrderedDict) nb_stripped = strip_zeppelin_output(nb) + + if args.verify: + pre_hash = hash(json.dumps(nb)) + post_hash = hash(json.dumps(nb_stripped)) + if pre_hash != post_hash: + output_stream.write('Verify: would have stripped input from stdin\n') + raise SystemExit(1) + else: + raise SystemExit(0) + json.dump(nb_stripped, output_stream, indent=2) output_stream.write('\n') output_stream.flush() raise SystemExit(0) + with warnings.catch_warnings(): warnings.simplefilter("ignore", category=UserWarning) nb = read(input_stream, as_version=NO_CONVERT) + pre_hash = hash(json.dumps(nb)) nb = strip_output(nb, args.keep_output, args.keep_count, args.keep_id, extra_keys, args.drop_empty_cells, args.drop_tagged_cells.split(), args.strip_init_cells, _parse_size(args.max_size)) + post_hash = hash(json.dumps(nb)) if args.dry_run: output_stream.write('Dry run: would have stripped input from ' 'stdin\n') + elif args.verify: + if pre_hash != post_hash: + output_stream.write( + 'Verify: would have stripped input from stdin\n' + ) + raise SystemExit(1) + + output_stream.flush() else: with warnings.catch_warnings(): warnings.simplefilter("ignore", category=UserWarning)