Skip to content

Commit

Permalink
Merge pull request #22 from NCATS-Tangerine/issue#10
Browse files Browse the repository at this point in the history
Command line wrapper
  • Loading branch information
cmungall authored May 2, 2018
2 parents 9398b8d + ceb1fe5 commit 09750a2
Show file tree
Hide file tree
Showing 7 changed files with 203 additions and 8 deletions.
52 changes: 47 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,52 @@ the

For additional background see the [Translator Knowledge Graph Drive](http://bit.ly/tr-kg)

## Command Line Usage
## Installation
```
pip install -r requirements.txt
python setup.py install
```

Coming soon. For now see the [tests](tests/) for example
## Command Line Usage
Use the `--help` flag to get help. Right now there is a single command:
```
Usage: kgx dump [OPTIONS] [INPUT]... OUTPUT
Transforms a knowledge graph from one representation to another
INPUT : any number of files or endpoints
OUTPUT : the output file
Options:
--input-type TEXT Extention type of input files: ttl, json, csv, rq, tsv,
graphml
--output-type TEXT Extention type of output files: ttl, json, csv, rq, tsv,
graphml
--help Show this message and exit.
```

CSV/TSV representations require two files: one that represents the vertex set and
one that represents the edge set. JSON, TTL, and GRAPHML files represent a whole
graph in a single file. For this reason, when creating a CSV/TSV representation we
bundle the two resulting files into a single .tar archive.

The format is inferred from the file extension. If it cannot be inferred, use the
`--input-type` and `--output-type` flags to tell the program which formats to
use. Currently not all conversions are supported.

Here are some examples that mirror the [tests](tests/):

```
$ kgx dump --output-type=csv tests/resources/x1n.csv tests/resources/x1e.csv target/x1out
File created at: target/x1out.tar
$ kgx dump tests/resources/x1n.csv tests/resources/x1e.csv target/x1n.graphml
File created at: target/x1n.graphml
$ kgx dump tests/resources/monarch/biogrid_test.ttl target/bgcopy.csv
File created at: target/bgcopy.csv.tar
$ kgx dump tests/resources/monarch/biogrid_test.ttl target/x1n.graphml
File created at: target/x1n.graphml
$ kgx dump tests/resources/monarch/biogrid_test.ttl target/x1n.json
File created at: target/x1n.json
```

## Internal Representation

Expand All @@ -23,8 +66,8 @@ standard](http://bit.ly/tr-kg-standard), briefly summarized here:
* id : required
* name : string
* category : string. broad high level type. Corresponds to label in neo4j
* extensible other properties, depending on
* [Edges](https://biolink.github.io/biolink-model/docs/Association.html)
* extensible other properties, depending on
* [Edges](https://biolink.github.io/biolink-model/docs/Association.html)
* subject : required
* predicate : required
* object : required
Expand Down Expand Up @@ -53,4 +96,3 @@ Intended to support
Neo4j implements property graphs out the box. However, some
implementations use reification nodes. The transform should allow for
de-reification.

2 changes: 2 additions & 0 deletions kgx/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,5 @@
from .sparql_transformer import SparqlTransformer
from .rdf_transformer import ObanRdfTransformer
from .json_transformer import JsonTransformer

from .kgx import cli
104 changes: 104 additions & 0 deletions kgx/kgx.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
import kgx
import click
import os

# Registry mapping a file extension / format name to the transformer class
# that handles it. Insertion order is the order shown in help and error text.
_transformers = {
    'csv': kgx.PandasTransformer,
    'tsv': kgx.PandasTransformer,
    'graphml': kgx.GraphMLTransformer,
    'ttl': kgx.ObanRdfTransformer,
    'json': kgx.JsonTransformer,
    'rq': kgx.SparqlTransformer,
}

# Human-readable, comma-separated list of the registered extensions.
_file_types = ', '.join(_transformers)

class Config:
    """Holder for settings shared between the command group and its subcommands."""

    def __init__(self):
        # Debug mode is off until the command group turns it on.
        self.debug = False

# Decorator that hands the shared Config object to a command as its first
# argument. With ensure=True click is expected to create the Config on first
# use if none exists yet (see click's make_pass_decorator documentation).
pass_config = click.make_pass_decorator(Config, ensure=True)

# Root command group of the command line interface. The docstring below is
# used by click as the group's help text, so treat it as user-facing output.
@click.group()
@click.option('--debug', is_flag=True, help='Prints the stack trace if error occurs')
@click.version_option(version=kgx.__version__, prog_name=kgx.__name__)
@pass_config
def cli(config, debug):
    """
    Knowledge Graph Exchange
    """
    # Store the flag on the shared Config so that subcommands can consult it.
    config.debug = debug

# The docstring of this command is rendered by click as its --help text (the
# leading "\b" marker tells click not to re-wrap the paragraph), so it is
# user-facing output rather than developer documentation.
@cli.command()
@click.option('--input-type', type=str, help='Extention type of input files: ' + _file_types)
@click.option('--output-type', type=str, help='Extention type of output files: ' + _file_types)
@click.argument('input', nargs=-1, type=click.Path(exists=False))
@click.argument('output', type=click.Path(exists=False))
@pass_config
def dump(config, input, output, input_type, output_type):
    """\b
    Transforms a knowledge graph from one representation to another
    INPUT : any number of files or endpoints
    OUTPUT : the output file
    """
    try:
        _dump(input, output, input_type, output_type)
    except Exception as e:
        if config.debug:
            # Bare raise re-raises the active exception with its original
            # traceback, which is what --debug promises to show.
            raise
        # ClickException takes a message string; pass the text of the error
        # rather than the exception object, and keep the cause chained.
        raise click.ClickException(str(e)) from e

def _dump(input, output, input_type, output_type):
    """
    Load a graph from the inputs and save it to `output` in another format.

    input       -- sequence of file paths or endpoints; each one is passed to
                   the input transformer's parse()
    output      -- destination path passed to the output transformer's save()
    input_type  -- key of `_transformers`, or None to infer it from the
                   extensions of the inputs (which must then all agree)
    output_type -- key of `_transformers`, or None to infer it from the
                   extension of `output`

    Raises Exception if a type cannot be resolved to a registered
    transformer, if the inputs have differing inferred types, or if no inputs
    are given while the input type has to be inferred.
    """
    if output_type is None:
        output_type = _get_type(output)

    if input_type is None:
        if not input:
            # Without this guard an empty input list surfaced as an opaque
            # IndexError from indexing the list of inferred types.
            raise Exception(
                'No input files were given, so the input type cannot be '
                'inferred. Try setting the --input-type parameter.'
            )

        input_types = [_get_type(i) for i in input]
        if any(t != input_types[0] for t in input_types):
            raise Exception(
                'Each input file must have the same file type. '
                'Try setting the --input-type parameter to enforce a single type.'
            )
        input_type = input_types[0]

    input_transformer = _transformers.get(input_type)
    if input_transformer is None:
        raise Exception('Input does not have a recognized type: ' + _file_types)

    # Resolve the output transformer before parsing anything, so that an
    # unsupported output type fails immediately instead of after loading
    # every input.
    output_transformer = _transformers.get(output_type)
    if output_transformer is None:
        raise Exception('Output does not have a recognized type: ' + _file_types)

    t = input_transformer()
    for i in input:
        t.parse(i)

    t.report()

    # Passed through to save(); the 'extention' spelling matches the keyword
    # parameter name of PandasTransformer.save.
    kwargs = {
        'tmp_dir': click.get_app_dir(kgx.__name__),
        'extention': output_type,
    }

    w = output_transformer(t)
    result_path = w.save(output, **kwargs)

    # save() may return the path it actually wrote (e.g. with an archive
    # suffix appended) or None; fall back to the requested output path.
    if result_path is not None and os.path.isfile(result_path):
        click.echo("File created at: " + result_path)
    elif os.path.isfile(output):
        click.echo("File created at: " + output)
    else:
        click.echo("Could not create file.")


def _get_type(filename):
    """
    Return the first registered extension that `filename` ends with
    (as '.<extension>'), or None when no registered extension matches.
    """
    matching = (ext for ext in _transformers if filename.endswith('.' + ext))
    return next(matching, None)
2 changes: 1 addition & 1 deletion kgx/nx_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,5 @@ class GraphMLTransformer(NetworkxTransformer):
I/O for graphml
"""

def save(self, path):
def save(self, path, **kwargs):
    """
    Write this transformer's graph to `path` in GraphML format.

    Additional keyword arguments are accepted but not used by this method.
    """
    nx.write_graphml(self.graph, path)
36 changes: 35 additions & 1 deletion kgx/pandas_transformer.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
import pandas as pd
import logging
import os
import tarfile

from .transformer import Transformer

from typing import Dict, List
Expand Down Expand Up @@ -72,8 +75,39 @@ def order_cols(self, cols: List[str]):
cols.remove(c)
return cols2 + cols

def save(self, filename: str, tmp_dir='.', extention='csv', ziptype='tar', zipmode='w', **kwargs):
    """
    Write the node set and edge set of the graph as two delimited files and
    bundle them into a tar archive.

    filename  -- path of the archive; the `ziptype` suffix is appended when
                 the name does not already end with it
    tmp_dir   -- directory in which 'nodes.<extention>' and
                 'edges.<extention>' are written; it is created (including
                 missing parent directories) if it does not exist
    extention -- extension of the two member files; 'tsv' produces
                 tab-separated output, anything else comma-separated
    ziptype   -- archive suffix, with or without the leading dot
    zipmode   -- mode passed to tarfile.open
    kwargs    -- accepted but unused by this method

    The two intermediate files are not deleted afterwards, and every call
    overwrites them. Returns the path of the archive that was written.
    """
    # exist_ok avoids the check-then-create race, and makedirs also creates
    # missing parent directories, which a plain mkdir cannot do.
    os.makedirs(tmp_dir, exist_ok=True)

    edge_file_name = 'edges.' + extention
    node_file_name = 'nodes.' + extention

    edge_file_path = os.path.join(tmp_dir, edge_file_name)
    node_file_path = os.path.join(tmp_dir, node_file_name)

    # A .tsv file must actually be tab-delimited; to_csv defaults to commas.
    delimiter = '\t' if extention == 'tsv' else ','

    self.export_nodes().to_csv(node_file_path, sep=delimiter, index=False)
    self.export_edges().to_csv(edge_file_path, sep=delimiter, index=False)

    if not ziptype.startswith('.'):
        ziptype = '.' + ziptype

    if not filename.endswith(ziptype):
        filename += ziptype

    # arcname keeps the temporary directory out of the member paths.
    with tarfile.open(name=filename, mode=zipmode) as tar:
        tar.add(name=node_file_path, arcname=node_file_name)
        tar.add(name=edge_file_path, arcname=edge_file_name)

    return filename

def save(self, filename: str, type='n', **args):
def save_csv(self, filename: str, type='n', **args):
"""
Write a CSV/TSV
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@ prefixcommons>=0.1.4
pip>=9.0.1
networkx==1.11
SPARQLWrapper==1.8.0
pandas>=0.0
pandas<0.21
pytest>=0.0
mypy>=0.0
pystache>=0.0
pytest_logging>=0.0
rdflib>=0.0
click==6.7
12 changes: 12 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from setuptools import setup

# Package definition for the kgx command line tool.
setup(
name='Knowledge Graph Exchange',
version='0.0.1',
packages=['kgx'],
# Only Click is declared here; the other libraries are listed in
# requirements.txt.
install_requires=['Click'],
# Registers a `kgx` console script that calls the `cli` object exposed by
# the kgx package.
entry_points="""
[console_scripts]
kgx=kgx:cli
"""
)

0 comments on commit 09750a2

Please sign in to comment.