Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Command line wrapper #22

Merged
merged 11 commits into from
May 2, 2018
52 changes: 47 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,52 @@ the

For additional background see the [Translator Knowledge Graph Drive](http://bit.ly/tr-kg)

## Command Line Usage
## Installation
```
pip install -r requirements.txt
python setup.py install
```

Coming soon. For now see the [tests](tests/) for example
## Command Line Usage
Use the `--help` flag to get help. Right now there is a single command:
```
Usage: kgx dump [OPTIONS] [INPUT]... OUTPUT

Transforms a knowledge graph from one representation to another
INPUT : any number of files or endpoints
OUTPUT : the output file

Options:
--input-type TEXT Extention type of input files: ttl, json, csv, rq, tsv,
graphml
--output-type TEXT Extention type of output files: ttl, json, csv, rq, tsv,
graphml
--help Show this message and exit.
```

CSV/TSV representations require two files, one that represents the vertex set and
one for the edge set. JSON, TTL, and GRAPHML files represent a whole graph in a
single file. For this reason, when creating a CSV/TSV representation we bundle
the two resulting files into a single .tar archive.

The format will be inferred from the file extension. If it cannot be inferred,
use the `--input-type` and `--output-type` flags to tell the program
which formats to use. Currently not all conversions are supported.

Here are some examples that mirror the [tests](tests/):

```
$ kgx dump --output-type=csv tests/resources/x1n.csv tests/resources/x1e.csv target/x1out
File created at: target/x1out.tar
$ kgx dump tests/resources/x1n.csv tests/resources/x1e.csv target/x1n.graphml
File created at: target/x1n.graphml
$ kgx dump tests/resources/monarch/biogrid_test.ttl target/bgcopy.csv
File created at: target/bgcopy.csv.tar
$ kgx dump tests/resources/monarch/biogrid_test.ttl target/x1n.graphml
File created at: target/x1n.graphml
$ kgx dump tests/resources/monarch/biogrid_test.ttl target/x1n.json
File created at: target/x1n.json
```

## Internal Representation

Expand All @@ -23,8 +66,8 @@ standard](http://bit.ly/tr-kg-standard), briefly summarized here:
* id : required
* name : string
* category : string. broad high level type. Corresponds to label in neo4j
* extensible other properties, depending on
* [Edges](https://biolink.github.io/biolink-model/docs/Association.html)
* extensible other properties, depending on
* [Edges](https://biolink.github.io/biolink-model/docs/Association.html)
* subject : required
* predicate : required
* object : required
Expand Down Expand Up @@ -53,4 +96,3 @@ Intended to support
Neo4j implements property graphs out the box. However, some
implementations use reification nodes. The transform should allow for
de-reification.

2 changes: 2 additions & 0 deletions kgx/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,5 @@
from .sparql_transformer import SparqlTransformer
from .rdf_transformer import ObanRdfTransformer
from .json_transformer import JsonTransformer

from .kgx import cli
104 changes: 104 additions & 0 deletions kgx/kgx.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
import kgx
import click
import os

# Maps a file extension (without the leading dot) to the transformer class
# that reads and writes that representation. CSV and TSV share one class.
_transformers = {
    'csv': kgx.PandasTransformer,
    'tsv': kgx.PandasTransformer,
    'graphml': kgx.GraphMLTransformer,
    'ttl': kgx.ObanRdfTransformer,
    'json': kgx.JsonTransformer,
    'rq': kgx.SparqlTransformer,
}

# Comma-separated list of the supported extensions, reused in help text and
# in error messages.
_file_types = ', '.join(_transformers)

class Config:
    """Mutable state shared between the command group and its subcommands."""

    def __init__(self):
        # Off by default; the top-level group overwrites this with the value
        # of its --debug flag.
        self.debug = False

# Decorator that injects the current Config object as the first argument of a
# command; ensure=True asks click to create one if none exists yet.
pass_config = click.make_pass_decorator(Config, ensure=True)

@click.group()
@click.option('--debug', is_flag=True, help='Prints the stack trace if error occurs')
@click.version_option(version=kgx.__version__, prog_name=kgx.__name__)
@pass_config
def cli(config, debug):
    """
    Knowledge Graph Exchange
    """
    # NOTE: the docstring above doubles as the --help text of the command
    # group, so it is user-facing output rather than developer documentation.
    # Record the flag on the shared Config so subcommands can decide whether
    # to surface full stack traces.
    config.debug = debug

@cli.command()
@click.option('--input-type', type=str, help='Extention type of input files: ' + _file_types)
@click.option('--output-type', type=str, help='Extention type of output files: ' + _file_types)
@click.argument('input', nargs=-1, type=click.Path(exists=False))
@click.argument('output', type=click.Path(exists=False))
@pass_config
def dump(config, input, output, input_type, output_type):
    """\b
    Transforms a knowledge graph from one representation to another
    INPUT : any number of files or endpoints
    OUTPUT : the output file
    """
    # NOTE: the docstring above is rendered by click as this command's --help
    # text (the leading \b disables re-wrapping), so it is user-facing output.
    #
    # Any failure in the conversion is either re-raised untouched (--debug,
    # so the full stack trace is printed) or converted into a ClickException
    # so the user only sees a short "Error: ..." line.
    try:
        _dump(input, output, input_type, output_type)
    except Exception as e:
        if config.debug:
            # Bare raise re-raises the active exception with its original
            # traceback.
            raise
        # ClickException takes the message as a string, not an exception
        # object; chain the cause so it is not lost.
        raise click.ClickException(str(e)) from e

def _dump(input, output, input_type, output_type):
    """
    Parse every input into one transformer and save it in the output format.

    input       : sequence of file names or endpoints to parse
    output      : path of the file to write
    input_type  : key of ``_transformers`` to read with, or None to infer it
                  from the extensions of the inputs
    output_type : key of ``_transformers`` to write with, or None to infer it
                  from the extension of ``output``

    Raises ValueError when no inputs are given and the input type must be
    inferred, when the inputs have differing extensions, or when either type
    is not a key of ``_transformers``.
    """
    if output_type is None:
        output_type = _get_type(output)

    if input_type is None:
        # The INPUT argument accepts zero values; without this guard the
        # inference below would fail with a bare IndexError.
        if not input:
            raise ValueError(
                'No input files were given, so the input type cannot be '
                'inferred. Provide at least one input file.'
            )

        input_types = [_get_type(i) for i in input]
        if len(set(input_types)) > 1:
            raise ValueError(
                'Each input file must have the same file type.\n'
                'Try setting the --input-type parameter to enforce a single\n'
                'type.'
            )
        input_type = input_types[0]

    input_transformer = _transformers.get(input_type)
    if input_transformer is None:
        raise ValueError('Input does not have a recognized type: ' + _file_types)

    # Validate the output type before parsing, so a bad --output-type is
    # reported immediately instead of after all inputs have been read.
    output_transformer = _transformers.get(output_type)
    if output_transformer is None:
        raise ValueError('Output does not have a recognized type: ' + _file_types)

    reader = input_transformer()
    for source in input:
        reader.parse(source)

    reader.report()

    # Every save() receives these; implementations that do not need them
    # accept and ignore extra keyword arguments.
    kwargs = {
        'tmp_dir': click.get_app_dir(kgx.__name__),
        'extention': output_type,
    }

    writer = output_transformer(reader)
    result_path = writer.save(output, **kwargs)

    # save() may return the path it actually wrote (which can differ from
    # `output`) or None; fall back to checking `output` itself.
    if result_path is not None and os.path.isfile(result_path):
        click.echo("File created at: " + result_path)
    elif os.path.isfile(output):
        click.echo("File created at: " + output)
    else:
        click.echo("Could not create file.")


def _get_type(filename):
    """
    Return the first key of ``_transformers`` that ``filename`` ends with
    (as ``.<key>``), or None when no known extension matches.
    """
    matches = (ext for ext in _transformers if filename.endswith('.' + ext))
    return next(matches, None)
2 changes: 1 addition & 1 deletion kgx/nx_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,5 @@ class GraphMLTransformer(NetworkxTransformer):
I/O for graphml
"""

def save(self, path):
def save(self, path, **kwargs):
    """
    Write ``self.graph`` to ``path`` using ``nx.write_graphml``.

    Extra keyword arguments are accepted but not used by this method.
    Nothing is returned.
    """
    nx.write_graphml(self.graph, path)
36 changes: 35 additions & 1 deletion kgx/pandas_transformer.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
import pandas as pd
import logging
import os
import tarfile

from .transformer import Transformer

from typing import Dict, List
Expand Down Expand Up @@ -72,8 +75,39 @@ def order_cols(self, cols: List[str]):
cols.remove(c)
return cols2 + cols

def save(self, filename: str, tmp_dir='.', extention='csv', ziptype='tar', zipmode='w', **kwargs):
    """
    Write the node set and edge set of the graph as two delimited files and
    bundle them into a tar archive.

    filename  : path of the archive; ``.<ziptype>`` is appended if missing
    tmp_dir   : directory for the intermediate ``nodes.<extention>`` and
                ``edges.<extention>`` files; created (with parents) if it
                does not exist
    extention : extension of the intermediate files; ``'tsv'`` produces
                tab-delimited output, anything else comma-delimited
    ziptype   : archive suffix, with or without the leading dot
    zipmode   : mode passed to ``tarfile.open``
    kwargs    : accepted and ignored

    The intermediate files are not deleted afterwards, and each call
    overwrites them. Returns the path of the archive that was written.
    """
    # makedirs also creates missing parent directories (os.mkdir would fail
    # on e.g. a fresh per-user application directory), and exist_ok avoids
    # the check-then-create race.
    os.makedirs(tmp_dir, exist_ok=True)

    edge_file_name = 'edges.' + extention
    node_file_name = 'nodes.' + extention

    edge_file_path = os.path.join(tmp_dir, edge_file_name)
    node_file_path = os.path.join(tmp_dir, node_file_name)

    # A file named *.tsv must actually be tab-separated.
    sep = '\t' if extention == 'tsv' else ','

    self.export_nodes().to_csv(node_file_path, sep=sep, index=False)
    self.export_edges().to_csv(edge_file_path, sep=sep, index=False)

    if not ziptype.startswith('.'):
        ziptype = '.' + ziptype

    if not filename.endswith(ziptype):
        filename += ziptype

    # arcname stores the files at the archive root rather than under tmp_dir.
    with tarfile.open(name=filename, mode=zipmode) as tar:
        tar.add(name=node_file_path, arcname=node_file_name)
        tar.add(name=edge_file_path, arcname=edge_file_name)

    return filename

def save(self, filename: str, type='n', **args):
def save_csv(self, filename: str, type='n', **args):
"""
Write a CSV/TSV

Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@ prefixcommons>=0.1.4
pip>=9.0.1
networkx==1.11
SPARQLWrapper==1.8.0
pandas>=0.0
pandas<0.21
pytest>=0.0
mypy>=0.0
pystache>=0.0
pytest_logging>=0.0
rdflib>=0.0
click==6.7
12 changes: 12 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from setuptools import setup

# Package metadata and the `kgx` console command.
setup(
name='Knowledge Graph Exchange',
version='0.0.1',
packages=['kgx'],
# NOTE(review): only Click is declared here; the other libraries pinned in
# requirements.txt are not installed by this script — confirm that installing
# requirements.txt first is the intended workflow.
install_requires=['Click'],
# Exposes the click group `cli` from the kgx package as the `kgx` executable.
entry_points="""
[console_scripts]
kgx=kgx:cli
"""
)