Skip to content

Commit

Permalink
add options for histogram; support more input format
Browse files Browse the repository at this point in the history
  • Loading branch information
zhangrengang committed Sep 25, 2024
1 parent e704d6d commit 574ccb4
Show file tree
Hide file tree
Showing 6 changed files with 145 additions and 21 deletions.
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,10 +65,12 @@ soi dotplot -s Populus_trichocarpa-Salix_dunnii.collinearity.gz \
# filter orthologous synteny
soi filter -s Populus_trichocarpa-Salix_dunnii.collinearity.gz -o OrthoFinder/OrthoFinder/Results_*/ \
-c 0.6 > Populus_trichocarpa-Salix_dunnii.collinearity.ortho
-c 0.6 > Populus_trichocarpa-Salix_dunnii.collinearity.ortho.test
# or (alter input format)
soi filter -s Populus_trichocarpa-Salix_dunnii.collinearity.gz -o Populus_trichocarpa-Salix_dunnii.orthologs.gz \
-c 0.6 > Populus_trichocarpa-Salix_dunnii.collinearity.ortho
-c 0.6 > Populus_trichocarpa-Salix_dunnii.collinearity.ortho.test
# compare with the expected output: no output via `diff`
diff Populus_trichocarpa-Salix_dunnii.collinearity.ortho Populus_trichocarpa-Salix_dunnii.collinearity.ortho.test
```
### Example output dot plots ###
Expand Down
7 changes: 4 additions & 3 deletions example_data/example.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,9 @@ soi dotplot -s Populus_trichocarpa-Salix_dunnii.collinearity.gz \

# filter
soi filter -s Populus_trichocarpa-Salix_dunnii.collinearity.gz -o OrthoFinder/OrthoFinder/Results_*/ \
-c 0.6 > Populus_trichocarpa-Salix_dunnii.collinearity.ortho
-c 0.6 > Populus_trichocarpa-Salix_dunnii.collinearity.ortho.test
# or (alter input format)
soi filter -s Populus_trichocarpa-Salix_dunnii.collinearity.gz -o Populus_trichocarpa-Salix_dunnii.orthologs.gz \
-c 0.6 > Populus_trichocarpa-Salix_dunnii.collinearity.ortho

-c 0.6 > Populus_trichocarpa-Salix_dunnii.collinearity.ortho.test
# compare with the expected output
diff Populus_trichocarpa-Salix_dunnii.collinearity.ortho Populus_trichocarpa-Salix_dunnii.collinearity.ortho.test
60 changes: 59 additions & 1 deletion soi/OrthoFinder.py
Original file line number Diff line number Diff line change
Expand Up @@ -627,7 +627,63 @@ def mean_dist(inner_g1, inner_g2, g1_idx, g2_idx):
print('\t'.join(line), file=fout)
for line in sorted(lines):
print('\t'.join(map(str, line)), file=fout)

class SonicParanoid:
def __init__(self, ResultsDir):
self.ResultsDir = ResultsDir
self.Orthogroups = '{}/ortholog_groups/ortholog_groups.tsv'.format(ResultsDir)
self.Orthologues = glob.glob('{}/species_to_species_orthologs/*/*'.format(ResultsDir))
def get_homologs(self, sps=None, sp1=None, sp2=None, **kargs):
'''获取成对的orthologs'''
if sp1 is not None or sp2 is not None:
sps = {sp1, sp2}
if sps is not None:
orthoFiles = []
for sp1, sp2 in itertools.permutations(sps, 2):
orthoFiles += ['{}/species_to_species_orthologs/{}/{}-{}'.format(self.ResultsDir, sp1, sp1, sp2)]
else:
orthoFiles = self.Orthologues
ortho_pairs = set([])
idx = 0
for orthoFile in orthoFiles:
idx += 1
for line in SonicParanoidOrthologs(orthoFile):
Genes_1 = line.OrthoA
Genes_2 = line.OrthoB
for g1, g2 in itertools.product(Genes_1, Genes_2):
if (g2, g1) in ortho_pairs:
continue
ortho_pairs.add( (g1, g2) ) #orthologs
for Genes in [Genes_1, Genes_2]:
if len(Genes) < 2:
continue
for g1, g2 in itertools.combinations(Genes, 2):
if (g2, g1) in ortho_pairs:
continue
ortho_pairs.add( (g1, g2) ) # inparalogs
return ortho_pairs
def test_sonicparanoid(ResultsDir=sys.argv[2]):
for g1, g2 in SonicParanoid(ResultsDir).get_homologs():
print(g1, g2)
class SonicParanoidOrthologs:
def __init__(self, orthologs):
self.orthologs = orthologs
def __iter__(self):
return self._parse()
def _parse(self):
i = 0
for line in open(self.orthologs):
i += 1
if i == 1:
continue
yield SonicParanoidOrthologsLine(line)
class SonicParanoidOrthologsLine:
def __init__(self, line):
Size, Relations, OrthoA, OrthoB = line.strip().split('\t')
self.OrthoA = self.parse_ortho(OrthoA)
self.OrthoB = self.parse_ortho(OrthoB)
def parse_ortho(self, Ortho):
return [val for i, val in enumerate(Ortho.split()) if i %2 ==0]

class OrthoFinder:
def __init__(self, ResultsDir):
self.ResultsDir = ResultsDir
Expand Down Expand Up @@ -2301,6 +2357,8 @@ def main():
try: species = sys.argv[3]
except IndexError: species=None
OrthoFinder(OFdir).to_wgdi(parse_species(species), **kargs)
elif subcommand == 'test_spd':
test_sonicparanoid()
else:
raise ValueError('Unknown command: {}'.format(subcommand))

Expand Down
22 changes: 16 additions & 6 deletions soi/dot_plotter.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def dotplot_args(parser):

group_dot.add_argument('--xlabel', type=str, default=None, help="x label for dot plot. [default=%(default)s]")
group_dot.add_argument('--ylabel', type=str, default=None, help="y label for dot plot. [default=%(default)s]")
group_dot.add_argument('--figsize', metavar='NUM', type=float, nargs='+', default=[18], help="figure size (width [height]) [default=%(default)s]")
group_dot.add_argument('--figsize', metavar='NUM', type=float, nargs='+', default=[16], help="figure size (width [height]) [default=%(default)s]")
group_dot.add_argument('--fontsize', metavar='NUM', type=float, default=10, help="font size of chromosome labels [default=%(default)s]")
group_dot.add_argument('--dotsize', metavar='NUM', type=float, default=1, dest='point_size', help="dot size [default=%(default)s]")

Expand All @@ -73,6 +73,7 @@ def dotplot_args(parser):
group_ks.add_argument('--method', metavar='STR', type=str, default='NG86', help='Ks calculation method [default=%(default)s]')
group_ks.add_argument('--lower-ks', metavar='Ks', type=float, default=None, help="lower limit of median Ks. [default=%(default)s]")
group_ks.add_argument('--upper-ks', metavar='Ks', type=float, default=None, help="upper limit of median Ks. [default=%(default)s]")
group_ks.add_argument('--output-hist', action='store_true', default=False, help="output the data for histogram plot. [default=%(default)s]")
# group_ks.add_argument('--clip-ks', action='store_true', default=None, help="clip ks > max-ks. [default=%(default)s]")
# group_ks.add_argument('--hist-ylim', type=float, default=None, help="max y axis of Ks histgram. [default=%(default)s]")
# group_ks.add_argument('--yn00', action='store_true', default=False, help='turn to YN00[default=%(default)s]')
Expand Down Expand Up @@ -203,7 +204,7 @@ def _remove_prefix(labels):
return labels
def plot_blocks(blocks, outplots, ks=None, max_ks=None, ks_hist=False, ks_cmap=None, clip_ks=None, min_block=None, ks_step=0.02,
xlabels=None, ylabels=None, xpositions=None, ypositions=None, xelines=None, yelines=None, xlim=None, ylim=None,
figsize=18, fontsize=10, point_size=0.8, xclines=None, yclines=None, plot_bin=None,
figsize=18, fontsize=10, point_size=0.8, xclines=None, yclines=None, plot_bin=None, output_hist=False,
xoffset=None, yoffset=None, xbars=None, ybars=None, gff=None, gene_axis=None, xbarlab=True, ybarlab=True,
hist_ylim=None, xlabel=None, ylabel=None, remove_prefix=True, number_plots=True, same_sp=False,
ploidy=False, ploidy_data=None, ortholog_graph=None, of_color=False, homology=False, **kargs
Expand Down Expand Up @@ -377,12 +378,16 @@ def plot_blocks(blocks, outplots, ks=None, max_ks=None, ks_hist=False, ks_cmap=N
ax = plt.subplot2grid((6,5),(5,0), colspan=3)
else:
ax = plt.subplot2grid((6,5),(5,0), colspan=5)
bins = int(max_ks/ks_step)
bins = int((min(max_ks, max(allKs)) - max(0, min(allKs)))/ks_step)
_xlabel = tlabel #'OrthoIndex' if of_color else 'Ks'
#print min(allKs), max(allKs)
# ylabel = ' of syntenic gene pairs' if homology else ' of syntenic gene pairs'
_ylabel = ' of gene pairs'
_histgram(ax, allKs, cmap=cmap, xlim = max_ks, ylim=hist_ylim, bins=bins, normed=False, xlabel=_xlabel, ylabel=_ylabel, fontsize=xcsize)
if output_hist:
output_hist = os.path.splitext(outplots[0])[0] + '.histo'
logger.info('Output histogram data: {}'.format(output_hist))
_histgram(ax, allKs, cmap=cmap, xlim = max_ks, ylim=hist_ylim, bins=bins, normed=False, xlabel=_xlabel,
ylabel=_ylabel, output_hist=output_hist, fontsize=xcsize)
label = '(b)'
if number_plots:
plot_label(ax, label, fontsize=lsize)
Expand Down Expand Up @@ -466,7 +471,7 @@ def plot_fold(ax, titles, ref_coord_paths, ref_coord_graph, qry_coord_graph, rq_
data = [np.array(sorted(d_fold.items()))]
#print >>sys.stderr, kargs
plot_bars(data, titles, ax=ax, ncol=1, nrow=1, **kargs)
def _histgram(ax, allKs, cmap=None, xlim=None, ylim=None, bins=100, normed=False, xlabel='Ks', ylabel=' of syntenic gene pairs', fontsize=10):
def _histgram(ax, allKs, cmap=None, xlim=None, ylim=None, bins=100, normed=False, xlabel='Ks', ylabel=' of syntenic gene pairs', fontsize=None, output_hist=False):
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib
Expand All @@ -487,6 +492,10 @@ def _histgram(ax, allKs, cmap=None, xlim=None, ylim=None, bins=100, normed=False
if X <= xlim:
Xs.append((bins[i] + bins[i+1])/2)
Ys.append(n[i])
if output_hist:
with open(output_hist, 'w') as f:
for dat in [Xs, Ys]:
print('\t'.join(map(str, dat)), file=f)
if ylim is None:
ylim = 1.2*max(Ys[:-1])
line = ax.plot(Xs, Ys, ls='--', c='grey')
Expand All @@ -498,7 +507,8 @@ def _histgram(ax, allKs, cmap=None, xlim=None, ylim=None, bins=100, normed=False
ax.set_xlim(0, xlim)
ax.set_ylim(0, ylim)
ax.set_xlabel(xlabel, fontsize=fontsize*1.1) # Ks/OrthoIndex; fontsize
ax.set_ylabel(ylabel, fontsize=fontsize*0.8)
# ax.set_ylabel(ylabel, fontsize=fontsize*0.8)
ax.set_ylabel(ylabel)
ax.minorticks_on()
cbar = plt.colorbar(ax=ax)
return xlim, ylim
Expand Down
68 changes: 60 additions & 8 deletions soi/mcscan.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from Bio import SeqIO, Phylo
from lazy_property import LazyWritableProperty as lazyproperty

from .OrthoFinder import catAln, format_id_for_iqtree, OrthoMCLGroup, OrthoMCLGroupRecord, OrthoFinder, parse_species, exists_and_size_gt_zero
from .OrthoFinder import catAln, format_id_for_iqtree, OrthoMCLGroup, OrthoMCLGroupRecord, OrthoFinder,SonicParanoid, parse_species, exists_and_size_gt_zero
from .small_tools import mkdirs, flatten, test_s, test_f, parse_kargs, rmdirs, lazy_decode
from .RunCmdsMP import run_cmd, run_job, logger
from .small_tools import open_file as open
Expand Down Expand Up @@ -50,16 +50,19 @@ def ichr(self):
return int(re.compile(r'[^\d\s]+(\d+)').match(self.chr).groups()[0])

class KaKs():
def __init__(self, info, fdtv=False, yn00=False, wgdi=False, method='NG86', **kargs):
def __init__(self, info, kaks=False, fdtv=False, yn00=False, wgdi=False, method='NG86', **kargs):
self.info = info
if yn00:
self.parse_yn00(method=method)
elif wgdi:
self.parse_wgdi(method=method)
elif fdtv:
self.parse_4dtv()
else:
elif kaks:
self.parse_ks()
else:
print('unrecognized Ks format')
#print(vars())
self.parse_pair()
def parse_4dtv(self):
(Sequence, fD_Sites, Identical_Sites, TS_Sites, TV_Sites, fDS, fDTS, fDTV, Corrected_4DTV) = self.info
Expand Down Expand Up @@ -125,13 +128,16 @@ def __init__(self, kaks, **kargs):
def __iter__(self):
return self._parse()
def _parse(self):
self.kargs['wgdi'] = True
for line in open(self.kaks):
line = lazy_decode(line)
temp = line.rstrip().split()
# print >> sys.stderr, temp
#print(temp)
if temp[0] in {'Sequence', 'id1'}:
if temp[1] == 'dS-YN00':
if temp[1] == 'Method':
self.kargs['kaks'] = True
elif temp[1] == 'dS-YN00':
self.kargs['yn00'] = True
elif temp[2] == 'ka_NG86':
self.kargs['wgdi'] = True
Expand Down Expand Up @@ -222,8 +228,9 @@ def _parse_list(self, _collinearities):
def _parse(self):
if self.orthologs is not None:
ortholog_pairs = set(XOrthology(self.orthologs, **self.kargs))
#logger.info(list(ortholog_pairs)[:10])
logger.info('\t{} homologous pairs'.format(len(ortholog_pairs)))
logger.info('parsing {} collinearity files: {}...'.format(
logger.info('parsing {} collinearity files: {} ...'.format(
len(self.collinearities), self.collinearities[:3]))
nblock, ngene = 0, 0
for collinearity in self.collinearities:
Expand All @@ -244,7 +251,40 @@ def _parse(self):
rc.ton = len(ortholog_pairs) # all syntenic orthologs
rc.ortholog_pairs = ortholog_pairs
yield rc
#logger.info(list(pairs)[:10])
logger.info('\t{} collinearity blocks, {} collinearity genes'.format(nblock, ngene))
class Xpairs:
def __init__(self, ortholog):
self.ortholog = ortholog
def __iter__(self):
return self._parse()
def _parse(self):
if os.path.isdir(self.ortholog): # orthofinder
for pair in OrthoFinder(self.ortholog).get_homologs(**self.kargs):
yield Pair(*pair)
else: # collinearity & homolog pairs
for rc in Collinearity(self.ortholog):
for pair in rc.pairs:
yield Pair(*pair)

def evaluate_orthology(ref, qry):
ref_pairs = set(Xpairs(ref))
AP = len(ref_pairs)
logger.info('{} pairs in {}'.format(AP, ref))
qry_pairs = set(Xpairs(qry))
AD = len(qry_pairs)
logger.info('{} pairs in {}'.format(AD, qry))
TP = len(qry_pairs & ref_pairs)
FP = AD - TP
FN = AP - TP
precision = 1.0 * TP/ (TP+FP)
recall = 1.0 * TP/ (TP+FN)
f1_score = 2* precision * recall / (precision + recall)
logger.info('TP: {}; FP: {}; FN: {}'.format(TP, FP, FN))
logger.info('Precision: {}; Recall: {}; F1 Score: {}'.format(precision, recall, f1_score))
line = [qry, precision, recall, f1_score]
print('\t'.join(map(str, line)))

class XOrthology:
def __init__(self, orthologs, **kargs):
self.orthologs = self._parse_list(orthologs)
Expand All @@ -259,9 +299,10 @@ def _parse_list(self, _orthologs):
return _orthologs
def _parse(self):
for ortholog in self.orthologs:
logger.info('parsing {}...'.format(ortholog))
logger.info('parsing {} ...'.format(ortholog))
if os.path.isdir(ortholog): # orthofinder
for pair in OrthoFinder(ortholog).get_homologs(**self.kargs):
parser = SonicParanoid if os.path.isdir(ortholog+'/species_to_species_orthologs') else OrthoFinder
for pair in parser(ortholog).get_homologs(**self.kargs):
yield Pair(*pair)
else: # orthomcl or similar format
for rc in Pairs(ortholog, parser=Pair):
Expand Down Expand Up @@ -326,7 +367,8 @@ def identify_orthologous_blocks(collinearities=None, orthologs=None, fout=sys.st
pre_nb, pre_ng, post_nb, 1.0*post_nb/pre_nb, post_ng, 1.0*post_ng/pre_ng))
logger.info('Orthology: Pre-filter: {} pairs; Post-filter: {} ({:.1%}) pairs.'.format(
rc.ton, post_no, 1.0*post_no/rc.ton))
logger.info('Post-filter mean OrthoIndex: {:.2f}'.format(total_oi/post_ng))
if post_ng > 0:
logger.info('Post-filter mean OrthoIndex: {:.2f}'.format(total_oi/post_ng))
if homo_class is not None:
out_class.close()
if out_stats is None:
Expand Down Expand Up @@ -374,6 +416,11 @@ def write(self, f, ):
f.write(self.header)
f.write(self.block)
def parse(self):
start = lazy_decode(open(self.collinearity).read(1))
if start != '#' and not self.homology:
self.homology = True
#logger.info(lazy_decode(open(self.collinearity).read(10)))
#logger.info('self.homology: {}'.format( self.homology))
if not self.homology:
lines = []
head = []
Expand Down Expand Up @@ -1057,6 +1104,8 @@ def __str__(self):
return '{}-{}'.format(*self.pair)
def __format__(self):
return str(self)
def __repr__(self):
return str(self)
@property
def key(self):
return tuple(sorted(self.pair))
Expand Down Expand Up @@ -3038,6 +3087,9 @@ def main():
elif subcmd == 'get_ks':
ksfile = sys.argv[2]
get_ks(ksfile, pairfile=sys.stdin, outks=sys.stdout, outpair=sys.stderr, **kargs)
elif subcmd == 'eval_ortho':
ref, qry = sys.argv[2:4]
evaluate_orthology(ref, qry)
else:
raise ValueError('Unknown sub command: {}'.format(subcmd))
if __name__ == '__main__':
Expand Down
3 changes: 2 additions & 1 deletion soi/small_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,8 @@ def combine_tabs_2xls(table_files, xls_file = None, sheets = None):
j += 1
i += 1
wb.save(xls_file)

def get_suffix(infile):
return os.path.splitext(infile)[-1]
def open_file(infile, mode='r'):
suffix = os.path.splitext(infile)[-1]
if suffix == '.gz':
Expand Down

0 comments on commit 574ccb4

Please sign in to comment.