Skip to content

Commit

Permalink
Merge pull request #1034 from greenelab/ms-stats-extres
Browse files Browse the repository at this point in the history
#1019 for plotting project growth, but on external-resources branch
  • Loading branch information
agitter authored Sep 13, 2021
2 parents 9604975 + bb4e509 commit 2c85371
Show file tree
Hide file tree
Showing 8 changed files with 2,611 additions and 0 deletions.
134 changes: 134 additions & 0 deletions analyze-ms-stats/calc-manuscript-stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
import json
import pandas as pd
import matplotlib
import argparse
import time
from pathlib import Path
import multiprocessing
import subprocess

def analyze_commit(commit):
"""Access files and data in variables.json associated with each commit
Accepts commit ID as string
Returns list of 5 statistics"""
variablesCommand = "git show " + commit + ":./variables.json"
try:
variables = json.loads(subprocess.getoutput(variablesCommand))
except json.decoder.JSONDecodeError:
exit(commit + " not found")

date = variables['pandoc']['date-meta']
clean_date = variables['manubot']['date']
num_authors = len(variables['manubot']['authors'])
word_count = variables['manubot']['manuscript_stats']['word_count']

# Access files and data in references.json associated with each commit
referencesCommand = "git show " + commit + ":./references.json"
try:
references = json.loads(subprocess.getoutput(referencesCommand))
except json.decoder.JSONDecodeError:
exit(commit + "not found")
num_ref = len(references)

return ({"stats_date": date,
"stats_clean_date": clean_date,
"stats_num_authors": num_authors,
"stats_num_words": word_count,
"stats_num_references": num_ref})

def main(args):
'''Extract statistics from the output branch log'''

print("Using {0} CPUs".format(multiprocessing.cpu_count()))

# Read in list of all commits on this branch
with open(args.commit_list, "r") as commitFile:
commits = [c.strip() for c in commitFile.read().splitlines()]

# If this analysis has been run before, load in the list of commits analyzed
# and only analyze new commits
# Assumes no commits will be added retrospectively (to take advantage of linearity)
priorData = None
if Path(args.output_table).is_file():
priorData = pd.read_csv(args.output_table)
oldCommits = priorData["commit"].tolist()
priorData = priorData.set_index("commit")

if len(commits) > len(oldCommits):
start_old = commits.index(oldCommits[0])
commits = commits[:start_old]
print("{0} new commits".format(len(commits)))
else:
exit("No new commits")

# Access the variables.json and references.json files associated with each commit and store in dictionary
with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
commitData = dict(zip(commits, pool.map(analyze_commit, commits)))
pool.close()
pool.join()

# Turn commitData to df, then flip to be in chronological order
growthData = pd.DataFrame.from_dict(commitData, orient="index")
growthData = growthData.rename(columns={"stats_date": "Date",
"stats_clean_date": "Clean_date",
"stats_num_authors": "Authors",
"stats_num_words": "Word Count",
"stats_num_references": "References"})
# Append onto table of previous commit data, if this exists
if priorData is not None:
growthData = growthData.append(priorData)

# Cache commit data for future updates
growthData.to_csv(args.output_table, index_label="commit")
print('Wrote {}'.format(args.output_table))

# Prepare data to graph
graphData = growthData.set_index("Date")
graphData = graphData[::-1]

# Plot the data
axes = graphData.plot(kind='line', linewidth=2, subplots=True)
for ax in axes:
ax.get_yaxis().set_major_formatter(matplotlib.ticker.FuncFormatter(
lambda x, p: format(int(x), ',')))
ax.set_ylabel('Count')
ax.set_ylim(bottom=0)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.minorticks_off()
ax.grid(color="lightgray")

ax.figure.savefig(args.output_figure + '.png', dpi=300, bbox_inches="tight")
ax.figure.savefig(args.output_figure + '.svg', bbox_inches="tight")

print('Wrote {0}.png and {1}.svg'.format(args.output_figure, args.output_figure))

# Write json output file
manuscript_stats = commitData[commits[0]]
for item in ["stats_num_authors", "stats_num_words", "stats_num_references"]:
manuscript_stats[item] = str(manuscript_stats[item])
with open(args.output_json, 'w') as out_file:
json.dump(manuscript_stats, out_file, indent=2, sort_keys=True)
print('Wrote {0}'.format(args.output_json))

if __name__ == '__main__':
parser = argparse.ArgumentParser(
description=__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('commit_list',
help='File containing a list of all commits on output branch, one per line',
type=str)
parser.add_argument('output_json',
help='Path of the JSON file with extracted statistics',
type=str)
parser.add_argument('output_figure',
help='Path of the output figure for manuscript ' \
'statistics without file type extension. Will be saved ' \
'as .png and .svg.',
type=str)
parser.add_argument('output_table',
help='Path of the output table used to generate ' \
'figures ',
type=str)
args = parser.parse_args()
main(args)
22 changes: 22 additions & 0 deletions analyze-ms-stats/calc-manuscript-stats.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/bin/bash
set -e

# Calculate growth statistics for manuscript based on Manubot files

# Generate list of all commits in history of output branch
echo "Generate log for output branch"
git log --pretty=format:"%h" --first-parent output > analyze-ms-stats/output-commits.txt

# Define input and output files
COMMIT_LIST=analyze-ms-stats/output-commits.txt
OUTPUT_JSON=analyze-ms-stats/manuscript_stats.json
OUTPUT_FIG=analyze-ms-stats/manuscript_stats
OUTPUT_TABLE=analyze-ms-stats/commitData.csv

# Run python script
echo "Run python script to analyze manuscript growth"
python analyze-ms-stats/calc-manuscript-stats.py $COMMIT_LIST $OUTPUT_JSON $OUTPUT_FIG $OUTPUT_TABLE

# Clean up temporary files
echo "Clean up temporary files"
rm analyze-ms-stats/output-commits.txt
Loading

0 comments on commit 2c85371

Please sign in to comment.