paper: incorporate editor comments

brocklab · Sep 30, 2024 · 26fe851 · 26fe851
1 parent 8b8e415
commit 26fe851
Show file tree

Hide file tree

Showing 2 changed files with 115 additions and 16 deletions.
diff --git a/paper/paper.bib b/paper/paper.bib
@@ -29,20 +29,23 @@ @article{chen2018
   abstract = {Quality control and preprocessing of FASTQ files are essential to providing clean data for downstream analysis. Traditionally, a different tool is used for each operation, such as quality control, adapter trimming and quality filtering. These tools are often insufficiently fast as most are developed using high-level programming languages (e.g. Python and Java) and provide limited multi-threading support. Reading and loading data multiple times also renders preprocessing slow and I/O inefficient.We developed fastp as an ultra-fast FASTQ preprocessor with useful quality control and data-filtering features. It can perform quality control, adapter trimming, quality filtering, per-read quality pruning and many other operations with a single scan of the FASTQ data. This tool is developed in C++ and has multi-threading support. Based on our evaluation, fastp is 2--5 times faster than other FASTQ preprocessing tools such as Trimmomatic or Cutadapt despite performing far more operations than similar tools.The open-source code and corresponding instructions are available at https://github.com/OpenGene/fastp.}
 }
 
-@article{dhimolea2021,
-  title = {An {{Embryonic Diapause-like Adaptation}} with {{Suppressed Myc Activity Enables Tumor Treatment Persistence}}},
-  author = {Dhimolea, Eugen and {de Matos Simoes}, Ricardo and Kansara, Dhvanir and Al'Khafaji, Aziz and Bouyssou, Juliette and Weng, Xiang and Sharma, Shruti and Raja, Joseline and Awate, Pallavi and Shirasaki, Ryosuke and Tang, Huihui and Glassner, Brian J. and Liu, Zhiyi and Gao, Dong and Bryan, Jordan and Bender, Samantha and Roth, Jennifer and Scheffer, Michal and Jeselsohn, Rinath and Gray, Nathanael S. and Georgakoudi, Irene and Vazquez, Francisca and Tsherniak, Aviad and Chen, Yu and Welm, Alana and Duy, Cihangir and Melnick, Ari and Bartholdy, Boris and Brown, Myles and Culhane, Aedin C. and Mitsiades, Constantine S.},
+@article{emert2021,
+  title = {Variability within Rare Cell States Enables Multiple Paths toward Drug Resistance},
+  author = {Emert, Benjamin L. and Cote, Christopher J. and Torre, Eduardo A. and Dardani, Ian P. and Jiang, Connie L. and Jain, Naveen and Shaffer, Sydney M. and Raj, Arjun},
   year = {2021},
-  month = feb,
-  journal = {Cancer Cell},
+  month = jul,
+  journal = {Nature Biotechnology},
   volume = {39},
-  number = {2},
-  pages = {240-256.e11},
-  issn = {1535-6108},
-  doi = {10.1016/j.ccell.2020.12.002},
-  urldate = {2024-04-05},
-  abstract = {Treatment-persistent residual tumors impede curative cancer therapy. To understand this cancer cell state we generated models of treatment persistence that simulate the residual tumors. We observe that treatment-persistent tumor cells in organoids, xenografts, and cancer patients adopt a distinct and reversible transcriptional program resembling that of embryonic diapause, a dormant stage of suspended development triggered by stress and associated with suppressed Myc activity and overall biosynthesis. In cancer cells, depleting Myc or inhibiting Brd4, a Myc transcriptional co-activator, attenuates drug cytotoxicity through a dormant diapause-like adaptation with reduced apoptotic priming. Conversely, inducible Myc upregulation enhances acute chemotherapeutic activity. Maintaining residual cells in dormancy after chemotherapy by inhibiting Myc activity or interfering with the diapause-like adaptation by inhibiting cyclin-dependent kinase 9 represent potential therapeutic strategies against chemotherapy-persistent tumor cells. Our study demonstrates that cancer co-opts a mechanism similar to diapause with adaptive inactivation of Myc to persist during treatment.},
-  keywords = {adaptation to stress,breast cancer,cancer,CDK9,CRISPR,diapause,drug persistence,MYC,prostate cancer,residual tumor}
+  number = {7},
+  pages = {865--876},
+  publisher = {Nature Publishing Group},
+  issn = {1546-1696},
+  doi = {10.1038/s41587-021-00837-3},
+  urldate = {2024-05-21},
+  abstract = {Molecular differences between individual cells can lead to dramatic differences in cell fate, such as death versus survival of cancer cells upon drug treatment. These originating differences remain largely hidden due to difficulties in determining precisely what variable molecular features lead to which cellular fates. Thus, we developed Rewind, a methodology that combines genetic barcoding with RNA fluorescence in situ hybridization to directly capture rare cells that give rise to cellular behaviors of interest. Applying Rewind to BRAFV600E melanoma, we trace drug-resistant cell fates back to single-cell gene expression differences in their drug-naive precursors (initial frequency of {\textasciitilde}1:1,000--1:10,000 cells) and relative persistence of MAP kinase signaling soon after drug treatment. Within this rare subpopulation, we uncover a rich substructure in which molecular differences among several distinct subpopulations predict future differences in phenotypic behavior, such as proliferative capacity of distinct resistant clones after drug treatment. Our results reveal hidden, rare-cell variability that underlies a range of latent phenotypic outcomes upon drug exposure.},
+  copyright = {2021 The Author(s), under exclusive licence to Springer Nature America, Inc.},
+  langid = {english},
+  keywords = {Cancer,Systems biology}
 }
 
 @incollection{gardner2022,
@@ -91,6 +94,23 @@ @article{gutierrez2021
   keywords = {Cancer,Chronic lymphocytic leukaemia,Synthetic biology,Tumour heterogeneity}
 }
 
+@article{holze2024,
+  title = {Analysis of Synthetic Cellular Barcodes in the Genome and Transcriptome with {{BARtab}} and {{bartools}}},
+  author = {Holze, Henrietta and Talarmain, Laure and Fennell, Katie A. and Lam, Enid Y. and Dawson, Mark A. and Vassiliadis, Dane},
+  year = {2024},
+  month = may,
+  journal = {Cell Reports Methods},
+  volume = {4},
+  number = {5},
+  publisher = {Elsevier},
+  issn = {2667-2375},
+  doi = {10.1016/j.crmeth.2024.100763},
+  urldate = {2024-05-21},
+  langid = {english},
+  pmid = {38670101},
+  keywords = {cellular barcoding,CP: Biotechnology,CP: Systems biology,lineage tracing,Nextflow pipeline,R package,single cell,spatial transcriptomics}
+}
+
 @article{johnson2020,
   title = {Integrating Transcriptomics and Bulk Time Course Data into a Mathematical Framework to Describe and Predict Therapeutic Resistance in Cancer},
   author = {Johnson, Kaitlyn E. and Howard, Grant R. and Morgan, Daylin and Brenner, Eric A. and Gardner, Andrea L. and Durrett, Russell E. and Mo, William and Al'Khafaji, Aziz and Sontag, Eduardo D. and Jarrett, Angela M. and Yankeelov, Thomas E. and Brock, Amy},
@@ -108,6 +128,42 @@ @article{johnson2020
   langid = {english}
 }
 
+@article{kebschull2015,
+  title = {Sources of {{PCR-induced}} Distortions in High-Throughput Sequencing Data Sets},
+  author = {Kebschull, Justus M. and Zador, Anthony M.},
+  year = {2015},
+  month = dec,
+  journal = {Nucleic Acids Research},
+  volume = {43},
+  number = {21},
+  pages = {e143},
+  issn = {1362-4962},
+  doi = {10.1093/nar/gkv717},
+  abstract = {PCR permits the exponential and sequence-specific amplification of DNA, even from minute starting quantities. PCR is a fundamental step in preparing DNA samples for high-throughput sequencing. However, there are errors associated with PCR-mediated amplification. Here we examine the effects of four important sources of error---bias, stochasticity, template switches and polymerase errors---on sequence representation in low-input next-generation sequencing libraries. We designed a pool of diverse PCR amplicons with a defined structure, and then used Illumina sequencing to search for signatures of each process. We further developed quantitative models for each process, and compared predictions of these models to our experimental data. We find that PCR stochasticity is the major force skewing sequence representation after amplification of a pool of unique DNA amplicons. Polymerase errors become very common in later cycles of PCR but have little impact on the overall sequence distribution as they are confined to small copy numbers. PCR template switches are rare and confined to low copy numbers. Our results provide a theoretical basis for removing distortions from high-throughput sequencing data. In addition, our findings on PCR stochasticity will have particular relevance to quantification of results from single cell sequencing, in which sequences are represented by only one or a few molecules.},
+  langid = {english},
+  pmcid = {PMC4666380},
+  pmid = {26187991},
+  keywords = {Base Composition,DNA,DNA-Directed DNA Polymerase,High-Throughput Nucleotide Sequencing,Polymerase Chain Reaction,Sequence Analysis DNA,Stochastic Processes,Templates Genetic}
+}
+
+@article{manley2016,
+  title = {Monitoring {{Error Rates In Illumina Sequencing}}},
+  author = {Manley, Leigh J. and Ma, Duanduan and Levine, Stuart S.},
+  year = {2016},
+  month = dec,
+  journal = {Journal of Biomolecular Techniques: JBT},
+  volume = {27},
+  number = {4},
+  pages = {125--128},
+  issn = {1943-4731},
+  doi = {10.7171/jbt.16-2704-002},
+  abstract = {Guaranteeing high-quality next-generation sequencing data in a rapidly changing environment is an ongoing challenge. The introduction of the Illumina NextSeq 500 and the depreciation of specific metrics from Illumina's Sequencing Analysis Viewer (SAV; Illumina, San Diego, CA, USA) have made it more difficult to determine directly the baseline error rate of sequencing runs. To improve our ability to measure base quality, we have created an open-source tool to construct the Percent Perfect Reads (PPR) plot, previously provided by the Illumina sequencers. The PPR program is compatible with HiSeq 2000/2500, MiSeq, and NextSeq 500 instruments and provides an alternative to Illumina's quality value (Q) scores for determining run quality. Whereas Q scores are representative of run quality, they are often overestimated and are sourced from different look-up tables for each platform. The PPR's unique capabilities as a cross-instrument comparison device, as a troubleshooting tool, and as a tool for monitoring instrument performance can provide an increase in clarity over SAV metrics that is often crucial for maintaining instrument health. These capabilities are highlighted.},
+  langid = {english},
+  pmcid = {PMC5026502},
+  pmid = {27672352},
+  keywords = {Algorithms,Base Sequence,bioinformatics,Diagnostic Errors,genomics,high-throughput DNA,High-Throughput Nucleotide Sequencing,Humans,Molecular Diagnostic Techniques,Sequence Analysis DNA,Software}
+}
+
 @article{martin2011,
   title = {Cutadapt Removes Adapter Sequences from High-Throughput Sequencing Reads},
   author = {Martin, Marcel},
@@ -126,6 +182,41 @@ @article{martin2011
   keywords = {adapter removal,microRNA,next generation sequencing,small RNA}
 }
 
+@article{potapov2017,
+  author = {Potapov, Vladimir and Ong, Jennifer L.},
+  title = {Examining {{Sources}} of {{Error}} in {{PCR}} by {{Single-Molecule Sequencing}}},
+  journal = {PLOS ONE},
+  publisher = {Public Library of Science},
+  year = {2017},
+  month = jan,
+  volume = {12},
+  number = {1},
+  pages = {e0169774},
+  issn = {1932-6203},
+  doi = {10.1371/journal.pone.0169774},
+  urldate = {2024-05-24},
+  langid = {english},
+  keywords = {DNA damage,DNA polymerase,DNA replication,DNA sequencing,Next-generation sequencing,Polymerase chain reaction,Polymerases,Substitution mutation},
+  abstract = {Next-generation sequencing technology has enabled the detection of rare genetic or somatic mutations and contributed to our understanding of disease progression and evolution. However, many next-generation sequencing technologies first rely on DNA amplification, via the Polymerase Chain Reaction (PCR), as part of sample preparation workflows. Mistakes made during PCR appear in sequencing data and contribute to false mutations that can ultimately confound genetic analysis. In this report, a single-molecule sequencing assay was used to comprehensively catalog the different types of errors introduced during PCR, including polymerase misincorporation, structure-induced template-switching, PCR-mediated recombination and DNA damage. In addition to well-characterized polymerase base substitution errors, other sources of error were found to be equally prevalent. PCR-mediated recombination by Taq polymerase was observed at the single-molecule level, and surprisingly found to occur as frequently as polymerase base substitution errors, suggesting it may be an underappreciated source of error for multiplex amplification reactions. Inverted repeat structural elements in lacZ caused polymerase template-switching between the top and bottom strands during replication and the frequency of these events were measured for different polymerases. For very accurate polymerases, DNA damage introduced during temperature cycling, and not polymerase base substitution errors, appeared to be the major contributor toward mutations occurring in amplification products. In total, we analyzed PCR products at the single-molecule level and present here a more complete picture of the types of mistakes that occur during DNA amplification.}
+}
+
+@misc{putri2023,
+  title = {Extraction and Quantification of Lineage-Tracing Barcodes with {{NextClone}} and {{CloneDetective}}},
+  author = {Putri, Givanna H. and Pires, Nichelle and Davidson, Nadia M. and Blyth, Catherine and Al'Khafaji, Aziz M. and Goel, Shom and Phipson, Belinda},
+  year = {2023},
+  month = nov,
+  primaryclass = {New Results},
+  pages = {2023.11.19.567755},
+  publisher = {bioRxiv},
+  doi = {10.1101/2023.11.19.567755},
+  urldate = {2024-05-21},
+  abstract = {Summary: The study of clonal dynamics has significantly advanced our understanding of cellular heterogeneity and lineage trajectories. With recent developments in lineage-tracing protocols such as ClonMapper or SPLINTR, which combine DNA barcoding with single-cell RNA sequencing (scRNA-seq), biologists can trace the lineage and evolutionary paths of individual clones while simultaneously observing their transcriptomic changes over time. Here, we present NextClone and CloneDetective, an integrated highly scalable Nextflow pipeline and R package for efficient extraction and quantification of clonal barcodes from scRNA-seq data and DNA sequencing data tagged with lineage-tracing barcodes. We applied both NextClone and CloneDetective to data from a barcoded MCF7 cell line and demonstrate their utility for advancing clonal analysis in the era of high-throughput sequencing. Availability and implementation: NextClone and CloneDetective are freely available and open-source on github (https://github.com/phipsonlab/NextClone and https://github.com/phipsonlab/CloneDetective). Documentations and tutorials for NextClone and CloneDetective can be found at https://phipsonlab.github.io/NextClone/ and https://phipsonlab.github.io/CloneDetective/ respectively.},
+  archiveprefix = {bioRxiv},
+  chapter = {New Results},
+  copyright = {{\copyright} 2023, Posted by Cold Spring Harbor Laboratory. This pre-print is available under a Creative Commons License (Attribution-NonCommercial-NoDerivs 4.0 International), CC BY-NC-ND 4.0, as described at http://creativecommons.org/licenses/by-nc-nd/4.0/},
+  langid = {english}
+}
+
 @misc{pysam2024,
   title = {Pysam-Developers/Pysam},
   year = {2024},

diff --git a/paper/paper.md b/paper/paper.md
@@ -36,9 +36,17 @@ DNA sequencing and DNA barcoding specifically, have become
 more common as a modality for the characterization of clonal and lineage-specific subpopulations of cells.
 As researchers leverage these technologies, they'll require tools that are easy to set up
 and use to facilitate downstream biological analysis.
-DNA barcode sequencing data is prone to noise from both preparation
-and sequencing that requires identification and correction prior to statistical testing.
-`Pycashier` fills this gap while also providing a simple-to-use interface.
+DNA barcode sequencing suffers from several sources of noise that must be accounted for prior to statistical analysis.
+This noise can arise in typical Polymerase Chain Reaction (PCR) preparation [@kebschull2015;@potapov2017] or during read-out [@manley2016].
+Historically, the analysis of DNA barcoding has relied on tailored computational workflows,
+such as TimeMachine [@emert2021], that are difficult to parameterize
+or extend to similarly designed DNA barcoding systems.
+Recently, several Nextflow-based tools have been developed,
+such as BARtab [@holze2024] and NextClone [@putri2023],
+which offer improved support for generalized approaches to processing barcode sequencing data.
+However, they require familiarity with Nextflow, which may be uncommon for experimentalists.
+`Pycashier` aims to be simple to install and generalizable enough to be useful to the broader community
+while also providing a user-friendly interface.
 
 # Implementation and Usage
 
@@ -52,7 +60,7 @@ The `pycashier` CLI has four subcommands to facilitate processing of DNA barcode
 Users can specify parameters either through command-line flags or through a `toml` file.
 
 `Pycashier` is primarily used for generating counts of individual barcode sequences
-from targeted PCR (Polymerase Chain Reaction) amplifications of DNA-barcoded cells.
+from targeted PCR amplifications of DNA-barcoded cells.
 This is done with `pycashier extract`, which accepts a directory of
 `fastq` files directly from Illumina sequencing and generates
 a `tsv` of individual barcodes and counts for each input `fastq`.