From bf0cb944240967cde43d3ce846d685e6f0ac0156 Mon Sep 17 00:00:00 2001
From: Michael L Heuer <heuermh@acm.org>
Date: Fri, 27 May 2016 16:43:56 -0500
Subject: [PATCH] Add sequence, slice, and read schema

---
 src/main/resources/avro/bdg.avdl | 217 +++++++++++++++++++++++--------
 1 file changed, 164 insertions(+), 53 deletions(-)
diff --git a/src/main/resources/avro/bdg.avdl b/src/main/resources/avro/bdg.avdl
index b2edba0..fd25661 100644
--- a/src/main/resources/avro/bdg.avdl
+++ b/src/main/resources/avro/bdg.avdl
@@ -22,8 +22,6 @@ protocol BDG {
 /**
  Record for describing a reference assembly. Not used for storing the contents
  of said assembly.
-
- @see NucleotideContigFragment
  */
 record Contig {
   /**
@@ -328,55 +326,6 @@ enum Base {
   D  // not C
 }
 
-/**
- Stores a contig of nucleotides; this may be a reference chromosome, may be an
- assembly, may be a BAC. Very long contigs (>1Mbp) need to be split into fragments.
- It seems that they are too long to load in a single go. For best performance,
- it seems like 10kbp is a good point at which to start splitting contigs into
- fragments.
- */
-record NucleotideContigFragment {
-  /**
-   The contig identification descriptor for this contig.
-   */
-  union { null, Contig } contig = null;
-  /**
-   A description for this contig. When importing from FASTA, the FASTA header
-   description line should be stored here.
-   */
-  union { null, string } description = null;
-  /**
-   The sequence of bases in this fragment.
-   */
-  union { null, string } fragmentSequence = null;
-  /**
-   In a fragmented contig, the position of this fragment in the set of fragments.
-   Can be null if the contig is not fragmented.
-   */
-  union { null, int } fragmentNumber = null;
-  /**
-   The position of the first base of this fragment in the overall contig. E.g.,
-   if all fragments are 10kbp and this is the third fragment in the contig,
-   the start position would be 20000L.
-   */
-  union { null, long } fragmentStartPosition = null;
-  /**
-   The position of the last base of this fragment in the overall contig. E.g.,
-   if all fragments are 10kbp and this is the third fragment in the contig,
-   the end position would be 29999L.
-   */
-  union { null, long } fragmentEndPosition = null;
-  /**
-   The length of this fragment.
-   */
-  union { null, long } fragmentLength = null;
-  /**
-   The total count of fragments that this contig has been broken into. Can be
-   null if the contig is not fragmented.
-   */
-  union { null, int } numberOfFragmentsInContig = null; // total number of fragments in contig
-}
-
 /**
  Descriptors for the type of a structural variant. The most specific descriptor
  should be used, if possible. E.g., duplication should be used instead of
@@ -1183,7 +1132,7 @@ record Feature {
 
 /**
  Sample.
-*/
+ */
 record Sample {
 
   /**
@@ -1208,4 +1157,166 @@ record Sample {
    */
   map<string> attributes = {};
 }
-}
\ No newline at end of file
+
+/**
+ Alphabet of symbols for a sequence, slice, or read.
+ */
+enum Alphabet {
+
+  /**
+   DNA (deoxyribonucleic acid) alphabet.
+   */
+  DNA,
+
+  /**
+   RNA (ribonucleic acid) alphabet.
+   */
+  RNA,
+
+  /**
+   Protein (amino acid) alphabet.
+   */
+  PROTEIN
+}
+
+/**
+ Sequence of symbols from an alphabet, with optional name, description, and length.
+ */
+record Sequence {
+
+  /**
+   Name of this sequence, null if not set.
+   */
+  union { null, string } name = null;
+
+  /**
+   Description for this sequence, null if not set.
+   */
+  union { null, string } description = null;
+
+  /**
+   Alphabet for this sequence, defaults to Alphabet.DNA; may be explicitly set to null.
+   */
+  union { Alphabet, null } alphabet = "DNA";
+
+  /**
+   Symbols of this sequence as a string, null if not set.
+   */
+  union { null, string } sequence = null;
+
+  /**
+   Length of this sequence, null if not set.
+   */
+  union { null, long } length = null;
+}
+
+/**
+ View on a contiguous region of a sequence, bounded by start and end positions.
+ */
+record Slice { // conceptually extends Sequence; its fields are repeated here
+
+  /**
+   Name of the sequence this slice views, null if not set.
+   */
+  union { null, string } name = null;
+
+  /**
+   Description for the sequence this slice views, null if not set.
+   */
+  union { null, string } description = null;
+
+  /**
+   Alphabet for the sequence this slice views, defaults to Alphabet.DNA; may be null.
+   */
+  union { Alphabet, null } alphabet = "DNA";
+
+  /**
+   Symbols of this slice as a string, null if not set.
+   */
+  union { null, string } sequence = null;
+
+  /**
+   Start position (inclusive) for this slice on the sequence this slice views, in
+   0-based coordinate system with closed-open intervals.
+   */
+  union { null, long } start = null;
+
+  /**
+   End position (exclusive) for this slice on the sequence this slice views, in
+   0-based coordinate system with closed-open intervals.
+   */
+  union { null, long } end = null;
+
+  /**
+   Strand for this slice, if any. Defaults to Strand.Independent; may be null.
+   */
+  union { Strand, null } strand = "Independent";
+
+  /**
+   Length of this slice, null if not set.
+   */
+  union { null, long } length = null;
+}
+
+/**
+ FASTQ sequence format variant for a read and its quality scores.
+ */
+enum FastqVariant {
+
+  /**
+   Sanger and Illumina version &gt;= 1.8 FASTQ sequence format variant.
+   */
+  SANGER,
+
+  /**
+   Solexa and Illumina version 1.0 FASTQ sequence format variant.
+   */
+  SOLEXA,
+
+  /**
+   Illumina version &gt;= 1.3 and &lt; 1.8 FASTQ sequence format variant.
+   */
+  ILLUMINA
+}
+
+/**
+ Sequence with quality scores and a FASTQ sequence format variant.
+ */
+record Read { // conceptually extends Sequence; its fields are repeated here
+
+  /**
+   Name of this read, null if not set.
+   */
+  union { null, string } name = null;
+
+  /**
+   Description for this read, null if not set.
+   */
+  union { null, string } description = null;
+
+  /**
+   Alphabet for this read, defaults to Alphabet.DNA; may be explicitly set to null.
+   */
+  union { Alphabet, null } alphabet = "DNA";
+
+  /**
+   Symbols of this read as a string, null if not set.
+   */
+  union { null, string } sequence = null;
+
+  /**
+   Length of this read, null if not set.
+   */
+  union { null, long } length = null;
+
+  /**
+   FASTQ sequence format variant for this read, defaults to FastqVariant.SANGER.
+   */
+  union { FastqVariant, null } fastqVariant = "SANGER";
+
+  /**
+   Quality scores for this read as a string, null if not set.
+   */
+  union { null, string } qualityScores = null;
+}
+}