From bf0cb944240967cde43d3ce846d685e6f0ac0156 Mon Sep 17 00:00:00 2001
From: Michael L Heuer
Date: Fri, 27 May 2016 16:43:56 -0500
Subject: [PATCH] Add sequence, slice, and read schema

---
NOTE(review): the line breaks and indentation of this patch were reconstructed
from a whitespace-collapsed copy. Added/removed line counts match the hunk
headers and the diffstat, but indentation and the wrapping of multi-line
context comments are a best guess -- apply with
`git apply --ignore-whitespace` and confirm against the repository.
The value type in `map<string> attributes` was missing from that copy and
`string` is assumed -- TODO confirm. The author e-mail address was also
missing and has been left out.

 src/main/resources/avro/bdg.avdl | 217 +++++++++++++++++++++++-------
 1 file changed, 164 insertions(+), 53 deletions(-)

diff --git a/src/main/resources/avro/bdg.avdl b/src/main/resources/avro/bdg.avdl
index b2edba0..fd25661 100644
--- a/src/main/resources/avro/bdg.avdl
+++ b/src/main/resources/avro/bdg.avdl
@@ -22,8 +22,6 @@ protocol BDG {
 /**
  Record for describing a reference assembly. Not used for storing the contents
  of said assembly.
-
- @see NucleotideContigFragment
  */
 record Contig {
   /**
@@ -328,55 +326,6 @@ enum Base {
   D // not C
 }
 
-/**
- Stores a contig of nucleotides; this may be a reference chromosome, may be an
- assembly, may be a BAC. Very long contigs (>1Mbp) need to be split into fragments.
- It seems that they are too long to load in a single go. For best performance,
- it seems like 10kbp is a good point at which to start splitting contigs into
- fragments.
- */
-record NucleotideContigFragment {
-  /**
-   The contig identification descriptor for this contig.
-   */
-  union { null, Contig } contig = null;
-  /**
-   A description for this contig. When importing from FASTA, the FASTA header
-   description line should be stored here.
-   */
-  union { null, string } description = null;
-  /**
-   The sequence of bases in this fragment.
-   */
-  union { null, string } fragmentSequence = null;
-  /**
-   In a fragmented contig, the position of this fragment in the set of fragments.
-   Can be null if the contig is not fragmented.
-   */
-  union { null, int } fragmentNumber = null;
-  /**
-   The position of the first base of this fragment in the overall contig. E.g.,
-   if all fragments are 10kbp and this is the third fragment in the contig,
-   the start position would be 20000L.
-   */
-  union { null, long } fragmentStartPosition = null;
-  /**
-   The position of the last base of this fragment in the overall contig. E.g.,
-   if all fragments are 10kbp and this is the third fragment in the contig,
-   the end position would be 29999L.
-   */
-  union { null, long } fragmentEndPosition = null;
-  /**
-   The length of this fragment.
-   */
-  union { null, long } fragmentLength = null;
-  /**
-   The total count of fragments that this contig has been broken into. Can be
-   null if the contig is not fragmented.
-   */
-  union { null, int } numberOfFragmentsInContig = null; // total number of fragments in contig
-}
-
 /**
  Descriptors for the type of a structural variant. The most specific descriptor
  should be used, if possible. E.g., duplication should be used instead of
@@ -1183,7 +1132,7 @@ record Feature {
 
 /**
  Sample.
-*/
+ */
 record Sample {
 
   /**
@@ -1208,4 +1157,166 @@ record Sample {
    */
   map<string> attributes = {};
 }
-}
\ No newline at end of file
+
+/**
+ Alphabet.
+ */
+enum Alphabet {
+
+  /**
+   DNA alphabet.
+   */
+  DNA,
+
+  /**
+   RNA alphabet.
+   */
+  RNA,
+
+  /**
+   Protein alphabet.
+   */
+  PROTEIN
+}
+
+/**
+ Sequence.
+ */
+record Sequence {
+
+  /**
+   Name of this sequence.
+   */
+  union { null, string } name = null;
+
+  /**
+   Description for this sequence.
+   */
+  union { null, string } description = null;
+
+  /**
+   Alphabet for this sequence, defaults to Alphabet.DNA.
+   */
+  union { Alphabet, null } alphabet = "DNA";
+
+  /**
+   Sequence.
+   */
+  union { null, string } sequence = null;
+
+  /**
+   Length of this sequence.
+   */
+  union { null, long } length = null;
+}
+
+/**
+ View on a contiguous region of a sequence.
+ */
+record Slice { // extends Sequence
+
+  /**
+   Name of the sequence this slice views.
+   */
+  union { null, string } name = null;
+
+  /**
+   Description for the sequence this slice views.
+   */
+  union { null, string } description = null;
+
+  /**
+   Alphabet for the sequence this slice views, defaults to Alphabet.DNA.
+   */
+  union { Alphabet, null } alphabet = "DNA";
+
+  /**
+   Sequence for this slice.
+   */
+  union { null, string } sequence = null;
+
+  /**
+   Start position for this slice on the sequence this slice views, in 0-based coordinate
+   system with closed-open intervals.
+   */
+  union { null, long } start = null;
+
+  /**
+   End position for this slice on the sequence this slice views, in 0-based coordinate
+   system with closed-open intervals.
+   */
+  union { null, long } end = null;
+
+  /**
+   Strand for this slice, if any. Defaults to Strand.Independent.
+   */
+  union { Strand, null } strand = "Independent";
+
+  /**
+   Length of this slice.
+   */
+  union { null, long } length = null;
+}
+
+/**
+ FASTQ sequence format variant.
+ */
+enum FastqVariant {
+
+  /**
+   Sanger and Illumina version >= 1.8 FASTQ sequence format variant.
+   */
+  SANGER,
+
+  /**
+   Solexa and Illumina version 1.0 FASTQ sequence format variant.
+   */
+  SOLEXA,
+
+  /**
+   Illumina version >= 1.3 and < 1.8 FASTQ sequence format variant.
+   */
+  ILLUMINA
+}
+
+/**
+ Sequence with quality scores.
+ */
+record Read { // extends Sequence
+
+  /**
+   Name of this read.
+   */
+  union { null, string } name = null;
+
+  /**
+   Description for this read.
+   */
+  union { null, string } description = null;
+
+  /**
+   Alphabet for this read, defaults to Alphabet.DNA.
+   */
+  union { Alphabet, null } alphabet = "DNA";
+
+  /**
+   Sequence for this read.
+   */
+  union { null, string } sequence = null;
+
+  /**
+   Length of this read.
+   */
+  union { null, long } length = null;
+
+  /**
+   FASTQ sequence format variant for this read, defaults to FastqVariant.SANGER.
+   */
+  union { FastqVariant, null } fastqVariant = "SANGER";
+
+  /**
+   Quality scores for this read.
+   */
+  union { null, string } qualityScores = null;
+}
+}