From 988f12d82712cf6687fa6e5b3c525704b88b48b9 Mon Sep 17 00:00:00 2001 From: Jaclyn Taroni Date: Mon, 2 Mar 2020 18:51:51 -0500 Subject: [PATCH] Updates to generate v15 CI files (#575) * Update to accommodate v15 * Add biospecimen.RDS Co-authored-by: Candace Savonen Co-authored-by: jashapiro --- .../01-get_biospecimen_identifiers.R | 7 +++++-- .../create-subset-files/02-subset_files.R | 6 +++++- .../biospecimen_ids_for_subset.RDS | Bin 35149 -> 42264 bytes .../create_subset_files.sh | 2 +- 4 files changed, 11 insertions(+), 4 deletions(-) diff --git a/analyses/create-subset-files/01-get_biospecimen_identifiers.R b/analyses/create-subset-files/01-get_biospecimen_identifiers.R index a75a39da32..a4b1195330 100644 --- a/analyses/create-subset-files/01-get_biospecimen_identifiers.R +++ b/analyses/create-subset-files/01-get_biospecimen_identifiers.R @@ -16,7 +16,7 @@ # consideration. This number will be 10% of num_matched. # - We include (and hardcode) a set of biospecimen IDs for samples that have # TP53 and NF1 mutations that meet the criteria in the tp53_nf1_module and -# are represented in the stranded RNA-seq dataset. +# are represented in the stranded RNA-seq dataset. # See 00-enrich-positive-examples for more information. # # EXAMPLE USAGE: @@ -73,6 +73,9 @@ get_biospecimen_ids <- function(filename, id_mapping_df) { } else { biospecimen_ids <- unique(cnv_file$ID) } + } else if (grepl("consensus_seg_annotated", filename)) { + annotated_cn_file <- read_tsv(filename) + biospecimen_ids <- unique(annotated_cn_file$biospecimen_id) } else if (grepl("pbta-fusion", filename)) { fusion_file <- read_tsv(filename) # the biospecimen IDs in the filtered/prioritize fusion list included with @@ -127,7 +130,7 @@ option_list <- list( make_option( c("-r", "--supported_string"), type = "character", - default = "pbta-snv|pbta-cnv|pbta-fusion|pbta-isoform|pbta-sv|pbta-gene|cnv_consensus", + default = "pbta-snv|pbta-cnv|pbta-fusion|pbta-isoform|pbta-sv|pbta-gene|consensus_seg_annotated", help = "string for pattern matching used to subset to only supported files" ), make_option( diff --git a/analyses/create-subset-files/02-subset_files.R b/analyses/create-subset-files/02-subset_files.R index 1b2ae1d8e7..759cf6cf68 100644 --- a/analyses/create-subset-files/02-subset_files.R +++ b/analyses/create-subset-files/02-subset_files.R @@ -96,7 +96,11 @@ subset_files <- function(filename, biospecimen_ids, output_directory) { cnv_file %>% dplyr::filter(!!rlang::sym(biospecimen_column) %in% biospecimen_ids) %>% readr::write_tsv(output_file) - + } else if (grepl("consensus_seg_annotated", filename)) { + annotated_cn_file <- readr::read_tsv(filename) + annotated_cn_file %>% + dplyr::filter(biospecimen_id %in% biospecimen_ids) %>% + readr::write_tsv(output_file) } else if (grepl("pbta-fusion", filename)) { # original files contain the biospecimen IDs in a column called 'tumor_id', # the filtered/prioritized list biospecimen IDs are in 'Sample' diff --git a/analyses/create-subset-files/biospecimen_ids_for_subset.RDS b/analyses/create-subset-files/biospecimen_ids_for_subset.RDS index a7bae9e69f7ba42398be1d4280857eaf32eef79c..2849d52a0f6dd3888a61dcb5f4cabaa2c5024fe2 100644 GIT binary patch literal 42264 zcmeHQTW{OQ6?S(w1=@XCpl^NX{sK8e4zF$Zf%hp1vct>r_F^cu5*V>1LsE=Qe|%Be z5tGAcnjO=kW5)xmHpzfGn#-B*+z;zN9~~WifAswyyZ^uF{(AJ{*TSC}M@QfNrTfRv z-RGz7Lyvwq`cwDkk9K~7bXBq18o{ANMKp5^IFveaS|j*HNx~{|Pzj!r3R?Mjq=6yT zrE!{;0T|LW1FOhu;ZSIM$W(MVR2VwRX$^)9Q}siv6ky1-?J9R%_=~2G%(~QM@RVJ( zMO+mz7;=#6N0|Z12RI80yLG`+UTo^Bn?mwIU{*LZ8}O8$s)lQ|a3_MIQA3m~;HfZ( zvP4fTFcjA^ZW{z?Q8T1k=)@!4Q?gMyn$g4k@nD z;2a(;O=_38J_b)^8dj<(fya^Aa*iU`hfgI*l_mk)!dzFqHgqfSRBk$jf-HF2xu2zV ztixTc%)~SlC52y1bD{=1+=;wsWk;9bQ*}(uI>}7%#m4o@*p=blw?@-yX#h8;a&;>$ z4Y=&8B7sFRxL>tjC|*J!Gc}QmtHy_W-;`PIcn(}pi*-~easuYn63Mq~72Y(-X=&-y z8F)$#6+ci2WG00YDYgUnlo~pcq$QA}4JoHGso}DH$BLp>hQH_+rdlWsyvVth>Pck5 zohT~9O^FeLc~zRClwRq$XqrKd)K8VM}OB34cP0Pcj< zP}?FQB=5)xQGFM}kZ(jvna1#T_z||%+$Df~5Y;Zm4ICm~sLLgWU!<7&DlXx&sS;^b zG{7868{}0KE-0%LwNM*4uOf?xoKtUy`~L=$nHcvKDX)^sg7X_K3ixoJ0rH~}J#er? z%mQ){AEJl8>`^IzY|IJr3`8fM8-yBUy(A84ZIloIA1&?+kl$%o0_1yLQ-FNG6W<8P ztBI~53U#}QLEefoV$%fvA_Mmwr)^P^La91l6pYp66FGEoSwr|{DfbNAf2^9kKy-oV zUtMiC-Pj;f1}zMs$xrIRab#TTjst9msX<<{)RySb{lEbEhevb)-@W=Gye|Dt=&|OT zK?T%Y3+T~ll| zb|D+{xogJzW=8Aiv7Z?Ug??d%1D+NQ#fAYFbWorCw3>V!VT z_s8+oK^{h8Cicls4Dy!kb}oMjf))e!oigwf$A&?p@wm_Bn+e@Gmb>66Fkrp{?oS+L zofgr@!*}l6N^ZgUX;q@!NE8^loI3Doc(RHJbQ+3SY!84U7TfQ{!^UDC5cg%AR>-y? zFw0(8uBkqQ9lji7)H(cBDR*@_@>;#U@1djOe`HDNhw3ggR>XXp}!I$;ilh- z=a6uNlNx8X16}JP2d8I%L*?K=%l~QX`v=A9$Zw=zki7P2Jvh%)lYgYik=R#hW)4z2 zN9Ri#O_w7YyAiMyABDRHkxH-hfwm?*I6W|bbq@~A%YEXAZWo2_66h&mfV`7>UNOu| zoLEO6uaM_mim#%C6_FVadwT|P2QlYT;;`|*ArKIwo1sWuIm3eL=? zC#qIe&>|b1VBeO0Q8gM&RST)}3(u7!Z4Ok`+wOJ(T zKFl)6_G@%1J@&&^h29{Bna@B1zDvlsUJM)q`7Ah$S!<-JBCj1Xcd6bar&`V9;*ayY;cB= zW_8d?HqBMSbWjY)6vz9;ZO^lkjUg{(jB?48p&y;$GUKd18(V4?R49n57^+p12((W{ z!$oYy1Z6OWq1ec_$^tqM8Mt4CCQ*`h5Q^Kc&Gb-$Y0)Lgd?;7`>z75SLMu1$Yy_Sf zEmaxv@@Wf4%}`y-8Sn<5>tMWa+%KZMqa^%r>d=JZ6MLvOjG{K6wZ~Va{#eBxv8||y zQWz>{n8LACNRe7WE0mF`9P3dNAPMR$Y`t zK3s^+F*ERjx$|P8}Ke6!&Ddm$+94@Bmgi>qa}@_Cpc6R*j#V4@HwPdnOKITo?cm21fyV_HqCzQ z^N?;Nh5<9RSW!xwggI0Xo$ zGqG4<$PiBt3yT`?jvvy@Aw}TkG==Gp<2$oI&*qI)mJ*uRunRFm_#9Ny?6+aBBhr|b z1~acXlh;I?=Du+nY{!AG1+bKn5yc;!LByFZx1VS9jWlcjBv!ijL+G-SKn3pGjplGQ z6IPQhB`TL-t|Q<3MEJ}czW&08lt+BR=PNW5_PZCuXc6=_G8&a-60WuG-73Jsc&^U5 zeGw((QLm6!hb-g}ZwlGTZ$>cw{#^<_37>CP(qKnNay=+>#cfOsRfpA7Dh|@9^q}wW z;f+K_6eQ%~t{PS;r5- zL4tz>2Yq-yz34jBYfBH?bx2^4z@YnK5X{(TIaS7%p^Ls%JtFWm-QaU^9I`Zmkqvav zdTi0RYUfJei>osH*rIP$e}^XQTO|PS=uT_|b+C8eo431pxLt@vV0*6#yszPynD6MXe00L7ZWhbUyUodF@#bW{T&}kB?c(g@ba^s= zw_R;k9nItI=KA=}e|9AQbq|uib-!?S@_tBA`#pkJueS5q>GJyK^Y(mu+|BgUbWHwl z_e&iT_~%^+Y}c!c*XzaNbhre*NxXiyIbSVj^Y!}t)qF_6=9lGzlGFXyj^?DKbSY8Uh6>0*0)y|}ui@)5lL!A+strjF<2eG+_l zakHfEz!n#8=L{PB)3~0m&(2R7D^d7Xay`48FSnneO~&=z+iM8;`t~ZmzIuDvG1JAx z{HlYCJr*bTqr69N+zq^)>GtY!j~pLToNrdISL>T)azhZ`&FKk$zNxIK{(8Ti1ZTv+ wjo(!ZZbk0@Wq@M_NxmImlDt_ygJQw81O1+U;jkFA<2%u2o!5Zq(b0eZ2f)c3l>h($ literal 35149 zcmeHQTW{OQ6?S(w1-5-D(3gGa{scL^U)t^i@5Bsdz}xaNFM(n+fe~9aB*kd@ zGkX{j$%K&o?z=^UY!Xm$S38@6W#fbNA=7?w@DRel5a$^PLXuJxsDTcPG{rCdl)xg>p-tYTu!!nXl6e*=QZl>nGD!tR zs%#~?R+*qkv%|D5lmHazenQf;j9^jV<%*jYu!t&A5D^~~8G6H-DjpOWma0nugHIth zkZcWB!m3cFIS#C9SP_(9uLK|Tb4|6p8df!+N{LO{fTzMRN(~vq&WW;IEi?swV{8!z z`&wy@#YOwtf2VzH5U7KY|=);LVE zGw>8{A~iNV0~8TLGbRVHLDZ&lZCW*WipHU719Vd2ULjpyOAC$ybG*HBRJ*J8cyRh<$!b~CsJXObq8mbzc(RCJDzO2Ir zH9=7@nZluMh=V*u$$_mVvprMuU{#rFO5K!$MTKKmiB`awQ#4M)Y*qnVwV5KXL~vp@ zU6W%agx_csBw;v*%PK8Y%9I8cVN_Id=E6#(ajuq9lz?xP2?@;5h2~{yl9WYoGU-ZC z1dUM~|Og(~c5!!LBnh~a{9Vy8~RXt1`i>gu@J*$ZFpcp9WLNcJ~UVT+RR$$8~qSfv3C?W~v2F>H2tZIykz^U<6-h3#NPY4^Yk5 z-6VzQw=v}8a|Q^Qd~&-@KGQRllg=41*e7MjwGsl0u)_k{Fvz}zGoZ)B@TkrJ=$qUp zzDiY!5`#f5eA;C74*{l=-KYXNzZ5z534kds)B+70-GX!W_&RsaIYKaxAAmAwQAeo* z{rpC)8nIN`;Pm;;aZVf-0V-wK#_6YZF3y>lL1X(h+$VFtkdf+q<(QP0QmQyGLNtq* zl<=AMI5t*Tj`_!j-05H%G_0;7h!>|F^Cy+5MWG4 zvStvv-;L()6FF5BD23@jkC&aeY8~tFJh@LwEBI?a0X~hdKLLs+?dqWT2&Ta$RLXV* zE|siWdS=0h#|U%a)2WPzQbvR%hNb(CM+Sn8lh$y2AlUc?j^|WC++^>I7I&%>QbJ9F zK2`n3x5s%Vkt^GWx}$)QP)it8jeK)bx5;~4cUFrNGlgipPZ8UQVvR4BH(7W#d8@)| zkZF1bm2}fLp$=4Zn?Rk`frV!yny-(6{qHK-g4iUMG8hEWT*-CH0xp=)b1-HKRIF9V zjy1}lMvhqxGtvlxX&_NGDH8gmANvP{qr+}5MuSwtw;&V5EW@%H!jRn9CS;NjF&zdS z4!jBRsRk94n3+okU^-by_|QAmK*V`qZ6Qt~t@2_xH)gd}P2QIb6xVf`!6TF-`=wo& z&@scedvWedsB+4ou>JU|Y*dGqqc~8oIwI7@fo>4eK-@U6Q|yFxuTH;-t4fCou@TZb zc|)7NdmUCIeIi>Gyj-x)=;mZHeM)YnlfE}X5MqcFP(MGJU+)u+GP7V=&Y-VmguY}f zE{T#tXUYf*A7M0n#GLQ&J?3dTtXpZWXvfFxv?Ph0w1&)du0DEPM@WtdBsY8~jgGN+ zazf$J9LIU%xGLNMS!U`1#_;fw)wzpo1xD5qH3@aKgyh~NCZdrfJ)JZJe7`WF0%sA0 zIA+TVt+fFk8)2qyPE;0G8aS*82}|!o9lHsb$4MDW@C?0I+~0MS&H~Y^L6~ z^4Ae>?P-JjToGNs_be}Rj{vh%=9-@m?zr*+bC8FQof~iqJ?%K}EC{(n(ul|dS; zoL5>pa#qlITQHiNwPOt8ZzRpF+=Gg zf(A*1pzZxi`_^eY%xbt#ca1?Bt5Rbq|2i~$hlvs!CCmb;?=iWmS*2Q+M+ru!1Dt6z zr|=v3_`V;+DAYVC9OTsholXkYd4T*(r<@Hoj2)-JQMxKd;E8We1Nwi&%_InXQH2tBUdCuo{7G5M4pMhCk~irLco!L zqxUZY6mZn9xRIBu2{`IIN5B!sJ1F^l<01=JKdJa$vLo10-(Lcb`koMQB;e@rJ;ozx zmH4jDOpe|OIO-RnfTO-A1RM!C5^ywlU-sy=i-YQR1soj~j(+&A%O3Qnr_lbh)$*p@ zJf#!*`Ki!2KY!l+TrRiE=j-;mU2fXNy{s)1Ns+oA_4(b)?Q(Ikx_|h6bG1F+w6D)! z|M%(BHvgw@SDSXVdAk`J`1j`+*sgD{U#;8r;(WWg|J+Kvdb_#0T`iXD_0`Me(16QT zi6;YhZ@0_s)qT6TU0vM1ZdX^2wR!4~o+l#mBb-UVLzgu+1+sj2~uH7tNzS}Hs z?ylR1NBeVMt*72G^5oqqB! zZXRb(HxIWrFBe@^!5H&Z@FV6fZZEG^uYtvotI6XQ&u^BmhSTSt+}5s_tBZDfe&60b zT6vEa|KhgLbyGLzlwA{ic=6DrZot~>H_H(l{MEQ$t}m}HMp~lrwd8(rvs`UIK%0!~ zvyaOV%O*fgYua|cnTihh}m^ z5Z}t_4S(3FV^#h2%X$)w(FktfFEf5VVv?^1M(5N+^G7TeT$Swm^do~cVw-%H?XmG1 K7(F}t-~RzEpii6t diff --git a/analyses/create-subset-files/create_subset_files.sh b/analyses/create-subset-files/create_subset_files.sh index 06e21c2e14..d169282e60 100755 --- a/analyses/create-subset-files/create_subset_files.sh +++ b/analyses/create-subset-files/create_subset_files.sh @@ -7,7 +7,7 @@ set -o pipefail # Set defaults for release and biospecimen file name BIOSPECIMEN_FILE=${BIOSPECIMEN_FILE:-biospecimen_ids_for_subset.RDS} -RELEASE=${RELEASE:-release-v14-20200203} +RELEASE=${RELEASE:-release-v15-20200228} NUM_MATCHED=${NUM_MATCHED:-15} # This option controls whether or not the two larger MAF files are skipped as