From 325b8e5dc5378549c81fb9f41e025ab1611380f2 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Fri, 21 Jun 2024 08:52:04 +0200 Subject: [PATCH 01/86] Improving odgi figures --- config.yaml | 4 ++-- scripts/contig_position.R | 5 ++++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/config.yaml b/config.yaml index f604523..bb630d2 100644 --- a/config.yaml +++ b/config.yaml @@ -16,8 +16,8 @@ wfmash.secondary: '-k 19 -H 0.001 -X' seqwish.params: '-B 10000000 -k 19 -f 0' # Odgi 1D and path coverage viz parameters -odgi.1Dviz.params: '-x 2000 -a 25 -b' -odgi.pcov.params: '-x 2000 -a 25 -O' +odgi.1Dviz.params: '-x 2000 -a 50 -b -H' +odgi.pcov.params: '-x 2000 -a 50 -O' ## Optional parts of the workflow # Running Quast to get statistics on input haplotypes diff --git a/scripts/contig_position.R b/scripts/contig_position.R index 7ccd486..dc3a2ce 100644 --- a/scripts/contig_position.R +++ b/scripts/contig_position.R @@ -33,8 +33,11 @@ my.cytobands <- toGRanges(x) # Creating figure # Customizing figure parameters : see https://bernatgel.github.io/karyoploter_tutorial//Tutorial/PlotParams/PlotParams.html pp <- getDefaultPlotParams(plot.type=1) -pp$ideogramheight <- 200 +pp$ideogramheight <- 50 pp$data1outmargin <- 50 +pp$data2outmargin <- 0 +pp$data1inmargin <- 0 +pp$data2inmargin <- 0 pp$leftmargin <- 0.28 pp$rightmargin <- 0 pp$data1height <- 0 -- GitLab From 9a10808d2453923cb8962ea8a585f77dd01485cf Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Fri, 21 Jun 2024 09:32:35 +0200 Subject: [PATCH 02/86] Continuing to improve figs --- config.yaml | 4 ++-- scripts/contig_position.R | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/config.yaml b/config.yaml index bb630d2..05de181 100644 --- a/config.yaml +++ b/config.yaml @@ -16,8 +16,8 @@ wfmash.secondary: '-k 19 -H 0.001 -X' seqwish.params: '-B 10000000 -k 19 -f 0' # Odgi 1D and path coverage viz parameters -odgi.1Dviz.params: '-x 2000 -a 50 -b -H' -odgi.pcov.params: '-x 2000 -a 50 -O' +odgi.1Dviz.params: '-x 2000 -a 150 -b -H' +odgi.pcov.params: '-x 2000 -a 150 -O' ## Optional parts of the workflow # Running Quast to get statistics on input haplotypes diff --git a/scripts/contig_position.R b/scripts/contig_position.R index dc3a2ce..9ac8182 100644 --- a/scripts/contig_position.R +++ b/scripts/contig_position.R @@ -33,9 +33,9 @@ my.cytobands <- toGRanges(x) # Creating figure # Customizing figure parameters : see https://bernatgel.github.io/karyoploter_tutorial//Tutorial/PlotParams/PlotParams.html pp <- getDefaultPlotParams(plot.type=1) -pp$ideogramheight <- 50 -pp$data1outmargin <- 50 -pp$data2outmargin <- 0 +pp$ideogramheight <- 100 +pp$data1outmargin <- 25 +pp$data2outmargin <- 25 pp$data1inmargin <- 0 pp$data2inmargin <- 0 pp$leftmargin <- 0.28 -- GitLab From a29eff609cbf481963b432fdd62f42a957da81c4 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Fri, 21 Jun 2024 09:38:28 +0200 Subject: [PATCH 03/86] Update contig_position.R --- scripts/contig_position.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/contig_position.R b/scripts/contig_position.R index 9ac8182..1d6fed9 100644 --- a/scripts/contig_position.R +++ b/scripts/contig_position.R @@ -23,6 +23,7 @@ x = read.table(opt$tsv, sep = "\t", comment.char="^") colnames(x) = x[1,] x = x[-1,] my.genome <- toGRanges(x) +nChr = ncol(x) #print("Reading bands file ...") x = read.table(opt$bands, sep="\t", comment.char="^") @@ -43,7 +44,7 @@ pp$rightmargin <- 0 pp$data1height <- 0 pp$data2height <- 0 -png(opt$out, width=2780, height=500, pointsize=6, res=300, antialias = "none") +png(opt$out, width=2780, height=(nChr*150), pointsize=6, res=300, antialias = "none") kp <- plotKaryotype(genome=my.genome, cytobands=my.cytobands, plot.params=pp, chromosomes="all") kp <- kpAddBaseNumbers(kp, cex=0.6, tick.dist=5000000) dev.off() \ No newline at end of file -- GitLab From 47c4dce4ff50a4eeb41de9f3551aaaacc9f387b9 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Fri, 21 Jun 2024 09:46:58 +0200 Subject: [PATCH 04/86] Update contig_position.R --- scripts/contig_position.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/contig_position.R b/scripts/contig_position.R index 1d6fed9..0639c1e 100644 --- a/scripts/contig_position.R +++ b/scripts/contig_position.R @@ -23,7 +23,7 @@ x = read.table(opt$tsv, sep = "\t", comment.char="^") colnames(x) = x[1,] x = x[-1,] my.genome <- toGRanges(x) -nChr = ncol(x) +nChr = nrow(x) #print("Reading bands file ...") x = read.table(opt$bands, sep="\t", comment.char="^") -- GitLab From 7fd84f0322c67dfaa1849f2d162f23d90ab90121 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Fri, 21 Jun 2024 09:55:18 +0200 Subject: [PATCH 05/86] Update contig_position.R --- scripts/contig_position.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/contig_position.R b/scripts/contig_position.R index 0639c1e..d367beb 100644 --- a/scripts/contig_position.R +++ b/scripts/contig_position.R @@ -41,6 +41,8 @@ pp$data1inmargin <- 0 pp$data2inmargin <- 0 pp$leftmargin <- 0.28 pp$rightmargin <- 0 +pp$topmargin <- 0 +pp$bottommargin <- 0 pp$data1height <- 0 pp$data2height <- 0 -- GitLab From 6515a5349ca140ab1156dcd4eabd30098d78f0c1 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Fri, 21 Jun 2024 14:49:01 +0200 Subject: [PATCH 06/86] Started creating reports --- Snakefile | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++++ config.yaml | 4 ++- 2 files changed, 78 insertions(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index 5c775d6..1a7dae4 100644 --- a/Snakefile +++ b/Snakefile @@ -59,6 +59,10 @@ def which_analysis(): analysis_inputs.append( expand("output/chr.contig/{haplotype}.contig.png", haplotype=CHRLIST) ) + if config["create_report"] == "True": # Create report + analysis_inputs.append( + "output/pan1c."+config['name']+".report.md" + ) return analysis_inputs """ @@ -675,3 +679,74 @@ rule panacus_stats: -a {params.apppath} \ -t {threads} """ + +rule create_pan1c_report_fig: + # Produces a markdown report figure of chromosomes graphs + input: + graph='data/chrGraphs/'+config['name']+'.{chromosome}.gfa', + contigfig="output/chr.contig/{chromosome}.contig.png", + output: + odgifig=temp("tmp/{chromosome}.odgi.png"), + namefig=temp("tmp/{chromosome}.name.png"), + reportfig="output/report/"+config['name']+".{chromosome}.report.fig.png" + threads: 4 + params: + apppath=config['app.path'] + shell: + """ + ## Odgi 1D viz + apptainer run --app odgi {params.apppath}/PanGeTools.sif \ + viz -i {input.graph} -o {output.odgifig} -x 2000 -a 150 -b -H -t {threads} -P + + ## Getting legend from contig figure + convert {input.contigfig} -crop 770x+0+0 +repage {output.namefig} + + odgheight=$(identify -ping -format '%h' {output.odgifig}) + ctgheight=$(identify -ping -format '%h' {input.contigfig}) + + combinedheight=$(($odgheight+$ctgheight)) + + echo -e "[Debug::Report fig]\t$odgheight\t$ctgheight\t$combinedheight" + + ## Creating empty canvas ad adding other figures to it + convert -size "2770x$combinedheight" xc:white {output.reportfig} + composite -geometry +0+0 {input.contigfig} {output.reportfig} {output.reportfig} + composite -geometry "+0+$ctgheight" {output.namefig} {output.reportfig} {output.reportfig} + composite -geometry "+770+$ctgheight" {output.odgifig} {output.reportfig} {output.reportfig} + """ + +rule create_pan1c_report: + # Produces a markdown report of chromosomes graphs + input: + figs=expand("output/"+config['name']+".{chromosome}.report.fig.png", chromsome=CHRLIST), + genstats="output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv", + pathstats="output/stats/pan1c."+config['name']+".chrGraph.path.stats.tsv" + output: + reportfig="output/pan1c."+config['name']+".report.md" + threads: 4 + params: + apppath=config['app.path'] + run: + shell("touch {output.reportfig}") + + # Adding General stats + shell("echo '# General stats' >> {output.reportfig}") + shell("cat {input.genstats} | tr '\\t' ',' | csv2md >> {output.reportfig}") + shell("echo '' >> {output.reportfig}") + + # Adding Path stats + shell("echo '# Path stats' >> {output.reportfig}") + shell("cat {input.pathstats} | tr '\\t' ',' | csv2md >> {output.reportfig}") + shell("echo '' >> {output.reportfig}") + + # Adding chromosomes figures + input.figs.sort() + + shell("echo '# Chromosome-scale odgi graphs' >> {output.reportfig}") + for fig in input.figs: + basename=os.path.basename(fig) + chr_name=basename.split('.')[2] + + shell("echo '## {chr_name}' >> {output.reportfig}") + shell("echo '[{basename}]({basename}) >> {output.reportfig}") + shell("echo '' >> {output.reportfig}") \ No newline at end of file diff --git a/config.yaml b/config.yaml index 05de181..3c09fb6 100644 --- a/config.yaml +++ b/config.yaml @@ -16,7 +16,7 @@ wfmash.secondary: '-k 19 -H 0.001 -X' seqwish.params: '-B 10000000 -k 19 -f 0' # Odgi 1D and path coverage viz parameters -odgi.1Dviz.params: '-x 2000 -a 150 -b -H' +odgi.1Dviz.params: '-x 2000 -a 150 -b' odgi.pcov.params: '-x 2000 -a 150 -O' ## Optional parts of the workflow @@ -29,3 +29,5 @@ get_PAV: 'False' # Computes SyRI figures for haplotypes #get_allASM_SyRI: 'False' # All vs all get_ASMs_SyRI: 'False' # Haplotype vs Reference +# Creating final report +create_report: 'True' -- GitLab From 875288343c1b253f03f557d8621538cc915870e0 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Fri, 21 Jun 2024 14:56:27 +0200 Subject: [PATCH 07/86] Fixed CICD --- .gitlab-ci.yml | 2 +- example/config_CICD.yaml | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 085078b..828b4de 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -14,7 +14,7 @@ shallow_run_workflow: ## Downloading apps - mkdir appimgs #- apptainer build apps/pggb.sif oras://registry.forgemia.inra.fr/alexis.mergez/pangratools/pggb:latest - - apptainer build appimgs/snakebox.sif oras://registry.forgemia.inra.fr/alexis.mergez/pangratools/snakebox:latest + - apptainer pull appimgs/snakebox.sif oras://registry.forgemia.inra.fr/alexis.mergez/pan1capps/pan1cbox:latest #- apptainer build apps/pytools.sif oras://registry.forgemia.inra.fr/alexis.mergez/pangetools/pytools:latest #- apptainer build apps/PanGeTools.sif oras://registry.forgemia.inra.fr/alexis.mergez/pangetools/pangetools:latest diff --git a/example/config_CICD.yaml b/example/config_CICD.yaml index 4a415bf..648e686 100644 --- a/example/config_CICD.yaml +++ b/example/config_CICD.yaml @@ -29,3 +29,5 @@ get_PAV: 'False' # Computes SyRI figures for haplotypes #get_allASM_SyRI: 'False' # All vs all get_ASMs_SyRI: 'False' # Haplotype vs Reference +# Creating final report +create_report: 'True' -- GitLab From 2e0935dd2bb68ffbfc68d34666398f8a450caadc Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Fri, 21 Jun 2024 15:00:41 +0200 Subject: [PATCH 08/86] Typo --- Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index 1a7dae4..b7ef24c 100644 --- a/Snakefile +++ b/Snakefile @@ -718,7 +718,7 @@ rule create_pan1c_report_fig: rule create_pan1c_report: # Produces a markdown report of chromosomes graphs input: - figs=expand("output/"+config['name']+".{chromosome}.report.fig.png", chromsome=CHRLIST), + figs=expand("output/"+config['name']+".{chromosome}.report.fig.png", chromosome=CHRLIST), genstats="output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv", pathstats="output/stats/pan1c."+config['name']+".chrGraph.path.stats.tsv" output: -- GitLab From 78c38921c6359962b3d4d6fa00c38c5ac2eb1b6e Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Fri, 21 Jun 2024 15:02:22 +0200 Subject: [PATCH 09/86] Update Snakefile --- Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index b7ef24c..dd513de 100644 --- a/Snakefile +++ b/Snakefile @@ -718,7 +718,7 @@ rule create_pan1c_report_fig: rule create_pan1c_report: # Produces a markdown report of chromosomes graphs input: - figs=expand("output/"+config['name']+".{chromosome}.report.fig.png", chromosome=CHRLIST), + figs=expand("output/report/"+config['name']+".{chromosome}.report.fig.png", chromosome=CHRLIST), genstats="output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv", pathstats="output/stats/pan1c."+config['name']+".chrGraph.path.stats.tsv" output: -- GitLab From bc138d4fd351c090782bb7211062993326dea0a9 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Fri, 21 Jun 2024 15:15:37 +0200 Subject: [PATCH 10/86] Update Snakefile --- Snakefile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Snakefile b/Snakefile index dd513de..62344de 100644 --- a/Snakefile +++ b/Snakefile @@ -740,10 +740,11 @@ rule create_pan1c_report: shell("echo '' >> {output.reportfig}") # Adding chromosomes figures - input.figs.sort() + fig_list = input.figs + fig_list.sort() shell("echo '# Chromosome-scale odgi graphs' >> {output.reportfig}") - for fig in input.figs: + for fig in fig_list: basename=os.path.basename(fig) chr_name=basename.split('.')[2] -- GitLab From 3e28e9b491f01d9b6fe023204ff120c7208057a7 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Fri, 21 Jun 2024 15:18:39 +0200 Subject: [PATCH 11/86] Update Snakefile --- Snakefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index 62344de..289186a 100644 --- a/Snakefile +++ b/Snakefile @@ -740,7 +740,8 @@ rule create_pan1c_report: shell("echo '' >> {output.reportfig}") # Adding chromosomes figures - fig_list = input.figs + fig_list = [fig for fig in input.figs] + print(fig_list) fig_list.sort() shell("echo '# Chromosome-scale odgi graphs' >> {output.reportfig}") -- GitLab From 35f8cb8bf9d1f07fea917c505ec5cf1cae157e28 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Fri, 21 Jun 2024 15:22:57 +0200 Subject: [PATCH 12/86] Update Snakefile --- Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index 289186a..782e219 100644 --- a/Snakefile +++ b/Snakefile @@ -750,5 +750,5 @@ rule create_pan1c_report: chr_name=basename.split('.')[2] shell("echo '## {chr_name}' >> {output.reportfig}") - shell("echo '[{basename}]({basename}) >> {output.reportfig}") + shell("echo '[{basename}]({basename})' >> {output.reportfig}") shell("echo '' >> {output.reportfig}") \ No newline at end of file -- GitLab From 4ba2d57f4518d3d88fed8302c5e1dffd3974358b Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Fri, 21 Jun 2024 15:27:51 +0200 Subject: [PATCH 13/86] Update Snakefile --- Snakefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Snakefile b/Snakefile index 782e219..6d348d0 100644 --- a/Snakefile +++ b/Snakefile @@ -731,12 +731,12 @@ rule create_pan1c_report: # Adding General stats shell("echo '# General stats' >> {output.reportfig}") - shell("cat {input.genstats} | tr '\\t' ',' | csv2md >> {output.reportfig}") + shell("cat {input.genstats} | csv2md -d'\\t' >> {output.reportfig}") shell("echo '' >> {output.reportfig}") # Adding Path stats shell("echo '# Path stats' >> {output.reportfig}") - shell("cat {input.pathstats} | tr '\\t' ',' | csv2md >> {output.reportfig}") + shell("cat {input.pathstats} | csv2md -d'\\t' >> {output.reportfig}") shell("echo '' >> {output.reportfig}") # Adding chromosomes figures @@ -750,5 +750,5 @@ rule create_pan1c_report: chr_name=basename.split('.')[2] shell("echo '## {chr_name}' >> {output.reportfig}") - shell("echo '[{basename}]({basename})' >> {output.reportfig}") + shell("echo '' >> {output.reportfig}") shell("echo '' >> {output.reportfig}") \ No newline at end of file -- GitLab From 7ff03996368e37a17a428c3d49e03a3bc675d2dc Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Fri, 21 Jun 2024 15:30:10 +0200 Subject: [PATCH 14/86] Update Snakefile --- Snakefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Snakefile b/Snakefile index 6d348d0..b84a8fd 100644 --- a/Snakefile +++ b/Snakefile @@ -731,12 +731,12 @@ rule create_pan1c_report: # Adding General stats shell("echo '# General stats' >> {output.reportfig}") - shell("cat {input.genstats} | csv2md -d'\\t' >> {output.reportfig}") + shell("cat {input.genstats} | csv2md -d $'\\t' >> {output.reportfig}") shell("echo '' >> {output.reportfig}") # Adding Path stats shell("echo '# Path stats' >> {output.reportfig}") - shell("cat {input.pathstats} | csv2md -d'\\t' >> {output.reportfig}") + shell("cat {input.pathstats} | csv2md -d $'\\t' >> {output.reportfig}") shell("echo '' >> {output.reportfig}") # Adding chromosomes figures -- GitLab From aff2e1cbba4192bb58ddea8349e8f38ab3df1128 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Fri, 21 Jun 2024 15:38:18 +0200 Subject: [PATCH 15/86] Update Snakefile --- Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index b84a8fd..479ddc7 100644 --- a/Snakefile +++ b/Snakefile @@ -747,7 +747,7 @@ rule create_pan1c_report: shell("echo '# Chromosome-scale odgi graphs' >> {output.reportfig}") for fig in fig_list: basename=os.path.basename(fig) - chr_name=basename.split('.')[2] + chr_name=basename.split('.')[1] shell("echo '## {chr_name}' >> {output.reportfig}") shell("echo '' >> {output.reportfig}") -- GitLab From 0d907efba1faeeb1635e07d2ac92be5f13d3d3ce Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Fri, 21 Jun 2024 15:42:06 +0200 Subject: [PATCH 16/86] Update Snakefile --- Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index 479ddc7..e8acceb 100644 --- a/Snakefile +++ b/Snakefile @@ -750,5 +750,5 @@ rule create_pan1c_report: chr_name=basename.split('.')[1] shell("echo '## {chr_name}' >> {output.reportfig}") - shell("echo '' >> {output.reportfig}") + shell("echo '' >> {output.reportfig}") shell("echo '' >> {output.reportfig}") \ No newline at end of file -- GitLab From 9a42d8fb2f2c634758cdf1720619e02cb2584e3a Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Fri, 21 Jun 2024 16:39:01 +0200 Subject: [PATCH 17/86] Better figures --- Snakefile | 11 +++++++---- scripts/contig_position.R | 10 +++++----- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/Snakefile b/Snakefile index e8acceb..4fc3014 100644 --- a/Snakefile +++ b/Snakefile @@ -189,7 +189,7 @@ rule contig_position: #cat $base/{wildcards.chromosome}.gff - awk '{{a++; if ((a/2)==int(a/2)){{b="gneg"}}else{{b="gpos75"}}; print $1"\\t"$4"\\t"$5"\\tcontig_"a"\\t"b}}' $base/{wildcards.chromosome}.gff | \ + awk '{{a++; if ((a/2)==int(a/2)){{b="gpos25"}}else{{b="gpos75"}}; print $1"\\t"$4"\\t"$5"\\tcontig_"a"\\t"b}}' $base/{wildcards.chromosome}.gff | \ sed '1ichr\\tstart\\tend\\tname\\tgieStain' \ > $base/{wildcards.chromosome}.bands @@ -696,10 +696,10 @@ rule create_pan1c_report_fig: """ ## Odgi 1D viz apptainer run --app odgi {params.apppath}/PanGeTools.sif \ - viz -i {input.graph} -o {output.odgifig} -x 2000 -a 150 -b -H -t {threads} -P + viz -i {input.graph} -o {output.odgifig} -x 2500 -a 100 -b -H -t {threads} -P ## Getting legend from contig figure - convert {input.contigfig} -crop 770x+0+0 +repage {output.namefig} + convert {input.contigfig} -crop 790x+0+0 +repage {output.namefig} odgheight=$(identify -ping -format '%h' {output.odgifig}) ctgheight=$(identify -ping -format '%h' {input.contigfig}) @@ -709,7 +709,7 @@ rule create_pan1c_report_fig: echo -e "[Debug::Report fig]\t$odgheight\t$ctgheight\t$combinedheight" ## Creating empty canvas ad adding other figures to it - convert -size "2770x$combinedheight" xc:white {output.reportfig} + convert -size "2800x$combinedheight" xc:white {output.reportfig} composite -geometry +0+0 {input.contigfig} {output.reportfig} {output.reportfig} composite -geometry "+0+$ctgheight" {output.namefig} {output.reportfig} {output.reportfig} composite -geometry "+770+$ctgheight" {output.odgifig} {output.reportfig} {output.reportfig} @@ -729,6 +729,9 @@ rule create_pan1c_report: run: shell("touch {output.reportfig}") + # Adding Summary + ## To Do + # Adding General stats shell("echo '# General stats' >> {output.reportfig}") shell("cat {input.genstats} | csv2md -d $'\\t' >> {output.reportfig}") diff --git a/scripts/contig_position.R b/scripts/contig_position.R index d367beb..8d4f5e4 100644 --- a/scripts/contig_position.R +++ b/scripts/contig_position.R @@ -34,19 +34,19 @@ my.cytobands <- toGRanges(x) # Creating figure # Customizing figure parameters : see https://bernatgel.github.io/karyoploter_tutorial//Tutorial/PlotParams/PlotParams.html pp <- getDefaultPlotParams(plot.type=1) -pp$ideogramheight <- 100 -pp$data1outmargin <- 25 -pp$data2outmargin <- 25 +pp$ideogramheight <- 80 +pp$data1outmargin <- 10 +pp$data2outmargin <- 10 pp$data1inmargin <- 0 pp$data2inmargin <- 0 -pp$leftmargin <- 0.28 +pp$leftmargin <- 0.24 pp$rightmargin <- 0 pp$topmargin <- 0 pp$bottommargin <- 0 pp$data1height <- 0 pp$data2height <- 0 -png(opt$out, width=2780, height=(nChr*150), pointsize=6, res=300, antialias = "none") +png(opt$out, width=3300, height=(nChr*100), pointsize=6, res=300, antialias = "none") kp <- plotKaryotype(genome=my.genome, cytobands=my.cytobands, plot.params=pp, chromosomes="all") kp <- kpAddBaseNumbers(kp, cex=0.6, tick.dist=5000000) dev.off() \ No newline at end of file -- GitLab From 1593a417d6709739197f7f9088aa2c03483749e9 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Fri, 21 Jun 2024 17:01:08 +0200 Subject: [PATCH 18/86] Lexographical sorting of chromosomes --- scripts/chrInputStats.py | 3 +++ scripts/chrStatsAggregation.py | 2 ++ scripts/coreStats.py | 5 +++++ 3 files changed, 10 insertions(+) diff --git a/scripts/chrInputStats.py b/scripts/chrInputStats.py index 8b3015b..85d3d26 100644 --- a/scripts/chrInputStats.py +++ b/scripts/chrInputStats.py @@ -55,6 +55,9 @@ Sequence dictionnary : """ seqDict = {} +# Sorting +args.fastafiles.sort() + # Parsing fasta files for filename in args.fastafiles: diff --git a/scripts/chrStatsAggregation.py b/scripts/chrStatsAggregation.py index 3ac2b10..404549e 100644 --- a/scripts/chrStatsAggregation.py +++ b/scripts/chrStatsAggregation.py @@ -43,6 +43,8 @@ args = arg_parser.parse_args() ## Main script # Getting the list of stats files fileList = [k for k in os.listdir(args.inputDir) if k[-3:] == "tsv"] +# Sorting +fileList.sort() stats = { "general": [], diff --git a/scripts/coreStats.py b/scripts/coreStats.py index 91e693f..db0991c 100644 --- a/scripts/coreStats.py +++ b/scripts/coreStats.py @@ -72,6 +72,10 @@ arg_parser.add_argument( ) args = arg_parser.parse_args() +# Sorting +args.chrGraphStatsFiles.sort() +args.chrInputStatsFiles.sort() + ## Toolbox def getRegEquation(x, y): (slope, intercept), ssr, _1, _2, _3 = np.polyfit(x, y, 1, full = True) @@ -89,6 +93,7 @@ PGGBsteps = [] # Storing PGGB steps name # Iterating through all pggb time stats TimeStatsList = [os.path.join(args.pggbStats, file) for file in os.listdir(args.pggbStats) if file.split('.')[-2] == "time"] +TimeStatsList.sort() print(TimeStatsList) for pggbStats in TimeStatsList: -- GitLab From 4330f3e27a76cd44f4a135ec9b6a184154c20c2a Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Fri, 21 Jun 2024 17:14:13 +0200 Subject: [PATCH 19/86] Update Snakefile --- Snakefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Snakefile b/Snakefile index 4fc3014..8b778d7 100644 --- a/Snakefile +++ b/Snakefile @@ -709,10 +709,10 @@ rule create_pan1c_report_fig: echo -e "[Debug::Report fig]\t$odgheight\t$ctgheight\t$combinedheight" ## Creating empty canvas ad adding other figures to it - convert -size "2800x$combinedheight" xc:white {output.reportfig} + convert -size "3300x$combinedheight" xc:white {output.reportfig} composite -geometry +0+0 {input.contigfig} {output.reportfig} {output.reportfig} composite -geometry "+0+$ctgheight" {output.namefig} {output.reportfig} {output.reportfig} - composite -geometry "+770+$ctgheight" {output.odgifig} {output.reportfig} {output.reportfig} + composite -geometry "+790+$ctgheight" {output.odgifig} {output.reportfig} {output.reportfig} """ rule create_pan1c_report: -- GitLab From 1dbe14c244827c8725481f16fb4f8dc623cd4e8e Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Fri, 21 Jun 2024 18:54:02 +0200 Subject: [PATCH 20/86] Updated report figures --- Snakefile | 2 +- scripts/contig_position.R | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Snakefile b/Snakefile index 8b778d7..5b3598a 100644 --- a/Snakefile +++ b/Snakefile @@ -696,7 +696,7 @@ rule create_pan1c_report_fig: """ ## Odgi 1D viz apptainer run --app odgi {params.apppath}/PanGeTools.sif \ - viz -i {input.graph} -o {output.odgifig} -x 2500 -a 100 -b -H -t {threads} -P + viz -i {input.graph} -o {output.odgifig} -x 2500 -a 80 -b -H -t {threads} -P ## Getting legend from contig figure convert {input.contigfig} -crop 790x+0+0 +repage {output.namefig} diff --git a/scripts/contig_position.R b/scripts/contig_position.R index 8d4f5e4..01f9f93 100644 --- a/scripts/contig_position.R +++ b/scripts/contig_position.R @@ -34,9 +34,9 @@ my.cytobands <- toGRanges(x) # Creating figure # Customizing figure parameters : see https://bernatgel.github.io/karyoploter_tutorial//Tutorial/PlotParams/PlotParams.html pp <- getDefaultPlotParams(plot.type=1) -pp$ideogramheight <- 80 -pp$data1outmargin <- 10 -pp$data2outmargin <- 10 +pp$ideogramheight <- 50 +pp$data1outmargin <- 15 +pp$data2outmargin <- 15 pp$data1inmargin <- 0 pp$data2inmargin <- 0 pp$leftmargin <- 0.24 @@ -46,7 +46,7 @@ pp$bottommargin <- 0 pp$data1height <- 0 pp$data2height <- 0 -png(opt$out, width=3300, height=(nChr*100), pointsize=6, res=300, antialias = "none") +png(opt$out, width=3300, height=(nChr*80), pointsize=6, res=300, antialias = "none") kp <- plotKaryotype(genome=my.genome, cytobands=my.cytobands, plot.params=pp, chromosomes="all") kp <- kpAddBaseNumbers(kp, cex=0.6, tick.dist=5000000) dev.off() \ No newline at end of file -- GitLab From f9f272511a61bb99a489e2b6a38b869acf78dcc8 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Mon, 24 Jun 2024 08:14:08 +0200 Subject: [PATCH 21/86] Update getSyriFigs.sh Renaming chromosomes based on reference to prevent SyRI from matching chromosomes and crashing --- scripts/getSyriFigs.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/scripts/getSyriFigs.sh b/scripts/getSyriFigs.sh index 3f3c7ff..ab2f0d1 100755 --- a/scripts/getSyriFigs.sh +++ b/scripts/getSyriFigs.sh @@ -45,6 +45,8 @@ for ((i = 0; i < ${#asmList[@]} - 1; i++)); do # Setting filepaths for later bamFile="$(basename ${asmList[i]} .fa.gz)_$(basename ${asmList[i + 1]} .fa.gz).bam" syriFile="$(basename ${asmList[i]} .fa.gz)_$(basename ${asmList[i + 1]} .fa.gz).syri.out" + hapID=$(basename ${asmList[i + 1]} .fa.gz | sed 's/.hap/#/') + refID=$(basename ${asmList[i]} .fa.gz | sed 's/.hap/#/') # Adding the output syri file to the array syriFileList+=($syriFile) @@ -53,6 +55,9 @@ for ((i = 0; i < ${#asmList[@]} - 1; i++)); do apptainer run $appdir/pan1c-env.sif python scripts/syri_pruning.py \ -r ${asmList[i]} -f ${asmList[i + 1]} -o $wrkdir + # Renaming chromosomes with same ids as the reference + sed -i "s/${hapID}#chr/${refID}#chr/g" $wrkdir/$(basename ${asmList[i + 1]} .gz) + #Â Minimap2 genome vs genome alignment apptainer run --app minimap2 $appdir/PanGeTools.sif \ -a -x asm5 -c --eqx \ -- GitLab From 856620b5e6de2eb84fb7b7bcfeab53c661ec64c7 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Mon, 24 Jun 2024 08:14:52 +0200 Subject: [PATCH 22/86] Update getPanacusHG.sh Not grouping samples anymore --- scripts/getPanacusHG.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/getPanacusHG.sh b/scripts/getPanacusHG.sh index 7f3ddb9..d26dda3 100755 --- a/scripts/getPanacusHG.sh +++ b/scripts/getPanacusHG.sh @@ -35,4 +35,4 @@ grep '^P' $gfa | cut -f2 | grep -ve "$ref" > ${chrdir}/$chrname.paths.noref.txt # Running Panacus echo "[getPanacusHG::panacus] Running on ${gfa}" apptainer run --app panacus $appdir/PanGeTools.sif histgrowth \ - -t $threads -l 1,2,1,1,1 -q 0,0,1,0.5,0.1 -S -a -s ${chrdir}/$chrname.paths.noref.txt -c all -o html $gfa > ${output} + -t $threads -l 1,2,1,1,1 -q 0,0,1,0.5,0.1 -a -s ${chrdir}/$chrname.paths.noref.txt -c all -o html $gfa > ${output} -- GitLab From cdd042f90b8e40012452b2d60e68b677f2d56c70 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Mon, 24 Jun 2024 09:08:03 +0200 Subject: [PATCH 23/86] Update Snakefile Better RagTag fasta emptyness check --- Snakefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Snakefile b/Snakefile index 5b3598a..c7683a2 100644 --- a/Snakefile +++ b/Snakefile @@ -125,8 +125,8 @@ rule ragtag_scaffolding: -q {input.fa} \ -o {output.fa} - if [ ! -s {output.fa} ]; then - echo "Empty fasta" + if [[ -z $(grep '[^[:space:]]' {output.fa}) ]] ; then + echo "Error : Empty final fasta" exit 1 fi """ -- GitLab From 6d6ecbef184438bd350a6eccf790577458a5df48 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Mon, 24 Jun 2024 09:09:06 +0200 Subject: [PATCH 24/86] Update Snakefile --- Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index c7683a2..6752afc 100644 --- a/Snakefile +++ b/Snakefile @@ -125,7 +125,7 @@ rule ragtag_scaffolding: -q {input.fa} \ -o {output.fa} - if [[ -z $(grep '[^[:space:]]' {output.fa}) ]] ; then + if [[ -z $(zgrep '[^[:space:]]' {output.fa}) ]] ; then echo "Error : Empty final fasta" exit 1 fi -- GitLab From 2d0eb5bdccf2bf161269dca6bddfd1f72510a219 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Mon, 24 Jun 2024 09:20:29 +0200 Subject: [PATCH 25/86] Update contig_position.R --- scripts/contig_position.R | 51 +++++++++++++++++++++++++++++---------- 1 file changed, 38 insertions(+), 13 deletions(-) diff --git a/scripts/contig_position.R b/scripts/contig_position.R index 01f9f93..b0eeb0a 100644 --- a/scripts/contig_position.R +++ b/scripts/contig_position.R @@ -33,20 +33,45 @@ my.cytobands <- toGRanges(x) # Creating figure # Customizing figure parameters : see https://bernatgel.github.io/karyoploter_tutorial//Tutorial/PlotParams/PlotParams.html + pp <- getDefaultPlotParams(plot.type=1) -pp$ideogramheight <- 50 -pp$data1outmargin <- 15 -pp$data2outmargin <- 15 -pp$data1inmargin <- 0 -pp$data2inmargin <- 0 -pp$leftmargin <- 0.24 -pp$rightmargin <- 0 -pp$topmargin <- 0 -pp$bottommargin <- 0 -pp$data1height <- 0 -pp$data2height <- 0 - -png(opt$out, width=3300, height=(nChr*80), pointsize=6, res=300, antialias = "none") + +if (nChr >= 4){ + + pp$ideogramheight <- 50 + pp$data1outmargin <- 15 + pp$data2outmargin <- 15 + pp$data1inmargin <- 0 + pp$data2inmargin <- 0 + pp$leftmargin <- 0.24 + pp$rightmargin <- 0 + pp$topmargin <- 0 + pp$bottommargin <- 0 + pp$data1height <- 0 + pp$data2height <- 0 + + png(opt$out, width=3300, height=(nChr*80), pointsize=6, res=300, antialias = "none") + +} else { + # Too low chromosomes cause crash in par handler. + pp$ideogramheight <- 50 + pp$data1outmargin <- 15 + pp$data2outmargin <- 15 + pp$data1inmargin <- 0 + pp$data2inmargin <- 0 + pp$leftmargin <- 0.24 + pp$rightmargin <- 0 + pp$topmargin <- 0.5 + pp$bottommargin <- 0 + pp$data1height <- 0 + pp$data2height <- 0 + + png(opt$out, width=3300, height=(nChr*80*2), pointsize=6, res=300, antialias = "none") + +} + + + kp <- plotKaryotype(genome=my.genome, cytobands=my.cytobands, plot.params=pp, chromosomes="all") kp <- kpAddBaseNumbers(kp, cex=0.6, tick.dist=5000000) dev.off() \ No newline at end of file -- GitLab From b8a30f4d9460051f4b68b4cc0987995fb3b6fb84 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Mon, 24 Jun 2024 10:21:01 +0200 Subject: [PATCH 26/86] Update Snakefile Allowing retries for SyRI --- Snakefile | 1 + 1 file changed, 1 insertion(+) diff --git a/Snakefile b/Snakefile index 6752afc..e0c3e0d 100644 --- a/Snakefile +++ b/Snakefile @@ -266,6 +266,7 @@ rule create_sglAsm_syri_fig: fig="output/asm.syri.figs/pan1c."+config['name']+".{haplotype}.syri.png", wrkdir=directory('data/asm.syri/{haplotype}') threads: 4 + retries: 1 resources: mem_gb=48 params: -- GitLab From 0e16a0448879356cb7405f49db2390fb4686b469 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Mon, 24 Jun 2024 13:30:42 +0200 Subject: [PATCH 27/86] Update Snakefile --- Snakefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Snakefile b/Snakefile index e0c3e0d..839292a 100644 --- a/Snakefile +++ b/Snakefile @@ -125,10 +125,10 @@ rule ragtag_scaffolding: -q {input.fa} \ -o {output.fa} - if [[ -z $(zgrep '[^[:space:]]' {output.fa}) ]] ; then - echo "Error : Empty final fasta" - exit 1 - fi + #if [[ -z $(zgrep '[^[:space:]]' {output.fa}) ]] ; then + # echo "Error : Empty final fasta" + # exit 1 + #fi """ rule quast_stats: -- GitLab From a9ed84f4e4f8f1350c226e4744821f0c1214384c Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Mon, 24 Jun 2024 15:01:25 +0200 Subject: [PATCH 28/86] Update getSyriFigs.sh --- scripts/getSyriFigs.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/scripts/getSyriFigs.sh b/scripts/getSyriFigs.sh index ab2f0d1..0b02e0d 100755 --- a/scripts/getSyriFigs.sh +++ b/scripts/getSyriFigs.sh @@ -47,6 +47,9 @@ for ((i = 0; i < ${#asmList[@]} - 1; i++)); do syriFile="$(basename ${asmList[i]} .fa.gz)_$(basename ${asmList[i + 1]} .fa.gz).syri.out" hapID=$(basename ${asmList[i + 1]} .fa.gz | sed 's/.hap/#/') refID=$(basename ${asmList[i]} .fa.gz | sed 's/.hap/#/') + + # Debug + echo -e "[Debug::SyRI_script::$hapID] hapID: $hapID\trefID: $refID" # Adding the output syri file to the array syriFileList+=($syriFile) @@ -65,7 +68,8 @@ for ((i = 0; i < ${#asmList[@]} - 1; i++)); do -t $threads > $wrkdir/$bamFile.sam apptainer run --app samtools $appdir/PanGeTools.sif sort -O BAM -@ $threads $wrkdir/$bamFile.sam > $wrkdir/$bamFile - rm $wrkdir/$bamFile.sam $wrkdir/*.fa + rm $wrkdir/$bamFile.sam + #rm $wrkdir/*.fa # Syri on previous alignment apptainer run $appdir/pan1c-env.sif \ -- GitLab From e01c06ad43a3fc80568bf718568fafdb1bf18b0c Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Mon, 24 Jun 2024 15:41:09 +0200 Subject: [PATCH 29/86] Update getSyriFigs.sh --- scripts/getSyriFigs.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/getSyriFigs.sh b/scripts/getSyriFigs.sh index 0b02e0d..8f845b9 100755 --- a/scripts/getSyriFigs.sh +++ b/scripts/getSyriFigs.sh @@ -45,8 +45,8 @@ for ((i = 0; i < ${#asmList[@]} - 1; i++)); do # Setting filepaths for later bamFile="$(basename ${asmList[i]} .fa.gz)_$(basename ${asmList[i + 1]} .fa.gz).bam" syriFile="$(basename ${asmList[i]} .fa.gz)_$(basename ${asmList[i + 1]} .fa.gz).syri.out" - hapID=$(basename ${asmList[i + 1]} .fa.gz | sed 's/.hap/#/') - refID=$(basename ${asmList[i]} .fa.gz | sed 's/.hap/#/') + hapID=$(basename ${asmList[i + 1]} .ragtagged.fa.gz | sed 's/.hap/#/') + refID=$(basename ${asmList[i]} .ragtagged.fa.gz | sed 's/.hap/#/') # Debug echo -e "[Debug::SyRI_script::$hapID] hapID: $hapID\trefID: $refID" -- GitLab From bffc5e642d18f395cbd04c7325cffb58c1041897 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Mon, 24 Jun 2024 15:48:31 +0200 Subject: [PATCH 30/86] Update getSyriFigs.sh --- scripts/getSyriFigs.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/getSyriFigs.sh b/scripts/getSyriFigs.sh index 8f845b9..3baf716 100755 --- a/scripts/getSyriFigs.sh +++ b/scripts/getSyriFigs.sh @@ -45,8 +45,8 @@ for ((i = 0; i < ${#asmList[@]} - 1; i++)); do # Setting filepaths for later bamFile="$(basename ${asmList[i]} .fa.gz)_$(basename ${asmList[i + 1]} .fa.gz).bam" syriFile="$(basename ${asmList[i]} .fa.gz)_$(basename ${asmList[i + 1]} .fa.gz).syri.out" - hapID=$(basename ${asmList[i + 1]} .ragtagged.fa.gz | sed 's/.hap/#/') - refID=$(basename ${asmList[i]} .ragtagged.fa.gz | sed 's/.hap/#/') + #hapID=$(basename ${asmList[i + 1]} .ragtagged.fa.gz | sed 's/.hap/#/') + #refID=$(basename ${asmList[i]} .ragtagged.fa.gz | sed 's/.hap/#/') # Debug echo -e "[Debug::SyRI_script::$hapID] hapID: $hapID\trefID: $refID" @@ -58,8 +58,8 @@ for ((i = 0; i < ${#asmList[@]} - 1; i++)); do apptainer run $appdir/pan1c-env.sif python scripts/syri_pruning.py \ -r ${asmList[i]} -f ${asmList[i + 1]} -o $wrkdir - # Renaming chromosomes with same ids as the reference - sed -i "s/${hapID}#chr/${refID}#chr/g" $wrkdir/$(basename ${asmList[i + 1]} .gz) + # Renaming chromosomes with same ids as the reference (Not working) + #sed -i "s/${hapID}#chr/${refID}#chr/g" $wrkdir/$(basename ${asmList[i + 1]} .gz) #Â Minimap2 genome vs genome alignment apptainer run --app minimap2 $appdir/PanGeTools.sif \ -- GitLab From 511e110b6289e05fa0629691b2c3fa557779ae0c Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Wed, 26 Jun 2024 16:36:56 +0200 Subject: [PATCH 31/86] Testing moving rules to dedicated files --- Snakefile | 235 ++++++++++++++++++++++++------------------------ rules/tools.smk | 16 ++++ 2 files changed, 133 insertions(+), 118 deletions(-) create mode 100644 rules/tools.smk diff --git a/Snakefile b/Snakefile index 839292a..052810c 100644 --- a/Snakefile +++ b/Snakefile @@ -1,5 +1,8 @@ configfile: "config.yaml" +include: "rules/tools.smk" + + ## Modules import os import numpy as np @@ -7,15 +10,15 @@ import gzip import re """ - -Main variables used in the workflow - +Main variables used in the workflow --------------------------------------------------------------- """ -# Retrieving the list of haplotypes (SAMPLES) in data/haplotypes + +# Retrieving the list of haplotypes (SAMPLES) in data/haplotypes. +# File with .fa extension only will be used. (.fna will not be used) SAMPLES = np.unique([ - os.path.basename(f).split('.fa')[0] - for f in os.listdir("data/haplotypes/") - if re.search(r"\.fa", f) + os.path.basename(file).split('.fa')[0] + for file in os.listdir("data/haplotypes/") + if re.search(r"\.fa", file) ]) # Retrieving the list of haplotypes excluding the reference @@ -32,9 +35,10 @@ nHAP = len(SAMPLES) with gzip.open("data/haplotypes/"+config['reference'], "r") as handle: CHRLIST = [line.decode().split("#")[-1].split('\n')[0] for line in handle.readlines() if line.decode()[0] == ">"] -# Adjusting the requested outputs according to the config file +# Adding optionnal output based on config.yaml, using the following function def which_analysis(): - # Creating a list with default analysis steps (to prevent the function from returning an empty list) + + ## Default analysis analysis_inputs = [ "output/stats/pan1c."+config['name']+".core.stats.tsv", # core stats expand("output/panacus.reports/"+config['name']+".{chromosome}.histgrowth.html", chromosome=CHRLIST), # panacus histgrowth @@ -42,33 +46,32 @@ def which_analysis(): "output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv" # chromosomes graph statistics ] - # Optionals analysis steps + ## Optionals analysis steps if config["get_PAV"] == "True": # Adding PAV matrix creation analysis_inputs.append("output/pav.matrices") - #if config["get_allASM_SyRI"] == "True": # Creating the figure for all assemblies - # analysis_inputs.append("output/asm.syri.figs/pan1c."+config['name']+".allAsm.syri.png") - if config["get_ASMs_SyRI"] == "True": # Creating SyRI for each figures + + if config["get_ASMs_SyRI"] == "True": # Creating SyRI for each figures analysis_inputs.append( - expand("output/asm.syri.figs/pan1c."+config['name']+".{haplotype}.syri.png", haplotype=SAMPLES_NOREF), # syri for haplotypes + expand("output/asm.syri.figs/pan1c."+config['name']+".{haplotype}.syri.png", haplotype=SAMPLES_NOREF), ) - if config["run_Quast"] == "True": # Running quast on haplotypes + if config["run_Quast"] == "True": # Running Quast on input haplotypes analysis_inputs.append( "output/quast/Quast.report.html" ) - if config["get_contig_pos"] == "True": # Contig position in chromosomes figure + if config["get_contig_pos"] == "True": # Chromosome decomposition into its contig figure analysis_inputs.append( expand("output/chr.contig/{haplotype}.contig.png", haplotype=CHRLIST) ) - if config["create_report"] == "True": # Create report - analysis_inputs.append( - "output/pan1c."+config['name']+".report.md" - ) - return analysis_inputs -""" + if config["create_report"] == "True": # Creating report (need contig) + analysis_inputs.append( + "output/pan1c."+config['name']+".report.md" + ) -Rules + return analysis_inputs +""" +Rules ------------------------------------------------------------------------------------------- """ # Main target rule rule all: @@ -77,29 +80,25 @@ rule all: "output/pan1c."+config['name']+".gfa.metadata", # Metadata for the final (also in top of gfa files as # line) which_analysis() -""" - -Tool rules - -""" -rule samtools_index: - # Samtools faidx to index compressed fasta - input: - fa="{sample}.fa.gz" - output: - "{sample}.fa.gz.fai", - "{sample}.fa.gz.gzi" - threads: 1 - params: - apppath=config["app.path"] - shell: - "apptainer run --app samtools {params.apppath}/PanGeTools.sif " - "faidx {input.fa}" +# """ +# Tool rules --------------------------------------------------------------------------------------- +# """ +# rule samtools_index: +# # Samtools faidx to index compressed fasta +# input: +# fa="{sample}.fa.gz" +# output: +# "{sample}.fa.gz.fai", +# "{sample}.fa.gz.gzi" +# threads: 1 +# params: +# app_path=config["app.path"] +# shell: +# "apptainer run --app samtools {params.app_path}/PanGeTools.sif " +# "faidx {input.fa}" """ - -Pre-processing section : preparing pggb inputs - +Pre-processing section : preparing pggb inputs --------------------------------------------------- """ rule ragtag_scaffolding: # Scaffold input haplotype against the reference to infer chromosome scale sequences @@ -114,12 +113,12 @@ rule ragtag_scaffolding: threads: 8 retries: 1 params: - apppath=config["app.path"] + app_path=config["app.path"] shell: """ bash scripts/ragtagChromInfer.sh \ -d $(dirname {output.fa})/{wildcards.haplotype} \ - -a {params.apppath} \ + -a {params.app_path} \ -t {threads} \ -r {input.ref} \ -q {input.fa} \ @@ -137,14 +136,14 @@ rule quast_stats: fas=expand("data/haplotypes/{haplotype}.fa.gz", haplotype=SAMPLES_NOREF), ref="data/haplotypes/"+config['reference'] output: - report="output/quast/Quast.report.html" - threads: workflow.cores//2 + report="output/quast/"+config['name']+".quast.report.html" + threads: 8 params: - apppath=config["app.path"], - panname=config["name"] + app_path=config["app.path"], + pan_name=config["name"] shell: """ - apptainer run {params.apppath}/pan1c-env.sif quast.py \ + apptainer run {params.app_path}/pan1c-env.sif quast.py \ -t {threads} \ -o "$(dirname {output.report})" \ -r {input.ref} \ @@ -156,9 +155,9 @@ rule quast_stats: {input.fas} # Compressing temporary files - apptainer run --app bgzip {params.apppath}/PanGeTools.sif \ + apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ -@ {threads} $(dirname {output.report})/contigs_reports/*.fa - apptainer run --app bgzip {params.apppath}/PanGeTools.sif \ + apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ -@ {threads} $(dirname {output.report})/contigs_reports/minimap_output/* mv $(dirname {output.report})/report.html {output.report} @@ -174,7 +173,7 @@ rule contig_position: outdir=directory("output/chr.contig/{chromosome}") threads: 1 params: - apppath=config["app.path"] + app_path=config["app.path"] shell: """ mkdir {output.outdir} @@ -182,7 +181,7 @@ rule contig_position: hapID=$(echo {wildcards.chromosome} | sed 's/.hap/#/') # Creating .bands file - apptainer run {params.apppath}/pan1c-env.sif python scripts/fasta_not_NNN_gff3.py \ + apptainer run {params.app_path}/pan1c-env.sif python scripts/fasta_not_NNN_gff3.py \ -s 5 \ -f {input.fa} \ > $base/{wildcards.chromosome}.gff @@ -206,7 +205,7 @@ rule contig_position: #cat $base/{wildcards.chromosome}.tsv # Creating figure - apptainer run --app Renv {params.apppath}/pan1c-env.sif Rscript scripts/contig_position.R \ + apptainer run --app Renv {params.app_path}/pan1c-env.sif Rscript scripts/contig_position.R \ -b $base/{wildcards.chromosome}.bands \ -t $base/{wildcards.chromosome}.tsv \ -o {output.fig} @@ -220,15 +219,15 @@ rule clustering: expand('data/chrInputs/'+config['name']+'.{chromosome}.fa.gz', chromosome=CHRLIST) threads: workflow.cores params: - apppath=config["app.path"], - panname=config["name"] + app_path=config["app.path"], + pan_name=config["name"] shell: """ mkdir -p $(dirname {output[0]}) - apptainer run {params.apppath}/pan1c-env.sif python scripts/inputClustering.py \ - --fasta {input} --output $(dirname {output[0]}) --panname {params.panname} + apptainer run {params.app_path}/pan1c-env.sif python scripts/inputClustering.py \ + --fasta {input} --output $(dirname {output[0]}) --panname {params.pan_name} for file in $(dirname {output[0]})/*.fa; do - apptainer run --app bgzip {params.apppath}/PanGeTools.sif -@ {threads} $file + apptainer run --app bgzip {params.app_path}/PanGeTools.sif -@ {threads} $file done """ @@ -242,13 +241,13 @@ rule create_allAsm_syri_fig: wrkdir=directory('data/asm.syri/all') threads: 8 params: - apppath=config["app.path"] + app_path=config["app.path"] shell: """ mkdir {output.wrkdir} bash scripts/getSyriFigs.sh \ - -a {params.apppath} \ + -a {params.app_path} \ -t {threads} \ -d {output.wrkdir} \ -o $(basename {output.fig}) \ @@ -270,12 +269,12 @@ rule create_sglAsm_syri_fig: resources: mem_gb=48 params: - apppath=config["app.path"] + app_path=config["app.path"] shell: """ mkdir -p {output.wrkdir} bash scripts/getSyriFigs.sh \ - -a {params.apppath} \ + -a {params.app_path} \ -t {threads} \ -d {output.wrkdir} \ -o $(basename {output.fig}) \ @@ -298,7 +297,7 @@ rule create_pggb_input_syri_fig: wrkdir=directory('data/chrInput/syri/{chromosome}') threads: 8 params: - apppath=config["app.path"], + app_path=config["app.path"], ref=config['reference'] shell: """ @@ -318,7 +317,7 @@ rule create_pggb_input_syri_fig: done bash scripts/getSyriFigs.sh \ - -a {params.apppath} \ + -a {params.app_path} \ -t {threads} \ -d {output.wrkdir} \ -o $(basename {output.fig}) \ @@ -338,7 +337,7 @@ rule wfmash_on_chr: aln=temp("data/chrGraphs/{chromosome}/{chromosome}.wfmash.aln.paf") threads: 16 params: - apppath=config['app.path'], + app_path=config['app.path'], segment_length=config['wfmash.segment_length'], mapping_id=config['wfmash.mapping_id'], wfmash_sec=config['wfmash.secondary'] @@ -351,7 +350,7 @@ rule wfmash_on_chr: """ ## Mapping /usr/bin/time -v -o {log.time_map} \ - apptainer exec {params.apppath}/pggb.sif wfmash \ + apptainer exec {params.app_path}/pggb.sif wfmash \ -s {params.segment_length} -l $(( {params.segment_length} * 5 )) -p {params.mapping_id} \ -n $(cat {input.fai} | wc -l) {params.wfmash_sec} -t {threads} \ --tmp-base $(dirname {output.aln}) \ @@ -361,7 +360,7 @@ rule wfmash_on_chr: ## Aligning /usr/bin/time -v -o {log.time_aln} \ - apptainer exec {params.apppath}/pggb.sif wfmash \ + apptainer exec {params.app_path}/pggb.sif wfmash \ -s {params.segment_length} -l $(( {params.segment_length} * 5 )) -p {params.mapping_id} \ -n $(cat {input.fai} | wc -l) {params.wfmash_sec} -t {threads} \ --tmp-base $(dirname {output.aln}) \ @@ -371,10 +370,10 @@ rule wfmash_on_chr: 2> >(tee {log.cmd_aln} >&2) # Compressing - apptainer run --app bgzip {params.apppath}/PanGeTools.sif \ + apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ -@ {threads} -k {output.mapping} - apptainer run --app bgzip {params.apppath}/PanGeTools.sif \ + apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ -@ {threads} -k {output.aln} """ @@ -387,7 +386,7 @@ rule seqwish: temp("data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfa") threads: 8 params: - apppath=config['app.path'], + app_path=config['app.path'], seqwish=config['seqwish.params'] log: cmd="logs/pggb/{chromosome}.seqwish.cmd.log", @@ -395,14 +394,14 @@ rule seqwish: shell: """ /usr/bin/time -v -o {log.time} \ - apptainer exec {params.apppath}/pggb.sif seqwish \ + apptainer exec {params.app_path}/pggb.sif seqwish \ -s {input.fa} -p {input.aln} -g {output} \ {params.seqwish} -t {threads} \ --temp-dir $(dirname {output}) -P 2>&1 | \ tee {log.cmd} # Compressing - apptainer run --app bgzip {params.apppath}/PanGeTools.sif \ + apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ -@ {threads} -k {output} """ @@ -415,18 +414,18 @@ rule gfaffix_on_chr: transform="data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfaffixD.transform.txt" threads: 1 params: - apppath=config['app.path'] + app_path=config['app.path'] log: time="logs/pggb/{chromosome}.gfaffix.time.log" shell: """ /usr/bin/time -v -o {log.time} \ - apptainer exec {params.apppath}/pggb.sif gfaffix \ + apptainer exec {params.app_path}/pggb.sif gfaffix \ {input} -o {output.gfa} -t {output.transform} \ > /dev/null # Compressing - apptainer run --app bgzip {params.apppath}/PanGeTools.sif \ + apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ -@ {threads} -k {output.gfa} """ @@ -438,7 +437,7 @@ rule odgi_postprocessing: gfa='data/chrGraphs/'+config['name']+'.{chromosome}.gfa' threads: 8 params: - apppath=config['app.path'] + app_path=config['app.path'] log: cmd_build="logs/pggb/{chromosome}.odgi_build.cmd.log", time_build="logs/pggb/{chromosome}.odgi_build.time.log", @@ -454,26 +453,26 @@ rule odgi_postprocessing: ## Converting to odgi format and optimizing namespace /usr/bin/time -v -o {log.time_build} \ - apptainer run --app odgi {params.apppath}/PanGeTools.sif \ + apptainer run --app odgi {params.app_path}/PanGeTools.sif \ build -t {threads} -P -g {input} -o $OGfile.og -O 2>&1 | \ tee {log.cmd_build} ## Unchoping the nodes (merges unitigs into single nodes) /usr/bin/time -v -o {log.time_unchop} \ - apptainer run --app odgi {params.apppath}/PanGeTools.sif \ + apptainer run --app odgi {params.app_path}/PanGeTools.sif \ unchop -P -t {threads} -i $OGfile.og -o $OGfile.unchoped.og 2>&1 | \ tee {log.cmd_unchop} ## Sorting the nodes /usr/bin/time -v -o {log.time_sort} \ - apptainer run --app odgi {params.apppath}/PanGeTools.sif \ + apptainer run --app odgi {params.app_path}/PanGeTools.sif \ sort -P -p Ygs --temp-dir $(dirname $OGfile) -t {threads} \ -i $OGfile.unchoped.og -o $OGfile.unchoped.sorted.og 2>&1 | \ tee {log.cmd_sort} ## Exporting to gfa /usr/bin/time -v -o {log.time_view} \ - apptainer run --app odgi {params.apppath}/PanGeTools.sif \ + apptainer run --app odgi {params.app_path}/PanGeTools.sif \ view -i $OGfile.unchoped.sorted.og -g \ 1> {output.gfa} \ 2> >(tee {log.cmd_view} >&2) @@ -483,12 +482,12 @@ rule odgi_postprocessing: ## Adding metadata python scripts/getTags.py \ - --appdir {params.apppath} --config-file config.yaml \ + --appdir {params.app_path} --config-file config.yaml \ > "$(dirname {input})/metadata.txt" sed -i "/^H/r $(dirname {input})/metadata.txt" {output.gfa} # Compressing - # apptainer run --app bgzip {params.apppath}/PanGeTools.sif \ + # apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ # -@ {threads} -k {output.gfa} """ @@ -513,12 +512,12 @@ rule graph_squeeze: "output/pan1c."+config['name']+".gfa" threads: 16 params: - apppath=config['app.path'] + app_path=config['app.path'] shell: """ - apptainer run --app odgi {params.apppath}/PanGeTools.sif \ + apptainer run --app odgi {params.app_path}/PanGeTools.sif \ squeeze -t {threads} -O -P -f {input.glist} -o {output}.og - apptainer run --app odgi {params.apppath}/PanGeTools.sif \ + apptainer run --app odgi {params.app_path}/PanGeTools.sif \ view -t {threads} -P -i {output}.og -g > {output} rm {output}.og """ @@ -532,13 +531,13 @@ rule graph_stats: pathstats="output/stats/chrGraphs/"+config['name']+".{chromosome}.path.stats.tsv" threads: 4 params: - apppath=config['app.path'], - panname=config['name'] + app_path=config['app.path'], + pan_name=config['name'] shell: """ - apptainer run --app gfastats {params.apppath}/PanGeTools.sif \ + apptainer run --app gfastats {params.app_path}/PanGeTools.sif \ -g {input.graph} -P \ - -o $(dirname {output.genstats})/{params.panname}.{wildcards.chromosome} \ + -o $(dirname {output.genstats})/{params.pan_name}.{wildcards.chromosome} \ -t {threads} """ @@ -551,17 +550,17 @@ rule graph_figs: pcov="output/chrGraphs.figs/"+config['name']+".{chromosome}.pcov.png" threads: 4 params: - apppath=config['app.path'], + app_path=config['app.path'], oneDviz=config['odgi.1Dviz.params'], pcov=config['odgi.pcov.params'] shell: """ ## 1D viz - apptainer run --app odgi {params.apppath}/PanGeTools.sif \ + apptainer run --app odgi {params.app_path}/PanGeTools.sif \ viz -i {input.graph} -o {output.oneDviz} {params.oneDviz} -t {threads} -P ## Pcov viz - apptainer run --app odgi {params.apppath}/PanGeTools.sif \ + apptainer run --app odgi {params.app_path}/PanGeTools.sif \ viz -i {input.graph} -o {output.pcov} {params.pcov} -t {threads} -P """ @@ -573,13 +572,13 @@ rule aggregate_graphs_stats: genstats="output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv", pathstats="output/stats/pan1c."+config['name']+".chrGraph.path.stats.tsv" params: - apppath=config['app.path'], - panname=config['name'] + app_path=config['app.path'], + pan_name=config['name'] shell: """ - apptainer run {params.apppath}/pan1c-env.sif python scripts/chrStatsAggregation.py \ + apptainer run {params.app_path}/pan1c-env.sif python scripts/chrStatsAggregation.py \ --input $(dirname {input[0]}) --outputGeneral {output.genstats} \ - --outputPaths {output.pathstats} --panname {params.panname} + --outputPaths {output.pathstats} --panname {params.pan_name} """ rule final_graph_tagging: @@ -590,12 +589,12 @@ rule final_graph_tagging: "output/pan1c."+config['name']+".gfa.metadata" threads: 1 params: - apppath=config['app.path'], - panname=config['name'] + app_path=config['app.path'], + pan_name=config['name'] shell: """ python scripts/getTags.py \ - --appdir {params.apppath} --config-file config.yaml > {output} + --appdir {params.app_path} --config-file config.yaml > {output} sed -i '/^H/r {output}' {input.graph} """ @@ -606,12 +605,12 @@ rule pggb_input_stats: output: "output/stats/pan1c."+config['name']+".chrInput.stats.tsv" params: - apppath=config['app.path'], - panname=config['name'] + app_path=config['app.path'], + pan_name=config['name'] shell: """ - apptainer run {params.apppath}/pan1c-env.sif python scripts/chrInputStats.py \ - -f data/chrInputs/*.fa.gz -o {output} -p {params.panname} + apptainer run {params.app_path}/pan1c-env.sif python scripts/chrInputStats.py \ + -f data/chrInputs/*.fa.gz -o {output} -p {params.pan_name} """ rule core_statistics: @@ -623,14 +622,14 @@ rule core_statistics: tsv="output/stats/pan1c."+config['name']+".core.stats.tsv", dir=directory("output/pggb.usage.figs") params: - apppath=config['app.path'], - panname=config['name'] + app_path=config['app.path'], + pan_name=config['name'] shell: """ mkdir -p {output.dir} - apptainer run {params.apppath}/pan1c-env.sif python scripts/coreStats.py \ + apptainer run {params.app_path}/pan1c-env.sif python scripts/coreStats.py \ --pggbStats logs/pggb --chrInputStats {input.chrInputStats} \ - --chrGraphStats {input.chrGraphStats} -o {output.tsv} -f {output.dir} -p {params.panname} + --chrGraphStats {input.chrGraphStats} -o {output.tsv} -f {output.dir} -p {params.pan_name} """ """ @@ -649,7 +648,7 @@ rule get_pav: directory("output/pav.matrices") threads: 16 params: - apppath=config['app.path'] + app_path=config['app.path'] run: shell("mkdir {output}") # Getting the list of graphs @@ -657,7 +656,7 @@ rule get_pav: graphList = [graph.rstrip("\n") for graph in handle.readlines()] # Iterating over graphs for graph in graphList: - shell("bash scripts/getPanachePAV.sh -g {graph} -d data/chrGraphs/$(basename {graph} .gfa) -o {output}/$(basename {graph} .gfa).pav.matrix.tsv -a {params.apppath} -t {threads}") + shell("bash scripts/getPanachePAV.sh -g {graph} -d data/chrGraphs/$(basename {graph} .gfa) -o {output}/$(basename {graph} .gfa).pav.matrix.tsv -a {params.app_path} -t {threads}") rule panacus_stats: # Produces panacus reports for a chromosome graph @@ -666,8 +665,8 @@ rule panacus_stats: output: html='output/panacus.reports/'+config['name']+'.{chromosome}.histgrowth.html' params: - apppath=config['app.path'], - panname=config['name'], + app_path=config['app.path'], + pan_name=config['name'], refname=config['reference'] threads: 8 shell: @@ -677,7 +676,7 @@ rule panacus_stats: -r $(basename {params.refname} .fa.gz) \ -d data/chrGraphs/{wildcards.chromosome} \ -o {output.html} \ - -a {params.apppath} \ + -a {params.app_path} \ -t {threads} """ @@ -692,11 +691,11 @@ rule create_pan1c_report_fig: reportfig="output/report/"+config['name']+".{chromosome}.report.fig.png" threads: 4 params: - apppath=config['app.path'] + app_path=config['app.path'] shell: """ ## Odgi 1D viz - apptainer run --app odgi {params.apppath}/PanGeTools.sif \ + apptainer run --app odgi {params.app_path}/PanGeTools.sif \ viz -i {input.graph} -o {output.odgifig} -x 2500 -a 80 -b -H -t {threads} -P ## Getting legend from contig figure @@ -726,7 +725,7 @@ rule create_pan1c_report: reportfig="output/pan1c."+config['name']+".report.md" threads: 4 params: - apppath=config['app.path'] + app_path=config['app.path'] run: shell("touch {output.reportfig}") diff --git a/rules/tools.smk b/rules/tools.smk new file mode 100644 index 0000000..7528154 --- /dev/null +++ b/rules/tools.smk @@ -0,0 +1,16 @@ +""" +Tool rules --------------------------------------------------------------------------------------- +""" +rule samtools_index: + # Samtools faidx to index compressed fasta + input: + fa="{sample}.fa.gz" + output: + "{sample}.fa.gz.fai", + "{sample}.fa.gz.gzi" + threads: 1 + params: + app_path=config["app.path"] + shell: + "apptainer run --app samtools {params.app_path}/PanGeTools.sif " + "faidx {input.fa}" \ No newline at end of file -- GitLab From 715361a591f020f8b5bf3c14378ea8def17685a5 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Wed, 26 Jun 2024 16:40:18 +0200 Subject: [PATCH 32/86] Update Snakefile --- Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index 052810c..a57f18d 100644 --- a/Snakefile +++ b/Snakefile @@ -56,7 +56,7 @@ def which_analysis(): ) if config["run_Quast"] == "True": # Running Quast on input haplotypes analysis_inputs.append( - "output/quast/Quast.report.html" + "output/quast/"+config['name']+"Quast.report.html" ) if config["get_contig_pos"] == "True": # Chromosome decomposition into its contig figure analysis_inputs.append( -- GitLab From 8b78a0262b616a724a1b13074dcab1ac45515c0d Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Wed, 26 Jun 2024 16:41:04 +0200 Subject: [PATCH 33/86] Update Snakefile --- Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index a57f18d..40bba47 100644 --- a/Snakefile +++ b/Snakefile @@ -56,7 +56,7 @@ def which_analysis(): ) if config["run_Quast"] == "True": # Running Quast on input haplotypes analysis_inputs.append( - "output/quast/"+config['name']+"Quast.report.html" + "output/quast/"+config['name']+".quast.report.html" ) if config["get_contig_pos"] == "True": # Chromosome decomposition into its contig figure analysis_inputs.append( -- GitLab From 32673952cdc1dc79cf5c9e751c2768826906b767 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Wed, 26 Jun 2024 16:47:15 +0200 Subject: [PATCH 34/86] Moved rules to dedicated SMK files --- Snakefile | 1294 +++++++++++++++++++------------------- rules/core.smk | 346 ++++++++++ rules/post-analysis.smk | 118 ++++ rules/pre-processing.smk | 185 ++++++ 4 files changed, 1295 insertions(+), 648 deletions(-) create mode 100644 rules/core.smk create mode 100644 rules/post-analysis.smk create mode 100644 rules/pre-processing.smk diff --git a/Snakefile b/Snakefile index 40bba47..0577e00 100644 --- a/Snakefile +++ b/Snakefile @@ -1,7 +1,9 @@ configfile: "config.yaml" include: "rules/tools.smk" - +include: "rules/pre-processing.smk" +include: "rules/core.smk" +include: "rules/post-analysis.smk" ## Modules import os @@ -97,661 +99,657 @@ rule all: # "apptainer run --app samtools {params.app_path}/PanGeTools.sif " # "faidx {input.fa}" -""" -Pre-processing section : preparing pggb inputs --------------------------------------------------- -""" -rule ragtag_scaffolding: - # Scaffold input haplotype against the reference to infer chromosome scale sequences - input: - ref="data/haplotypes/"+config['reference'], - reffai="data/haplotypes/"+config['reference']+".fai", - refgzi="data/haplotypes/"+config['reference']+".gzi", - fa="data/haplotypes/{haplotype}.fa.gz" - output: - fa="data/hap.ragtagged/{haplotype}.ragtagged.fa.gz", - tar="data/hap.ragtagged/{haplotype}.tar.gz" - threads: 8 - retries: 1 - params: - app_path=config["app.path"] - shell: - """ - bash scripts/ragtagChromInfer.sh \ - -d $(dirname {output.fa})/{wildcards.haplotype} \ - -a {params.app_path} \ - -t {threads} \ - -r {input.ref} \ - -q {input.fa} \ - -o {output.fa} - - #if [[ -z $(zgrep '[^[:space:]]' {output.fa}) ]] ; then - # echo "Error : Empty final fasta" - # exit 1 - #fi - """ - -rule quast_stats: - # Run Quast on ragtagged genomes - input: - fas=expand("data/haplotypes/{haplotype}.fa.gz", haplotype=SAMPLES_NOREF), - ref="data/haplotypes/"+config['reference'] - output: - report="output/quast/"+config['name']+".quast.report.html" - threads: 8 - params: - app_path=config["app.path"], - pan_name=config["name"] - shell: - """ - apptainer run {params.app_path}/pan1c-env.sif quast.py \ - -t {threads} \ - -o "$(dirname {output.report})" \ - -r {input.ref} \ - --plots-format png \ - --no-read-stats \ - --no-icarus \ - -s \ - --large \ - {input.fas} - - # Compressing temporary files - apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ - -@ {threads} $(dirname {output.report})/contigs_reports/*.fa - apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ - -@ {threads} $(dirname {output.report})/contigs_reports/minimap_output/* - - mv $(dirname {output.report})/report.html {output.report} - """ - -rule contig_position: - # Produce figures with contig positions - input: - fa="data/chrInputs/"+config["name"]+".{chromosome}.fa.gz", - fai="data/chrInputs/"+config["name"]+".{chromosome}.fa.gz.fai" - output: - fig="output/chr.contig/{chromosome}.contig.png", - outdir=directory("output/chr.contig/{chromosome}") - threads: 1 - params: - app_path=config["app.path"] - shell: - """ - mkdir {output.outdir} - base="{output.outdir}" - hapID=$(echo {wildcards.chromosome} | sed 's/.hap/#/') +# """ +# Pre-processing section : preparing pggb inputs --------------------------------------------------- +# """ +# rule ragtag_scaffolding: +# # Scaffold input haplotype against the reference to infer chromosome scale sequences +# input: +# ref="data/haplotypes/"+config['reference'], +# reffai="data/haplotypes/"+config['reference']+".fai", +# refgzi="data/haplotypes/"+config['reference']+".gzi", +# fa="data/haplotypes/{haplotype}.fa.gz" +# output: +# fa="data/hap.ragtagged/{haplotype}.ragtagged.fa.gz", +# tar="data/hap.ragtagged/{haplotype}.tar.gz" +# threads: 8 +# retries: 1 +# params: +# app_path=config["app.path"] +# shell: +# """ +# bash scripts/ragtagChromInfer.sh \ +# -d $(dirname {output.fa})/{wildcards.haplotype} \ +# -a {params.app_path} \ +# -t {threads} \ +# -r {input.ref} \ +# -q {input.fa} \ +# -o {output.fa} + +# #if [[ -z $(zgrep '[^[:space:]]' {output.fa}) ]] ; then +# # echo "Error : Empty final fasta" +# # exit 1 +# #fi +# """ + +# rule quast_stats: +# # Run Quast on ragtagged genomes +# input: +# fas=expand("data/haplotypes/{haplotype}.fa.gz", haplotype=SAMPLES_NOREF), +# ref="data/haplotypes/"+config['reference'] +# output: +# report="output/quast/"+config['name']+".quast.report.html" +# threads: 8 +# params: +# app_path=config["app.path"], +# pan_name=config["name"] +# shell: +# """ +# apptainer run {params.app_path}/pan1c-env.sif quast.py \ +# -t {threads} \ +# -o "$(dirname {output.report})" \ +# -r {input.ref} \ +# --plots-format png \ +# --no-read-stats \ +# --no-icarus \ +# -s \ +# --large \ +# {input.fas} + +# # Compressing temporary files +# apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ +# -@ {threads} $(dirname {output.report})/contigs_reports/*.fa +# apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ +# -@ {threads} $(dirname {output.report})/contigs_reports/minimap_output/* + +# mv $(dirname {output.report})/report.html {output.report} +# """ + +# rule contig_position: +# # Produce figures with contig positions +# input: +# fa="data/chrInputs/"+config["name"]+".{chromosome}.fa.gz", +# fai="data/chrInputs/"+config["name"]+".{chromosome}.fa.gz.fai" +# output: +# fig="output/chr.contig/{chromosome}.contig.png", +# outdir=directory("output/chr.contig/{chromosome}") +# threads: 1 +# params: +# app_path=config["app.path"] +# shell: +# """ +# mkdir {output.outdir} +# base="{output.outdir}" +# hapID=$(echo {wildcards.chromosome} | sed 's/.hap/#/') - # Creating .bands file - apptainer run {params.app_path}/pan1c-env.sif python scripts/fasta_not_NNN_gff3.py \ - -s 5 \ - -f {input.fa} \ - > $base/{wildcards.chromosome}.gff - - #cat $base/{wildcards.chromosome}.gff - - awk '{{a++; if ((a/2)==int(a/2)){{b="gpos25"}}else{{b="gpos75"}}; print $1"\\t"$4"\\t"$5"\\tcontig_"a"\\t"b}}' $base/{wildcards.chromosome}.gff | \ - sed '1ichr\\tstart\\tend\\tname\\tgieStain' \ - > $base/{wildcards.chromosome}.bands - - #cat $base/{wildcards.chromosome}.bands - - # Creating bed like file - cat {input.fai} | \ - cut -f1,2 | tr '\\t' ',' | \ - sed "s/,/,1,/" | \ - sed "1i chr\\tstart\\tend" | \ - tr ',' '\\t' \ - > $base/{wildcards.chromosome}.tsv - - #cat $base/{wildcards.chromosome}.tsv - - # Creating figure - apptainer run --app Renv {params.app_path}/pan1c-env.sif Rscript scripts/contig_position.R \ - -b $base/{wildcards.chromosome}.bands \ - -t $base/{wildcards.chromosome}.tsv \ - -o {output.fig} - """ - -rule clustering: - # Read ragtagged fastas and split chromosome sequences into according bins - input: - expand('data/hap.ragtagged/{haplotype}.ragtagged.fa.gz', haplotype=SAMPLES) - output: - expand('data/chrInputs/'+config['name']+'.{chromosome}.fa.gz', chromosome=CHRLIST) - threads: workflow.cores - params: - app_path=config["app.path"], - pan_name=config["name"] - shell: - """ - mkdir -p $(dirname {output[0]}) - apptainer run {params.app_path}/pan1c-env.sif python scripts/inputClustering.py \ - --fasta {input} --output $(dirname {output[0]}) --panname {params.pan_name} - for file in $(dirname {output[0]})/*.fa; do - apptainer run --app bgzip {params.app_path}/PanGeTools.sif -@ {threads} $file - done - """ - -rule create_allAsm_syri_fig: - # Create a SyRI figure containing all input assemblies - input: - ref="data/hap.ragtagged/"+config['reference'][:-5]+"ragtagged.fa.gz", - qry=expand('data/hap.ragtagged/{haplotype}.ragtagged.fa.gz', haplotype=SAMPLES) - output: - fig="output/asm.syri.figs/pan1c."+config['name']+".allAsm.syri.png", - wrkdir=directory('data/asm.syri/all') - threads: 8 - params: - app_path=config["app.path"] - shell: - """ - mkdir {output.wrkdir} - - bash scripts/getSyriFigs.sh \ - -a {params.app_path} \ - -t {threads} \ - -d {output.wrkdir} \ - -o $(basename {output.fig}) \ - -r {input.ref} \ - -q "{input.qry}" - mv {output.wrkdir}/$(basename {output.fig}) {output.fig} - """ - -rule create_sglAsm_syri_fig: - # Create a SyRI figure for a single input assembly - input: - ref="data/hap.ragtagged/"+config['reference'][:-5]+"ragtagged.fa.gz", - qry="data/hap.ragtagged/{haplotype}.ragtagged.fa.gz" - output: - fig="output/asm.syri.figs/pan1c."+config['name']+".{haplotype}.syri.png", - wrkdir=directory('data/asm.syri/{haplotype}') - threads: 4 - retries: 1 - resources: - mem_gb=48 - params: - app_path=config["app.path"] - shell: - """ - mkdir -p {output.wrkdir} - bash scripts/getSyriFigs.sh \ - -a {params.app_path} \ - -t {threads} \ - -d {output.wrkdir} \ - -o $(basename {output.fig}) \ - -r {input.ref} \ - -q "{input.qry}" - mv {output.wrkdir}/$(basename {output.fig}) {output.fig} - """ - -""" +# # Creating .bands file +# apptainer run {params.app_path}/pan1c-env.sif python scripts/fasta_not_NNN_gff3.py \ +# -s 5 \ +# -f {input.fa} \ +# > $base/{wildcards.chromosome}.gff + +# #cat $base/{wildcards.chromosome}.gff + +# awk '{{a++; if ((a/2)==int(a/2)){{b="gpos25"}}else{{b="gpos75"}}; print $1"\\t"$4"\\t"$5"\\tcontig_"a"\\t"b}}' $base/{wildcards.chromosome}.gff | \ +# sed '1ichr\\tstart\\tend\\tname\\tgieStain' \ +# > $base/{wildcards.chromosome}.bands + +# #cat $base/{wildcards.chromosome}.bands + +# # Creating bed like file +# cat {input.fai} | \ +# cut -f1,2 | tr '\\t' ',' | \ +# sed "s/,/,1,/" | \ +# sed "1i chr\\tstart\\tend" | \ +# tr ',' '\\t' \ +# > $base/{wildcards.chromosome}.tsv + +# #cat $base/{wildcards.chromosome}.tsv + +# # Creating figure +# apptainer run --app Renv {params.app_path}/pan1c-env.sif Rscript scripts/contig_position.R \ +# -b $base/{wildcards.chromosome}.bands \ +# -t $base/{wildcards.chromosome}.tsv \ +# -o {output.fig} +# """ + +# rule clustering: +# # Read ragtagged fastas and split chromosome sequences into according bins +# input: +# expand('data/hap.ragtagged/{haplotype}.ragtagged.fa.gz', haplotype=SAMPLES) +# output: +# expand('data/chrInputs/'+config['name']+'.{chromosome}.fa.gz', chromosome=CHRLIST) +# threads: workflow.cores +# params: +# app_path=config["app.path"], +# pan_name=config["name"] +# shell: +# """ +# mkdir -p $(dirname {output[0]}) +# apptainer run {params.app_path}/pan1c-env.sif python scripts/inputClustering.py \ +# --fasta {input} --output $(dirname {output[0]}) --panname {params.pan_name} +# for file in $(dirname {output[0]})/*.fa; do +# apptainer run --app bgzip {params.app_path}/PanGeTools.sif -@ {threads} $file +# done +# """ + +# rule create_allAsm_syri_fig: +# # Create a SyRI figure containing all input assemblies +# input: +# ref="data/hap.ragtagged/"+config['reference'][:-5]+"ragtagged.fa.gz", +# qry=expand('data/hap.ragtagged/{haplotype}.ragtagged.fa.gz', haplotype=SAMPLES) +# output: +# fig="output/asm.syri.figs/pan1c."+config['name']+".allAsm.syri.png", +# wrkdir=directory('data/asm.syri/all') +# threads: 8 +# params: +# app_path=config["app.path"] +# shell: +# """ +# mkdir {output.wrkdir} + +# bash scripts/getSyriFigs.sh \ +# -a {params.app_path} \ +# -t {threads} \ +# -d {output.wrkdir} \ +# -o $(basename {output.fig}) \ +# -r {input.ref} \ +# -q "{input.qry}" +# mv {output.wrkdir}/$(basename {output.fig}) {output.fig} +# """ + +# rule create_sglAsm_syri_fig: +# # Create a SyRI figure for a single input assembly +# input: +# ref="data/hap.ragtagged/"+config['reference'][:-5]+"ragtagged.fa.gz", +# qry="data/hap.ragtagged/{haplotype}.ragtagged.fa.gz" +# output: +# fig="output/asm.syri.figs/pan1c."+config['name']+".{haplotype}.syri.png", +# wrkdir=directory('data/asm.syri/{haplotype}') +# threads: 4 +# retries: 1 +# resources: +# mem_gb=48 +# params: +# app_path=config["app.path"] +# shell: +# """ +# mkdir -p {output.wrkdir} +# bash scripts/getSyriFigs.sh \ +# -a {params.app_path} \ +# -t {threads} \ +# -d {output.wrkdir} \ +# -o $(basename {output.fig}) \ +# -r {input.ref} \ +# -q "{input.qry}" +# mv {output.wrkdir}/$(basename {output.fig}) {output.fig} +# """ -Core section : Running PGGB +# """ +# Core section : Running PGGB +# """ -""" +# rule create_pggb_input_syri_fig: +# input: +# fasta='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz' +# output: +# fig="output/chrInput.syri.figs/"+config['name']+".{chromosome}.asm.syri.png", +# wrkdir=directory('data/chrInput/syri/{chromosome}') +# threads: 8 +# params: +# app_path=config["app.path"], +# ref=config['reference'] +# shell: +# """ +# mkdir {output.wrkdir} +# refname=$(basename {params.ref} .fa.gz | cut -d'.' -f1) -rule create_pggb_input_syri_fig: - input: - fasta='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz' - output: - fig="output/chrInput.syri.figs/"+config['name']+".{chromosome}.asm.syri.png", - wrkdir=directory('data/chrInput/syri/{chromosome}') - threads: 8 - params: - app_path=config["app.path"], - ref=config['reference'] - shell: - """ - mkdir {output.wrkdir} - refname=$(basename {params.ref} .fa.gz | cut -d'.' -f1) - - ## Creating single fasta from multifasta - zcat {input.fasta} | awk -F"#" \ - '/^>/ {OUT="{output.wrkdir}" substr($0,2) ".fa"}; {print >> OUT; close(OUT)}' +# ## Creating single fasta from multifasta +# zcat {input.fasta} | awk -F"#" \ +# '/^>/ {OUT="{output.wrkdir}" substr($0,2) ".fa"}; {print >> OUT; close(OUT)}' - ## Getting the list of sequences - asmList=() - for file in {output.wrkdir}/*.fa; do - asm="$(basename $file .fa | cut -f1 -d"#").fa" - mv $file "$(dirname $file)/$asm" - asmList+=("$asm") - done - - bash scripts/getSyriFigs.sh \ - -a {params.app_path} \ - -t {threads} \ - -d {output.wrkdir} \ - -o $(basename {output.fig}) \ - -r {output.wrkdir}/"${refname}.fa" \ - -q "${asmList[@]}" - mv {output.wrkdir}/$(basename {output.fig}) {output.fig} - rm {output.wrkdir}/*.fa - """ - -rule wfmash_on_chr: - # Run wfmash on a specific chromosome input - input: - fa='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz', - fai='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz.fai' - output: - mapping=temp("data/chrGraphs/{chromosome}/{chromosome}.wfmash.mapping.paf"), - aln=temp("data/chrGraphs/{chromosome}/{chromosome}.wfmash.aln.paf") - threads: 16 - params: - app_path=config['app.path'], - segment_length=config['wfmash.segment_length'], - mapping_id=config['wfmash.mapping_id'], - wfmash_sec=config['wfmash.secondary'] - log: - cmd_map="logs/pggb/{chromosome}.wfmash_mapping.cmd.log", - time_map="logs/pggb/{chromosome}.wfmash_mapping.time.log", - cmd_aln="logs/pggb/{chromosome}.wfmash_aln.cmd.log", - time_aln="logs/pggb/{chromosome}.wfmash_aln.time.log", - shell: - """ - ## Mapping - /usr/bin/time -v -o {log.time_map} \ - apptainer exec {params.app_path}/pggb.sif wfmash \ - -s {params.segment_length} -l $(( {params.segment_length} * 5 )) -p {params.mapping_id} \ - -n $(cat {input.fai} | wc -l) {params.wfmash_sec} -t {threads} \ - --tmp-base $(dirname {output.aln}) \ - {input.fa} --approx-map \ - 1> {output.mapping} \ - 2> >(tee {log.cmd_map} >&2) - - ## Aligning - /usr/bin/time -v -o {log.time_aln} \ - apptainer exec {params.app_path}/pggb.sif wfmash \ - -s {params.segment_length} -l $(( {params.segment_length} * 5 )) -p {params.mapping_id} \ - -n $(cat {input.fai} | wc -l) {params.wfmash_sec} -t {threads} \ - --tmp-base $(dirname {output.aln}) \ - {input.fa} -i {output.mapping} \ - --invert-filtering \ - 1> {output.aln} \ - 2> >(tee {log.cmd_aln} >&2) - - # Compressing - apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ - -@ {threads} -k {output.mapping} - - apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ - -@ {threads} -k {output.aln} - """ - -rule seqwish: - # Run seqwish on alignement produced by wfmash - input: - fa='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz', - aln=rules.wfmash_on_chr.output.aln - output: - temp("data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfa") - threads: 8 - params: - app_path=config['app.path'], - seqwish=config['seqwish.params'] - log: - cmd="logs/pggb/{chromosome}.seqwish.cmd.log", - time="logs/pggb/{chromosome}.seqwish.time.log" - shell: - """ - /usr/bin/time -v -o {log.time} \ - apptainer exec {params.app_path}/pggb.sif seqwish \ - -s {input.fa} -p {input.aln} -g {output} \ - {params.seqwish} -t {threads} \ - --temp-dir $(dirname {output}) -P 2>&1 | \ - tee {log.cmd} - - # Compressing - apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ - -@ {threads} -k {output} - """ - -rule gfaffix_on_chr: - # Run gfaffix on seqwish graph - input: - rules.seqwish.output - output: - gfa=temp("data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfaffixD.gfa"), - transform="data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfaffixD.transform.txt" - threads: 1 - params: - app_path=config['app.path'] - log: - time="logs/pggb/{chromosome}.gfaffix.time.log" - shell: - """ - /usr/bin/time -v -o {log.time} \ - apptainer exec {params.app_path}/pggb.sif gfaffix \ - {input} -o {output.gfa} -t {output.transform} \ - > /dev/null - - # Compressing - apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ - -@ {threads} -k {output.gfa} - """ - -rule odgi_postprocessing: - # Running pggb's postprocessing (mainly odgi) steps with gfaffix graph - input: - rules.gfaffix_on_chr.output.gfa - output: - gfa='data/chrGraphs/'+config['name']+'.{chromosome}.gfa' - threads: 8 - params: - app_path=config['app.path'] - log: - cmd_build="logs/pggb/{chromosome}.odgi_build.cmd.log", - time_build="logs/pggb/{chromosome}.odgi_build.time.log", - cmd_unchop="logs/pggb/{chromosome}.odgi_unchop.cmd.log", - time_unchop="logs/pggb/{chromosome}.odgi_unchop.time.log", - cmd_sort="logs/pggb/{chromosome}.odgi_sort.cmd.log", - time_sort="logs/pggb/{chromosome}.odgi_sort.time.log", - cmd_view="logs/pggb/{chromosome}.odgi_view.cmd.log", - time_view="logs/pggb/{chromosome}.odgi_view.time.log" - shell: - """ - OGfile="$(dirname {input})/$(basename {input} .gfa)" - - ## Converting to odgi format and optimizing namespace - /usr/bin/time -v -o {log.time_build} \ - apptainer run --app odgi {params.app_path}/PanGeTools.sif \ - build -t {threads} -P -g {input} -o $OGfile.og -O 2>&1 | \ - tee {log.cmd_build} +# ## Getting the list of sequences +# asmList=() +# for file in {output.wrkdir}/*.fa; do +# asm="$(basename $file .fa | cut -f1 -d"#").fa" +# mv $file "$(dirname $file)/$asm" +# asmList+=("$asm") +# done + +# bash scripts/getSyriFigs.sh \ +# -a {params.app_path} \ +# -t {threads} \ +# -d {output.wrkdir} \ +# -o $(basename {output.fig}) \ +# -r {output.wrkdir}/"${refname}.fa" \ +# -q "${asmList[@]}" +# mv {output.wrkdir}/$(basename {output.fig}) {output.fig} +# rm {output.wrkdir}/*.fa +# """ + +# rule wfmash_on_chr: +# # Run wfmash on a specific chromosome input +# input: +# fa='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz', +# fai='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz.fai' +# output: +# mapping=temp("data/chrGraphs/{chromosome}/{chromosome}.wfmash.mapping.paf"), +# aln=temp("data/chrGraphs/{chromosome}/{chromosome}.wfmash.aln.paf") +# threads: 16 +# params: +# app_path=config['app.path'], +# segment_length=config['wfmash.segment_length'], +# mapping_id=config['wfmash.mapping_id'], +# wfmash_sec=config['wfmash.secondary'] +# log: +# cmd_map="logs/pggb/{chromosome}.wfmash_mapping.cmd.log", +# time_map="logs/pggb/{chromosome}.wfmash_mapping.time.log", +# cmd_aln="logs/pggb/{chromosome}.wfmash_aln.cmd.log", +# time_aln="logs/pggb/{chromosome}.wfmash_aln.time.log", +# shell: +# """ +# ## Mapping +# /usr/bin/time -v -o {log.time_map} \ +# apptainer exec {params.app_path}/pggb.sif wfmash \ +# -s {params.segment_length} -l $(( {params.segment_length} * 5 )) -p {params.mapping_id} \ +# -n $(cat {input.fai} | wc -l) {params.wfmash_sec} -t {threads} \ +# --tmp-base $(dirname {output.aln}) \ +# {input.fa} --approx-map \ +# 1> {output.mapping} \ +# 2> >(tee {log.cmd_map} >&2) + +# ## Aligning +# /usr/bin/time -v -o {log.time_aln} \ +# apptainer exec {params.app_path}/pggb.sif wfmash \ +# -s {params.segment_length} -l $(( {params.segment_length} * 5 )) -p {params.mapping_id} \ +# -n $(cat {input.fai} | wc -l) {params.wfmash_sec} -t {threads} \ +# --tmp-base $(dirname {output.aln}) \ +# {input.fa} -i {output.mapping} \ +# --invert-filtering \ +# 1> {output.aln} \ +# 2> >(tee {log.cmd_aln} >&2) + +# # Compressing +# apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ +# -@ {threads} -k {output.mapping} + +# apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ +# -@ {threads} -k {output.aln} +# """ + +# rule seqwish: +# # Run seqwish on alignement produced by wfmash +# input: +# fa='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz', +# aln=rules.wfmash_on_chr.output.aln +# output: +# temp("data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfa") +# threads: 8 +# params: +# app_path=config['app.path'], +# seqwish=config['seqwish.params'] +# log: +# cmd="logs/pggb/{chromosome}.seqwish.cmd.log", +# time="logs/pggb/{chromosome}.seqwish.time.log" +# shell: +# """ +# /usr/bin/time -v -o {log.time} \ +# apptainer exec {params.app_path}/pggb.sif seqwish \ +# -s {input.fa} -p {input.aln} -g {output} \ +# {params.seqwish} -t {threads} \ +# --temp-dir $(dirname {output}) -P 2>&1 | \ +# tee {log.cmd} + +# # Compressing +# apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ +# -@ {threads} -k {output} +# """ + +# rule gfaffix_on_chr: +# # Run gfaffix on seqwish graph +# input: +# rules.seqwish.output +# output: +# gfa=temp("data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfaffixD.gfa"), +# transform="data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfaffixD.transform.txt" +# threads: 1 +# params: +# app_path=config['app.path'] +# log: +# time="logs/pggb/{chromosome}.gfaffix.time.log" +# shell: +# """ +# /usr/bin/time -v -o {log.time} \ +# apptainer exec {params.app_path}/pggb.sif gfaffix \ +# {input} -o {output.gfa} -t {output.transform} \ +# > /dev/null + +# # Compressing +# apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ +# -@ {threads} -k {output.gfa} +# """ + +# rule odgi_postprocessing: +# # Running pggb's postprocessing (mainly odgi) steps with gfaffix graph +# input: +# rules.gfaffix_on_chr.output.gfa +# output: +# gfa='data/chrGraphs/'+config['name']+'.{chromosome}.gfa' +# threads: 8 +# params: +# app_path=config['app.path'] +# log: +# cmd_build="logs/pggb/{chromosome}.odgi_build.cmd.log", +# time_build="logs/pggb/{chromosome}.odgi_build.time.log", +# cmd_unchop="logs/pggb/{chromosome}.odgi_unchop.cmd.log", +# time_unchop="logs/pggb/{chromosome}.odgi_unchop.time.log", +# cmd_sort="logs/pggb/{chromosome}.odgi_sort.cmd.log", +# time_sort="logs/pggb/{chromosome}.odgi_sort.time.log", +# cmd_view="logs/pggb/{chromosome}.odgi_view.cmd.log", +# time_view="logs/pggb/{chromosome}.odgi_view.time.log" +# shell: +# """ +# OGfile="$(dirname {input})/$(basename {input} .gfa)" + +# ## Converting to odgi format and optimizing namespace +# /usr/bin/time -v -o {log.time_build} \ +# apptainer run --app odgi {params.app_path}/PanGeTools.sif \ +# build -t {threads} -P -g {input} -o $OGfile.og -O 2>&1 | \ +# tee {log.cmd_build} - ## Unchoping the nodes (merges unitigs into single nodes) - /usr/bin/time -v -o {log.time_unchop} \ - apptainer run --app odgi {params.app_path}/PanGeTools.sif \ - unchop -P -t {threads} -i $OGfile.og -o $OGfile.unchoped.og 2>&1 | \ - tee {log.cmd_unchop} - - ## Sorting the nodes - /usr/bin/time -v -o {log.time_sort} \ - apptainer run --app odgi {params.app_path}/PanGeTools.sif \ - sort -P -p Ygs --temp-dir $(dirname $OGfile) -t {threads} \ - -i $OGfile.unchoped.og -o $OGfile.unchoped.sorted.og 2>&1 | \ - tee {log.cmd_sort} - - ## Exporting to gfa - /usr/bin/time -v -o {log.time_view} \ - apptainer run --app odgi {params.app_path}/PanGeTools.sif \ - view -i $OGfile.unchoped.sorted.og -g \ - 1> {output.gfa} \ - 2> >(tee {log.cmd_view} >&2) - - ## Removing .og files for space savings - rm $(dirname {input})/*.og - - ## Adding metadata - python scripts/getTags.py \ - --appdir {params.app_path} --config-file config.yaml \ - > "$(dirname {input})/metadata.txt" - sed -i "/^H/r $(dirname {input})/metadata.txt" {output.gfa} - - # Compressing - # apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ - # -@ {threads} -k {output.gfa} - """ - -rule generate_graph_list: - # Generate a text file containing all created graphs - input: - expand('data/chrGraphs/'+config['name']+'.{chromosome}.gfa', chromosome=CHRLIST) - output: - "data/chrGraphs/graphsList.txt" - threads: 1 - run: - with open(output[0], "w") as handle: - for file in input: - handle.write(file+"\n") - -rule graph_squeeze: - # Using odgi to merge every subgraphs into a final one - input: - glist="data/chrGraphs/graphsList.txt", - graphs=expand('data/chrGraphs/'+config['name']+'.{chromosome}.gfa', chromosome=CHRLIST) - output: - "output/pan1c."+config['name']+".gfa" - threads: 16 - params: - app_path=config['app.path'] - shell: - """ - apptainer run --app odgi {params.app_path}/PanGeTools.sif \ - squeeze -t {threads} -O -P -f {input.glist} -o {output}.og - apptainer run --app odgi {params.app_path}/PanGeTools.sif \ - view -t {threads} -P -i {output}.og -g > {output} - rm {output}.og - """ - -rule graph_stats: - # Using GFAstats to produce stats on every chromosome graphs - input: - graph='data/chrGraphs/'+config['name']+'.{chromosome}.gfa' - output: - genstats="output/stats/chrGraphs/"+config['name']+".{chromosome}.general.stats.tsv", - pathstats="output/stats/chrGraphs/"+config['name']+".{chromosome}.path.stats.tsv" - threads: 4 - params: - app_path=config['app.path'], - pan_name=config['name'] - shell: - """ - apptainer run --app gfastats {params.app_path}/PanGeTools.sif \ - -g {input.graph} -P \ - -o $(dirname {output.genstats})/{params.pan_name}.{wildcards.chromosome} \ - -t {threads} - """ - -rule graph_figs: - # Creating figures using odgi viz - input: - graph='data/chrGraphs/'+config['name']+'.{chromosome}.gfa' - output: - oneDviz="output/chrGraphs.figs/"+config['name']+".{chromosome}.1Dviz.png", - pcov="output/chrGraphs.figs/"+config['name']+".{chromosome}.pcov.png" - threads: 4 - params: - app_path=config['app.path'], - oneDviz=config['odgi.1Dviz.params'], - pcov=config['odgi.pcov.params'] - shell: - """ - ## 1D viz - apptainer run --app odgi {params.app_path}/PanGeTools.sif \ - viz -i {input.graph} -o {output.oneDviz} {params.oneDviz} -t {threads} -P - - ## Pcov viz - apptainer run --app odgi {params.app_path}/PanGeTools.sif \ - viz -i {input.graph} -o {output.pcov} {params.pcov} -t {threads} -P - """ - -rule aggregate_graphs_stats: - # Reading and merging all stats files from chromosome graphs into a .tsv. - input: - genstats=expand("output/stats/chrGraphs/"+config['name']+".{chromosome}.general.stats.tsv", chromosome=CHRLIST) - output: - genstats="output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv", - pathstats="output/stats/pan1c."+config['name']+".chrGraph.path.stats.tsv" - params: - app_path=config['app.path'], - pan_name=config['name'] - shell: - """ - apptainer run {params.app_path}/pan1c-env.sif python scripts/chrStatsAggregation.py \ - --input $(dirname {input[0]}) --outputGeneral {output.genstats} \ - --outputPaths {output.pathstats} --panname {params.pan_name} - """ - -rule final_graph_tagging: - # Add metadata to the final GFA - input: - graph="output/pan1c."+config['name']+".gfa", - output: - "output/pan1c."+config['name']+".gfa.metadata" - threads: 1 - params: - app_path=config['app.path'], - pan_name=config['name'] - shell: - """ - python scripts/getTags.py \ - --appdir {params.app_path} --config-file config.yaml > {output} - sed -i '/^H/r {output}' {input.graph} - """ - -rule pggb_input_stats: - # Produces statistics on pggb input sequences - input: - flag="output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv" - output: - "output/stats/pan1c."+config['name']+".chrInput.stats.tsv" - params: - app_path=config['app.path'], - pan_name=config['name'] - shell: - """ - apptainer run {params.app_path}/pan1c-env.sif python scripts/chrInputStats.py \ - -f data/chrInputs/*.fa.gz -o {output} -p {params.pan_name} - """ - -rule core_statistics: - # Aggregate chrInput, chrGraph and pggb statistics into a single tsv - input: - chrInputStats="output/stats/pan1c."+config['name']+".chrInput.stats.tsv", - chrGraphStats="output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv" - output: - tsv="output/stats/pan1c."+config['name']+".core.stats.tsv", - dir=directory("output/pggb.usage.figs") - params: - app_path=config['app.path'], - pan_name=config['name'] - shell: - """ - mkdir -p {output.dir} - apptainer run {params.app_path}/pan1c-env.sif python scripts/coreStats.py \ - --pggbStats logs/pggb --chrInputStats {input.chrInputStats} \ - --chrGraphStats {input.chrGraphStats} -o {output.tsv} -f {output.dir} -p {params.pan_name} - """ +# ## Unchoping the nodes (merges unitigs into single nodes) +# /usr/bin/time -v -o {log.time_unchop} \ +# apptainer run --app odgi {params.app_path}/PanGeTools.sif \ +# unchop -P -t {threads} -i $OGfile.og -o $OGfile.unchoped.og 2>&1 | \ +# tee {log.cmd_unchop} + +# ## Sorting the nodes +# /usr/bin/time -v -o {log.time_sort} \ +# apptainer run --app odgi {params.app_path}/PanGeTools.sif \ +# sort -P -p Ygs --temp-dir $(dirname $OGfile) -t {threads} \ +# -i $OGfile.unchoped.og -o $OGfile.unchoped.sorted.og 2>&1 | \ +# tee {log.cmd_sort} + +# ## Exporting to gfa +# /usr/bin/time -v -o {log.time_view} \ +# apptainer run --app odgi {params.app_path}/PanGeTools.sif \ +# view -i $OGfile.unchoped.sorted.og -g \ +# 1> {output.gfa} \ +# 2> >(tee {log.cmd_view} >&2) + +# ## Removing .og files for space savings +# rm $(dirname {input})/*.og + +# ## Adding metadata +# python scripts/getTags.py \ +# --appdir {params.app_path} --config-file config.yaml \ +# > "$(dirname {input})/metadata.txt" +# sed -i "/^H/r $(dirname {input})/metadata.txt" {output.gfa} + +# # Compressing +# # apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ +# # -@ {threads} -k {output.gfa} +# """ + +# rule generate_graph_list: +# # Generate a text file containing all created graphs +# input: +# expand('data/chrGraphs/'+config['name']+'.{chromosome}.gfa', chromosome=CHRLIST) +# output: +# "data/chrGraphs/graphsList.txt" +# threads: 1 +# run: +# with open(output[0], "w") as handle: +# for file in input: +# handle.write(file+"\n") -""" +# rule graph_squeeze: +# # Using odgi to merge every subgraphs into a final one +# input: +# glist="data/chrGraphs/graphsList.txt", +# graphs=expand('data/chrGraphs/'+config['name']+'.{chromosome}.gfa', chromosome=CHRLIST) +# output: +# "output/pan1c."+config['name']+".gfa" +# threads: 16 +# params: +# app_path=config['app.path'] +# shell: +# """ +# apptainer run --app odgi {params.app_path}/PanGeTools.sif \ +# squeeze -t {threads} -O -P -f {input.glist} -o {output}.og +# apptainer run --app odgi {params.app_path}/PanGeTools.sif \ +# view -t {threads} -P -i {output}.og -g > {output} +# rm {output}.og +# """ + +# rule graph_stats: +# # Using GFAstats to produce stats on every chromosome graphs +# input: +# graph='data/chrGraphs/'+config['name']+'.{chromosome}.gfa' +# output: +# genstats="output/stats/chrGraphs/"+config['name']+".{chromosome}.general.stats.tsv", +# pathstats="output/stats/chrGraphs/"+config['name']+".{chromosome}.path.stats.tsv" +# threads: 4 +# params: +# app_path=config['app.path'], +# pan_name=config['name'] +# shell: +# """ +# apptainer run --app gfastats {params.app_path}/PanGeTools.sif \ +# -g {input.graph} -P \ +# -o $(dirname {output.genstats})/{params.pan_name}.{wildcards.chromosome} \ +# -t {threads} +# """ + +# rule graph_figs: +# # Creating figures using odgi viz +# input: +# graph='data/chrGraphs/'+config['name']+'.{chromosome}.gfa' +# output: +# oneDviz="output/chrGraphs.figs/"+config['name']+".{chromosome}.1Dviz.png", +# pcov="output/chrGraphs.figs/"+config['name']+".{chromosome}.pcov.png" +# threads: 4 +# params: +# app_path=config['app.path'], +# oneDviz=config['odgi.1Dviz.params'], +# pcov=config['odgi.pcov.params'] +# shell: +# """ +# ## 1D viz +# apptainer run --app odgi {params.app_path}/PanGeTools.sif \ +# viz -i {input.graph} -o {output.oneDviz} {params.oneDviz} -t {threads} -P + +# ## Pcov viz +# apptainer run --app odgi {params.app_path}/PanGeTools.sif \ +# viz -i {input.graph} -o {output.pcov} {params.pcov} -t {threads} -P +# """ + +# rule aggregate_graphs_stats: +# # Reading and merging all stats files from chromosome graphs into a .tsv. +# input: +# genstats=expand("output/stats/chrGraphs/"+config['name']+".{chromosome}.general.stats.tsv", chromosome=CHRLIST) +# output: +# genstats="output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv", +# pathstats="output/stats/pan1c."+config['name']+".chrGraph.path.stats.tsv" +# params: +# app_path=config['app.path'], +# pan_name=config['name'] +# shell: +# """ +# apptainer run {params.app_path}/pan1c-env.sif python scripts/chrStatsAggregation.py \ +# --input $(dirname {input[0]}) --outputGeneral {output.genstats} \ +# --outputPaths {output.pathstats} --panname {params.pan_name} +# """ + +# rule final_graph_tagging: +# # Add metadata to the final GFA +# input: +# graph="output/pan1c."+config['name']+".gfa", +# output: +# "output/pan1c."+config['name']+".gfa.metadata" +# threads: 1 +# params: +# app_path=config['app.path'], +# pan_name=config['name'] +# shell: +# """ +# python scripts/getTags.py \ +# --appdir {params.app_path} --config-file config.yaml > {output} +# sed -i '/^H/r {output}' {input.graph} +# """ + +# rule pggb_input_stats: +# # Produces statistics on pggb input sequences +# input: +# flag="output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv" +# output: +# "output/stats/pan1c."+config['name']+".chrInput.stats.tsv" +# params: +# app_path=config['app.path'], +# pan_name=config['name'] +# shell: +# """ +# apptainer run {params.app_path}/pan1c-env.sif python scripts/chrInputStats.py \ +# -f data/chrInputs/*.fa.gz -o {output} -p {params.pan_name} +# """ -Post-processing section : - The graph for each chromosome are made as well as some basic statistics. - In this section, more stats are produced but more specifics ones requiring dedicated tools (Panacus, PAVs for Panache ...). - It also contains rules to use the graph itself. +# rule core_statistics: +# # Aggregate chrInput, chrGraph and pggb statistics into a single tsv +# input: +# chrInputStats="output/stats/pan1c."+config['name']+".chrInput.stats.tsv", +# chrGraphStats="output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv" +# output: +# tsv="output/stats/pan1c."+config['name']+".core.stats.tsv", +# dir=directory("output/pggb.usage.figs") +# params: +# app_path=config['app.path'], +# pan_name=config['name'] +# shell: +# """ +# mkdir -p {output.dir} +# apptainer run {params.app_path}/pan1c-env.sif python scripts/coreStats.py \ +# --pggbStats logs/pggb --chrInputStats {input.chrInputStats} \ +# --chrGraphStats {input.chrGraphStats} -o {output.tsv} -f {output.dir} -p {params.pan_name} +# """ -""" -rule get_pav: - # Create PAV matrix readable by panache for a given chromosome scale graph - input: - "data/chrGraphs/graphsList.txt" - output: - directory("output/pav.matrices") - threads: 16 - params: - app_path=config['app.path'] - run: - shell("mkdir {output}") - # Getting the list of graphs - with open(input[0]) as handle: - graphList = [graph.rstrip("\n") for graph in handle.readlines()] - # Iterating over graphs - for graph in graphList: - shell("bash scripts/getPanachePAV.sh -g {graph} -d data/chrGraphs/$(basename {graph} .gfa) -o {output}/$(basename {graph} .gfa).pav.matrix.tsv -a {params.app_path} -t {threads}") - -rule panacus_stats: - # Produces panacus reports for a chromosome graph - input: - graph='data/chrGraphs/'+config['name']+'.{chromosome}.gfa' - output: - html='output/panacus.reports/'+config['name']+'.{chromosome}.histgrowth.html' - params: - app_path=config['app.path'], - pan_name=config['name'], - refname=config['reference'] - threads: 8 - shell: - """ - bash scripts/getPanacusHG.sh \ - -g {input.graph} \ - -r $(basename {params.refname} .fa.gz) \ - -d data/chrGraphs/{wildcards.chromosome} \ - -o {output.html} \ - -a {params.app_path} \ - -t {threads} - """ - -rule create_pan1c_report_fig: - # Produces a markdown report figure of chromosomes graphs - input: - graph='data/chrGraphs/'+config['name']+'.{chromosome}.gfa', - contigfig="output/chr.contig/{chromosome}.contig.png", - output: - odgifig=temp("tmp/{chromosome}.odgi.png"), - namefig=temp("tmp/{chromosome}.name.png"), - reportfig="output/report/"+config['name']+".{chromosome}.report.fig.png" - threads: 4 - params: - app_path=config['app.path'] - shell: - """ - ## Odgi 1D viz - apptainer run --app odgi {params.app_path}/PanGeTools.sif \ - viz -i {input.graph} -o {output.odgifig} -x 2500 -a 80 -b -H -t {threads} -P - - ## Getting legend from contig figure - convert {input.contigfig} -crop 790x+0+0 +repage {output.namefig} - - odgheight=$(identify -ping -format '%h' {output.odgifig}) - ctgheight=$(identify -ping -format '%h' {input.contigfig}) - - combinedheight=$(($odgheight+$ctgheight)) - - echo -e "[Debug::Report fig]\t$odgheight\t$ctgheight\t$combinedheight" - - ## Creating empty canvas ad adding other figures to it - convert -size "3300x$combinedheight" xc:white {output.reportfig} - composite -geometry +0+0 {input.contigfig} {output.reportfig} {output.reportfig} - composite -geometry "+0+$ctgheight" {output.namefig} {output.reportfig} {output.reportfig} - composite -geometry "+790+$ctgheight" {output.odgifig} {output.reportfig} {output.reportfig} - """ - -rule create_pan1c_report: - # Produces a markdown report of chromosomes graphs - input: - figs=expand("output/report/"+config['name']+".{chromosome}.report.fig.png", chromosome=CHRLIST), - genstats="output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv", - pathstats="output/stats/pan1c."+config['name']+".chrGraph.path.stats.tsv" - output: - reportfig="output/pan1c."+config['name']+".report.md" - threads: 4 - params: - app_path=config['app.path'] - run: - shell("touch {output.reportfig}") - - # Adding Summary - ## To Do - - # Adding General stats - shell("echo '# General stats' >> {output.reportfig}") - shell("cat {input.genstats} | csv2md -d $'\\t' >> {output.reportfig}") - shell("echo '' >> {output.reportfig}") - - # Adding Path stats - shell("echo '# Path stats' >> {output.reportfig}") - shell("cat {input.pathstats} | csv2md -d $'\\t' >> {output.reportfig}") - shell("echo '' >> {output.reportfig}") - - # Adding chromosomes figures - fig_list = [fig for fig in input.figs] - print(fig_list) - fig_list.sort() +# """ +# Post-processing section : +# The graph for each chromosome are made as well as some basic statistics. +# In this section, more stats are produced but more specifics ones requiring dedicated tools (Panacus, PAVs for Panache ...). +# It also contains rules to use the graph itself. +# """ +# rule get_pav: +# # Create PAV matrix readable by panache for a given chromosome scale graph +# input: +# "data/chrGraphs/graphsList.txt" +# output: +# directory("output/pav.matrices") +# threads: 16 +# params: +# app_path=config['app.path'] +# run: +# shell("mkdir {output}") +# # Getting the list of graphs +# with open(input[0]) as handle: +# graphList = [graph.rstrip("\n") for graph in handle.readlines()] +# # Iterating over graphs +# for graph in graphList: +# shell("bash scripts/getPanachePAV.sh -g {graph} -d data/chrGraphs/$(basename {graph} .gfa) -o {output}/$(basename {graph} .gfa).pav.matrix.tsv -a {params.app_path} -t {threads}") + +# rule panacus_stats: +# # Produces panacus reports for a chromosome graph +# input: +# graph='data/chrGraphs/'+config['name']+'.{chromosome}.gfa' +# output: +# html='output/panacus.reports/'+config['name']+'.{chromosome}.histgrowth.html' +# params: +# app_path=config['app.path'], +# pan_name=config['name'], +# refname=config['reference'] +# threads: 8 +# shell: +# """ +# bash scripts/getPanacusHG.sh \ +# -g {input.graph} \ +# -r $(basename {params.refname} .fa.gz) \ +# -d data/chrGraphs/{wildcards.chromosome} \ +# -o {output.html} \ +# -a {params.app_path} \ +# -t {threads} +# """ + +# rule create_pan1c_report_fig: +# # Produces a markdown report figure of chromosomes graphs +# input: +# graph='data/chrGraphs/'+config['name']+'.{chromosome}.gfa', +# contigfig="output/chr.contig/{chromosome}.contig.png", +# output: +# odgifig=temp("tmp/{chromosome}.odgi.png"), +# namefig=temp("tmp/{chromosome}.name.png"), +# reportfig="output/report/"+config['name']+".{chromosome}.report.fig.png" +# threads: 4 +# params: +# app_path=config['app.path'] +# shell: +# """ +# ## Odgi 1D viz +# apptainer run --app odgi {params.app_path}/PanGeTools.sif \ +# viz -i {input.graph} -o {output.odgifig} -x 2500 -a 80 -b -H -t {threads} -P + +# ## Getting legend from contig figure +# convert {input.contigfig} -crop 790x+0+0 +repage {output.namefig} + +# odgheight=$(identify -ping -format '%h' {output.odgifig}) +# ctgheight=$(identify -ping -format '%h' {input.contigfig}) + +# combinedheight=$(($odgheight+$ctgheight)) + +# echo -e "[Debug::Report fig]\t$odgheight\t$ctgheight\t$combinedheight" + +# ## Creating empty canvas ad adding other figures to it +# convert -size "3300x$combinedheight" xc:white {output.reportfig} +# composite -geometry +0+0 {input.contigfig} {output.reportfig} {output.reportfig} +# composite -geometry "+0+$ctgheight" {output.namefig} {output.reportfig} {output.reportfig} +# composite -geometry "+790+$ctgheight" {output.odgifig} {output.reportfig} {output.reportfig} +# """ + +# rule create_pan1c_report: +# # Produces a markdown report of chromosomes graphs +# input: +# figs=expand("output/report/"+config['name']+".{chromosome}.report.fig.png", chromosome=CHRLIST), +# genstats="output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv", +# pathstats="output/stats/pan1c."+config['name']+".chrGraph.path.stats.tsv" +# output: +# reportfig="output/pan1c."+config['name']+".report.md" +# threads: 4 +# params: +# app_path=config['app.path'] +# run: +# shell("touch {output.reportfig}") + +# # Adding Summary +# ## To Do + +# # Adding General stats +# shell("echo '# General stats' >> {output.reportfig}") +# shell("cat {input.genstats} | csv2md -d $'\\t' >> {output.reportfig}") +# shell("echo '' >> {output.reportfig}") + +# # Adding Path stats +# shell("echo '# Path stats' >> {output.reportfig}") +# shell("cat {input.pathstats} | csv2md -d $'\\t' >> {output.reportfig}") +# shell("echo '' >> {output.reportfig}") + +# # Adding chromosomes figures +# fig_list = [fig for fig in input.figs] +# print(fig_list) +# fig_list.sort() - shell("echo '# Chromosome-scale odgi graphs' >> {output.reportfig}") - for fig in fig_list: - basename=os.path.basename(fig) - chr_name=basename.split('.')[1] +# shell("echo '# Chromosome-scale odgi graphs' >> {output.reportfig}") +# for fig in fig_list: +# basename=os.path.basename(fig) +# chr_name=basename.split('.')[1] - shell("echo '## {chr_name}' >> {output.reportfig}") - shell("echo '' >> {output.reportfig}") - shell("echo '' >> {output.reportfig}") \ No newline at end of file +# shell("echo '## {chr_name}' >> {output.reportfig}") +# shell("echo '' >> {output.reportfig}") +# shell("echo '' >> {output.reportfig}") \ No newline at end of file diff --git a/rules/core.smk b/rules/core.smk new file mode 100644 index 0000000..5473943 --- /dev/null +++ b/rules/core.smk @@ -0,0 +1,346 @@ +""" +Core section : Running PGGB on each chromosome and generating whole pangenome ------------------- +""" + +rule create_pggb_input_syri_fig: + input: + fasta='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz' + output: + fig="output/chrInput.syri.figs/"+config['name']+".{chromosome}.asm.syri.png", + wrkdir=directory('data/chrInput/syri/{chromosome}') + threads: 8 + params: + app_path=config["app.path"], + ref=config['reference'] + shell: + """ + mkdir {output.wrkdir} + refname=$(basename {params.ref} .fa.gz | cut -d'.' -f1) + + ## Creating single fasta from multifasta + zcat {input.fasta} | awk -F"#" \ + '/^>/ {OUT="{output.wrkdir}" substr($0,2) ".fa"}; {print >> OUT; close(OUT)}' + + ## Getting the list of sequences + asmList=() + for file in {output.wrkdir}/*.fa; do + asm="$(basename $file .fa | cut -f1 -d"#").fa" + mv $file "$(dirname $file)/$asm" + asmList+=("$asm") + done + + bash scripts/getSyriFigs.sh \ + -a {params.app_path} \ + -t {threads} \ + -d {output.wrkdir} \ + -o $(basename {output.fig}) \ + -r {output.wrkdir}/"${refname}.fa" \ + -q "${asmList[@]}" + mv {output.wrkdir}/$(basename {output.fig}) {output.fig} + rm {output.wrkdir}/*.fa + """ + +rule wfmash_on_chr: + # Run wfmash on a specific chromosome input + input: + fa='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz', + fai='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz.fai' + output: + mapping=temp("data/chrGraphs/{chromosome}/{chromosome}.wfmash.mapping.paf"), + aln=temp("data/chrGraphs/{chromosome}/{chromosome}.wfmash.aln.paf") + threads: 16 + params: + app_path=config['app.path'], + segment_length=config['wfmash.segment_length'], + mapping_id=config['wfmash.mapping_id'], + wfmash_sec=config['wfmash.secondary'] + log: + cmd_map="logs/pggb/{chromosome}.wfmash_mapping.cmd.log", + time_map="logs/pggb/{chromosome}.wfmash_mapping.time.log", + cmd_aln="logs/pggb/{chromosome}.wfmash_aln.cmd.log", + time_aln="logs/pggb/{chromosome}.wfmash_aln.time.log", + shell: + """ + ## Mapping + /usr/bin/time -v -o {log.time_map} \ + apptainer exec {params.app_path}/pggb.sif wfmash \ + -s {params.segment_length} -l $(( {params.segment_length} * 5 )) -p {params.mapping_id} \ + -n $(cat {input.fai} | wc -l) {params.wfmash_sec} -t {threads} \ + --tmp-base $(dirname {output.aln}) \ + {input.fa} --approx-map \ + 1> {output.mapping} \ + 2> >(tee {log.cmd_map} >&2) + + ## Aligning + /usr/bin/time -v -o {log.time_aln} \ + apptainer exec {params.app_path}/pggb.sif wfmash \ + -s {params.segment_length} -l $(( {params.segment_length} * 5 )) -p {params.mapping_id} \ + -n $(cat {input.fai} | wc -l) {params.wfmash_sec} -t {threads} \ + --tmp-base $(dirname {output.aln}) \ + {input.fa} -i {output.mapping} \ + --invert-filtering \ + 1> {output.aln} \ + 2> >(tee {log.cmd_aln} >&2) + + # Compressing + apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ + -@ {threads} -k {output.mapping} + + apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ + -@ {threads} -k {output.aln} + """ + +rule seqwish: + # Run seqwish on alignement produced by wfmash + input: + fa='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz', + aln=rules.wfmash_on_chr.output.aln + output: + temp("data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfa") + threads: 8 + params: + app_path=config['app.path'], + seqwish=config['seqwish.params'] + log: + cmd="logs/pggb/{chromosome}.seqwish.cmd.log", + time="logs/pggb/{chromosome}.seqwish.time.log" + shell: + """ + /usr/bin/time -v -o {log.time} \ + apptainer exec {params.app_path}/pggb.sif seqwish \ + -s {input.fa} -p {input.aln} -g {output} \ + {params.seqwish} -t {threads} \ + --temp-dir $(dirname {output}) -P 2>&1 | \ + tee {log.cmd} + + # Compressing + apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ + -@ {threads} -k {output} + """ + +rule gfaffix_on_chr: + # Run gfaffix on seqwish graph + input: + rules.seqwish.output + output: + gfa=temp("data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfaffixD.gfa"), + transform="data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfaffixD.transform.txt" + threads: 1 + params: + app_path=config['app.path'] + log: + time="logs/pggb/{chromosome}.gfaffix.time.log" + shell: + """ + /usr/bin/time -v -o {log.time} \ + apptainer exec {params.app_path}/pggb.sif gfaffix \ + {input} -o {output.gfa} -t {output.transform} \ + > /dev/null + + # Compressing + apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ + -@ {threads} -k {output.gfa} + """ + +rule odgi_postprocessing: + # Running pggb's postprocessing (mainly odgi) steps with gfaffix graph + input: + rules.gfaffix_on_chr.output.gfa + output: + gfa='data/chrGraphs/'+config['name']+'.{chromosome}.gfa' + threads: 8 + params: + app_path=config['app.path'] + log: + cmd_build="logs/pggb/{chromosome}.odgi_build.cmd.log", + time_build="logs/pggb/{chromosome}.odgi_build.time.log", + cmd_unchop="logs/pggb/{chromosome}.odgi_unchop.cmd.log", + time_unchop="logs/pggb/{chromosome}.odgi_unchop.time.log", + cmd_sort="logs/pggb/{chromosome}.odgi_sort.cmd.log", + time_sort="logs/pggb/{chromosome}.odgi_sort.time.log", + cmd_view="logs/pggb/{chromosome}.odgi_view.cmd.log", + time_view="logs/pggb/{chromosome}.odgi_view.time.log" + shell: + """ + OGfile="$(dirname {input})/$(basename {input} .gfa)" + + ## Converting to odgi format and optimizing namespace + /usr/bin/time -v -o {log.time_build} \ + apptainer run --app odgi {params.app_path}/PanGeTools.sif \ + build -t {threads} -P -g {input} -o $OGfile.og -O 2>&1 | \ + tee {log.cmd_build} + + ## Unchoping the nodes (merges unitigs into single nodes) + /usr/bin/time -v -o {log.time_unchop} \ + apptainer run --app odgi {params.app_path}/PanGeTools.sif \ + unchop -P -t {threads} -i $OGfile.og -o $OGfile.unchoped.og 2>&1 | \ + tee {log.cmd_unchop} + + ## Sorting the nodes + /usr/bin/time -v -o {log.time_sort} \ + apptainer run --app odgi {params.app_path}/PanGeTools.sif \ + sort -P -p Ygs --temp-dir $(dirname $OGfile) -t {threads} \ + -i $OGfile.unchoped.og -o $OGfile.unchoped.sorted.og 2>&1 | \ + tee {log.cmd_sort} + + ## Exporting to gfa + /usr/bin/time -v -o {log.time_view} \ + apptainer run --app odgi {params.app_path}/PanGeTools.sif \ + view -i $OGfile.unchoped.sorted.og -g \ + 1> {output.gfa} \ + 2> >(tee {log.cmd_view} >&2) + + ## Removing .og files for space savings + rm $(dirname {input})/*.og + + ## Adding metadata + python scripts/getTags.py \ + --appdir {params.app_path} --config-file config.yaml \ + > "$(dirname {input})/metadata.txt" + sed -i "/^H/r $(dirname {input})/metadata.txt" {output.gfa} + + # Compressing + # apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ + # -@ {threads} -k {output.gfa} + """ + +rule generate_graph_list: + # Generate a text file containing all created graphs + input: + expand('data/chrGraphs/'+config['name']+'.{chromosome}.gfa', chromosome=CHRLIST) + output: + "data/chrGraphs/graphsList.txt" + threads: 1 + run: + with open(output[0], "w") as handle: + for file in input: + handle.write(file+"\n") + +rule graph_squeeze: + # Using odgi to merge every subgraphs into a final one + input: + glist="data/chrGraphs/graphsList.txt", + graphs=expand('data/chrGraphs/'+config['name']+'.{chromosome}.gfa', chromosome=CHRLIST) + output: + "output/pan1c."+config['name']+".gfa" + threads: 16 + params: + app_path=config['app.path'] + shell: + """ + apptainer run --app odgi {params.app_path}/PanGeTools.sif \ + squeeze -t {threads} -O -P -f {input.glist} -o {output}.og + apptainer run --app odgi {params.app_path}/PanGeTools.sif \ + view -t {threads} -P -i {output}.og -g > {output} + rm {output}.og + """ + +rule graph_stats: + # Using GFAstats to produce stats on every chromosome graphs + input: + graph='data/chrGraphs/'+config['name']+'.{chromosome}.gfa' + output: + genstats="output/stats/chrGraphs/"+config['name']+".{chromosome}.general.stats.tsv", + pathstats="output/stats/chrGraphs/"+config['name']+".{chromosome}.path.stats.tsv" + threads: 4 + params: + app_path=config['app.path'], + pan_name=config['name'] + shell: + """ + apptainer run --app gfastats {params.app_path}/PanGeTools.sif \ + -g {input.graph} -P \ + -o $(dirname {output.genstats})/{params.pan_name}.{wildcards.chromosome} \ + -t {threads} + """ + +rule graph_figs: + # Creating figures using odgi viz + input: + graph='data/chrGraphs/'+config['name']+'.{chromosome}.gfa' + output: + oneDviz="output/chrGraphs.figs/"+config['name']+".{chromosome}.1Dviz.png", + pcov="output/chrGraphs.figs/"+config['name']+".{chromosome}.pcov.png" + threads: 4 + params: + app_path=config['app.path'], + oneDviz=config['odgi.1Dviz.params'], + pcov=config['odgi.pcov.params'] + shell: + """ + ## 1D viz + apptainer run --app odgi {params.app_path}/PanGeTools.sif \ + viz -i {input.graph} -o {output.oneDviz} {params.oneDviz} -t {threads} -P + + ## Pcov viz + apptainer run --app odgi {params.app_path}/PanGeTools.sif \ + viz -i {input.graph} -o {output.pcov} {params.pcov} -t {threads} -P + """ + +rule aggregate_graphs_stats: + # Reading and merging all stats files from chromosome graphs into a .tsv. + input: + genstats=expand("output/stats/chrGraphs/"+config['name']+".{chromosome}.general.stats.tsv", chromosome=CHRLIST) + output: + genstats="output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv", + pathstats="output/stats/pan1c."+config['name']+".chrGraph.path.stats.tsv" + params: + app_path=config['app.path'], + pan_name=config['name'] + shell: + """ + apptainer run {params.app_path}/pan1c-env.sif python scripts/chrStatsAggregation.py \ + --input $(dirname {input[0]}) --outputGeneral {output.genstats} \ + --outputPaths {output.pathstats} --panname {params.pan_name} + """ + +rule final_graph_tagging: + # Add metadata to the final GFA + input: + graph="output/pan1c."+config['name']+".gfa", + output: + "output/pan1c."+config['name']+".gfa.metadata" + threads: 1 + params: + app_path=config['app.path'], + pan_name=config['name'] + shell: + """ + python scripts/getTags.py \ + --appdir {params.app_path} --config-file config.yaml > {output} + sed -i '/^H/r {output}' {input.graph} + """ + +rule pggb_input_stats: + # Produces statistics on pggb input sequences + input: + flag="output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv" + output: + "output/stats/pan1c."+config['name']+".chrInput.stats.tsv" + params: + app_path=config['app.path'], + pan_name=config['name'] + shell: + """ + apptainer run {params.app_path}/pan1c-env.sif python scripts/chrInputStats.py \ + -f data/chrInputs/*.fa.gz -o {output} -p {params.pan_name} + """ + +rule core_statistics: + # Aggregate chrInput, chrGraph and pggb statistics into a single tsv + input: + chrInputStats="output/stats/pan1c."+config['name']+".chrInput.stats.tsv", + chrGraphStats="output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv" + output: + tsv="output/stats/pan1c."+config['name']+".core.stats.tsv", + dir=directory("output/pggb.usage.figs") + params: + app_path=config['app.path'], + pan_name=config['name'] + shell: + """ + mkdir -p {output.dir} + apptainer run {params.app_path}/pan1c-env.sif python scripts/coreStats.py \ + --pggbStats logs/pggb --chrInputStats {input.chrInputStats} \ + --chrGraphStats {input.chrGraphStats} -o {output.tsv} -f {output.dir} -p {params.pan_name} + """ \ No newline at end of file diff --git a/rules/post-analysis.smk b/rules/post-analysis.smk new file mode 100644 index 0000000..3041501 --- /dev/null +++ b/rules/post-analysis.smk @@ -0,0 +1,118 @@ +""" +Post-analysis section --------------------------------------------------------------------------- +""" +rule get_pav: + # Create PAV matrix readable by panache for a given chromosome scale graph + input: + "data/chrGraphs/graphsList.txt" + output: + directory("output/pav.matrices") + threads: 16 + params: + app_path=config['app.path'] + run: + shell("mkdir {output}") + # Getting the list of graphs + with open(input[0]) as handle: + graphList = [graph.rstrip("\n") for graph in handle.readlines()] + # Iterating over graphs + for graph in graphList: + shell("bash scripts/getPanachePAV.sh -g {graph} -d data/chrGraphs/$(basename {graph} .gfa) -o {output}/$(basename {graph} .gfa).pav.matrix.tsv -a {params.app_path} -t {threads}") + +rule panacus_stats: + # Produces panacus reports for a chromosome graph + input: + graph='data/chrGraphs/'+config['name']+'.{chromosome}.gfa' + output: + html='output/panacus.reports/'+config['name']+'.{chromosome}.histgrowth.html' + params: + app_path=config['app.path'], + pan_name=config['name'], + refname=config['reference'] + threads: 8 + shell: + """ + bash scripts/getPanacusHG.sh \ + -g {input.graph} \ + -r $(basename {params.refname} .fa.gz) \ + -d data/chrGraphs/{wildcards.chromosome} \ + -o {output.html} \ + -a {params.app_path} \ + -t {threads} + """ + +rule create_pan1c_report_fig: + # Produces a markdown report figure of chromosomes graphs + input: + graph='data/chrGraphs/'+config['name']+'.{chromosome}.gfa', + contigfig="output/chr.contig/{chromosome}.contig.png", + output: + odgifig=temp("tmp/{chromosome}.odgi.png"), + namefig=temp("tmp/{chromosome}.name.png"), + reportfig="output/report/"+config['name']+".{chromosome}.report.fig.png" + threads: 4 + params: + app_path=config['app.path'] + shell: + """ + ## Odgi 1D viz + apptainer run --app odgi {params.app_path}/PanGeTools.sif \ + viz -i {input.graph} -o {output.odgifig} -x 2500 -a 80 -b -H -t {threads} -P + + ## Getting legend from contig figure + convert {input.contigfig} -crop 790x+0+0 +repage {output.namefig} + + odgheight=$(identify -ping -format '%h' {output.odgifig}) + ctgheight=$(identify -ping -format '%h' {input.contigfig}) + + combinedheight=$(($odgheight+$ctgheight)) + + echo -e "[Debug::Report fig]\t$odgheight\t$ctgheight\t$combinedheight" + + ## Creating empty canvas ad adding other figures to it + convert -size "3300x$combinedheight" xc:white {output.reportfig} + composite -geometry +0+0 {input.contigfig} {output.reportfig} {output.reportfig} + composite -geometry "+0+$ctgheight" {output.namefig} {output.reportfig} {output.reportfig} + composite -geometry "+790+$ctgheight" {output.odgifig} {output.reportfig} {output.reportfig} + """ + +rule create_pan1c_report: + # Produces a markdown report of chromosomes graphs + input: + figs=expand("output/report/"+config['name']+".{chromosome}.report.fig.png", chromosome=CHRLIST), + genstats="output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv", + pathstats="output/stats/pan1c."+config['name']+".chrGraph.path.stats.tsv" + output: + reportfig="output/pan1c."+config['name']+".report.md" + threads: 4 + params: + app_path=config['app.path'] + run: + shell("touch {output.reportfig}") + + # Adding Summary + ## To Do + + # Adding General stats + shell("echo '# General stats' >> {output.reportfig}") + shell("cat {input.genstats} | csv2md -d $'\\t' >> {output.reportfig}") + shell("echo '' >> {output.reportfig}") + + # Adding Path stats + shell("echo '# Path stats' >> {output.reportfig}") + shell("cat {input.pathstats} | csv2md -d $'\\t' >> {output.reportfig}") + shell("echo '' >> {output.reportfig}") + + # Adding chromosomes figures + fig_list = [fig for fig in input.figs] + print(fig_list) + fig_list.sort() + + shell("echo '# Chromosome-scale odgi graphs' >> {output.reportfig}") + for fig in fig_list: + basename=os.path.basename(fig) + chr_name=basename.split('.')[1] + + shell("echo '## {chr_name}' >> {output.reportfig}") + shell("echo '' >> {output.reportfig}") + shell("echo '' >> {output.reportfig}") \ No newline at end of file diff --git a/rules/pre-processing.smk b/rules/pre-processing.smk new file mode 100644 index 0000000..3580108 --- /dev/null +++ b/rules/pre-processing.smk @@ -0,0 +1,185 @@ +""" +Pre-processing section : preparing haplotypes ---------------------------------------------------- +""" +rule ragtag_scaffolding: + # Scaffold input haplotype against the reference to infer chromosome scale sequences + input: + ref="data/haplotypes/"+config['reference'], + reffai="data/haplotypes/"+config['reference']+".fai", + refgzi="data/haplotypes/"+config['reference']+".gzi", + fa="data/haplotypes/{haplotype}.fa.gz" + output: + fa="data/hap.ragtagged/{haplotype}.ragtagged.fa.gz", + tar="data/hap.ragtagged/{haplotype}.tar.gz" + threads: 8 + retries: 1 + params: + app_path=config["app.path"] + shell: + """ + bash scripts/ragtagChromInfer.sh \ + -d $(dirname {output.fa})/{wildcards.haplotype} \ + -a {params.app_path} \ + -t {threads} \ + -r {input.ref} \ + -q {input.fa} \ + -o {output.fa} + + #if [[ -z $(zgrep '[^[:space:]]' {output.fa}) ]] ; then + # echo "Error : Empty final fasta" + # exit 1 + #fi + """ + +rule quast_stats: + # Run Quast on ragtagged genomes + input: + fas=expand("data/haplotypes/{haplotype}.fa.gz", haplotype=SAMPLES_NOREF), + ref="data/haplotypes/"+config['reference'] + output: + report="output/quast/"+config['name']+".quast.report.html" + threads: 8 + params: + app_path=config["app.path"], + pan_name=config["name"] + shell: + """ + apptainer run {params.app_path}/pan1c-env.sif quast.py \ + -t {threads} \ + -o "$(dirname {output.report})" \ + -r {input.ref} \ + --plots-format png \ + --no-read-stats \ + --no-icarus \ + -s \ + --large \ + {input.fas} + + # Compressing temporary files + apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ + -@ {threads} $(dirname {output.report})/contigs_reports/*.fa + apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ + -@ {threads} $(dirname {output.report})/contigs_reports/minimap_output/* + + mv $(dirname {output.report})/report.html {output.report} + """ + +rule contig_position: + # Produce figures with contig positions + input: + fa="data/chrInputs/"+config["name"]+".{chromosome}.fa.gz", + fai="data/chrInputs/"+config["name"]+".{chromosome}.fa.gz.fai" + output: + fig="output/chr.contig/{chromosome}.contig.png", + outdir=directory("output/chr.contig/{chromosome}") + threads: 1 + params: + app_path=config["app.path"] + shell: + """ + mkdir {output.outdir} + base="{output.outdir}" + hapID=$(echo {wildcards.chromosome} | sed 's/.hap/#/') + + # Creating .bands file + apptainer run {params.app_path}/pan1c-env.sif python scripts/fasta_not_NNN_gff3.py \ + -s 5 \ + -f {input.fa} \ + > $base/{wildcards.chromosome}.gff + + #cat $base/{wildcards.chromosome}.gff + + awk '{{a++; if ((a/2)==int(a/2)){{b="gpos25"}}else{{b="gpos75"}}; print $1"\\t"$4"\\t"$5"\\tcontig_"a"\\t"b}}' $base/{wildcards.chromosome}.gff | \ + sed '1ichr\\tstart\\tend\\tname\\tgieStain' \ + > $base/{wildcards.chromosome}.bands + + #cat $base/{wildcards.chromosome}.bands + + # Creating bed like file + cat {input.fai} | \ + cut -f1,2 | tr '\\t' ',' | \ + sed "s/,/,1,/" | \ + sed "1i chr\\tstart\\tend" | \ + tr ',' '\\t' \ + > $base/{wildcards.chromosome}.tsv + + #cat $base/{wildcards.chromosome}.tsv + + # Creating figure + apptainer run --app Renv {params.app_path}/pan1c-env.sif Rscript scripts/contig_position.R \ + -b $base/{wildcards.chromosome}.bands \ + -t $base/{wildcards.chromosome}.tsv \ + -o {output.fig} + """ + +rule clustering: + # Read ragtagged fastas and split chromosome sequences into according bins + input: + expand('data/hap.ragtagged/{haplotype}.ragtagged.fa.gz', haplotype=SAMPLES) + output: + expand('data/chrInputs/'+config['name']+'.{chromosome}.fa.gz', chromosome=CHRLIST) + threads: workflow.cores + params: + app_path=config["app.path"], + pan_name=config["name"] + shell: + """ + mkdir -p $(dirname {output[0]}) + apptainer run {params.app_path}/pan1c-env.sif python scripts/inputClustering.py \ + --fasta {input} --output $(dirname {output[0]}) --panname {params.pan_name} + for file in $(dirname {output[0]})/*.fa; do + apptainer run --app bgzip {params.app_path}/PanGeTools.sif -@ {threads} $file + done + """ + +rule create_allAsm_syri_fig: + # Create a SyRI figure containing all input assemblies + input: + ref="data/hap.ragtagged/"+config['reference'][:-5]+"ragtagged.fa.gz", + qry=expand('data/hap.ragtagged/{haplotype}.ragtagged.fa.gz', haplotype=SAMPLES) + output: + fig="output/asm.syri.figs/pan1c."+config['name']+".allAsm.syri.png", + wrkdir=directory('data/asm.syri/all') + threads: 8 + params: + app_path=config["app.path"] + shell: + """ + mkdir {output.wrkdir} + + bash scripts/getSyriFigs.sh \ + -a {params.app_path} \ + -t {threads} \ + -d {output.wrkdir} \ + -o $(basename {output.fig}) \ + -r {input.ref} \ + -q "{input.qry}" + mv {output.wrkdir}/$(basename {output.fig}) {output.fig} + """ + +rule create_sglAsm_syri_fig: + # Create a SyRI figure for a single input assembly + input: + ref="data/hap.ragtagged/"+config['reference'][:-5]+"ragtagged.fa.gz", + qry="data/hap.ragtagged/{haplotype}.ragtagged.fa.gz" + output: + fig="output/asm.syri.figs/pan1c."+config['name']+".{haplotype}.syri.png", + wrkdir=directory('data/asm.syri/{haplotype}') + threads: 4 + retries: 1 + resources: + mem_gb=48 + params: + app_path=config["app.path"] + shell: + """ + mkdir -p {output.wrkdir} + bash scripts/getSyriFigs.sh \ + -a {params.app_path} \ + -t {threads} \ + -d {output.wrkdir} \ + -o $(basename {output.fig}) \ + -r {input.ref} \ + -q "{input.qry}" + mv {output.wrkdir}/$(basename {output.fig}) {output.fig} + """ \ No newline at end of file -- GitLab From b6252858bb0f5c45e42b29ed5021f27283520905 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Wed, 26 Jun 2024 16:50:59 +0200 Subject: [PATCH 35/86] Update Snakefile --- Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index 0577e00..26409c2 100644 --- a/Snakefile +++ b/Snakefile @@ -54,7 +54,7 @@ def which_analysis(): if config["get_ASMs_SyRI"] == "True": # Creating SyRI for each figures analysis_inputs.append( - expand("output/asm.syri.figs/pan1c."+config['name']+".{haplotype}.syri.png", haplotype=SAMPLES_NOREF), + expand("output/asm.syri.figs/pan1c."+config['name']+".{haplotype}.syri.png", haplotype=SAMPLES), ) if config["run_Quast"] == "True": # Running Quast on input haplotypes analysis_inputs.append( -- GitLab From 49764da51cf0554ed70f701e0b3419f76c6742c8 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Wed, 26 Jun 2024 16:53:14 +0200 Subject: [PATCH 36/86] Update Snakefile --- Snakefile | 372 +++++++++++++++++++++++++++--------------------------- 1 file changed, 186 insertions(+), 186 deletions(-) diff --git a/Snakefile b/Snakefile index 26409c2..2f20878 100644 --- a/Snakefile +++ b/Snakefile @@ -1,7 +1,7 @@ configfile: "config.yaml" include: "rules/tools.smk" -include: "rules/pre-processing.smk" +#include: "rules/pre-processing.smk" include: "rules/core.smk" include: "rules/post-analysis.smk" @@ -54,7 +54,7 @@ def which_analysis(): if config["get_ASMs_SyRI"] == "True": # Creating SyRI for each figures analysis_inputs.append( - expand("output/asm.syri.figs/pan1c."+config['name']+".{haplotype}.syri.png", haplotype=SAMPLES), + expand("output/asm.syri.figs/pan1c."+config['name']+".{haplotype}.syri.png", haplotype=SAMPLES_NOREF), ) if config["run_Quast"] == "True": # Running Quast on input haplotypes analysis_inputs.append( @@ -99,191 +99,191 @@ rule all: # "apptainer run --app samtools {params.app_path}/PanGeTools.sif " # "faidx {input.fa}" -# """ -# Pre-processing section : preparing pggb inputs --------------------------------------------------- -# """ -# rule ragtag_scaffolding: -# # Scaffold input haplotype against the reference to infer chromosome scale sequences -# input: -# ref="data/haplotypes/"+config['reference'], -# reffai="data/haplotypes/"+config['reference']+".fai", -# refgzi="data/haplotypes/"+config['reference']+".gzi", -# fa="data/haplotypes/{haplotype}.fa.gz" -# output: -# fa="data/hap.ragtagged/{haplotype}.ragtagged.fa.gz", -# tar="data/hap.ragtagged/{haplotype}.tar.gz" -# threads: 8 -# retries: 1 -# params: -# app_path=config["app.path"] -# shell: -# """ -# bash scripts/ragtagChromInfer.sh \ -# -d $(dirname {output.fa})/{wildcards.haplotype} \ -# -a {params.app_path} \ -# -t {threads} \ -# -r {input.ref} \ -# -q {input.fa} \ -# -o {output.fa} - -# #if [[ -z $(zgrep '[^[:space:]]' {output.fa}) ]] ; then -# # echo "Error : Empty final fasta" -# # exit 1 -# #fi -# """ - -# rule quast_stats: -# # Run Quast on ragtagged genomes -# input: -# fas=expand("data/haplotypes/{haplotype}.fa.gz", haplotype=SAMPLES_NOREF), -# ref="data/haplotypes/"+config['reference'] -# output: -# report="output/quast/"+config['name']+".quast.report.html" -# threads: 8 -# params: -# app_path=config["app.path"], -# pan_name=config["name"] -# shell: -# """ -# apptainer run {params.app_path}/pan1c-env.sif quast.py \ -# -t {threads} \ -# -o "$(dirname {output.report})" \ -# -r {input.ref} \ -# --plots-format png \ -# --no-read-stats \ -# --no-icarus \ -# -s \ -# --large \ -# {input.fas} - -# # Compressing temporary files -# apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ -# -@ {threads} $(dirname {output.report})/contigs_reports/*.fa -# apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ -# -@ {threads} $(dirname {output.report})/contigs_reports/minimap_output/* - -# mv $(dirname {output.report})/report.html {output.report} -# """ - -# rule contig_position: -# # Produce figures with contig positions -# input: -# fa="data/chrInputs/"+config["name"]+".{chromosome}.fa.gz", -# fai="data/chrInputs/"+config["name"]+".{chromosome}.fa.gz.fai" -# output: -# fig="output/chr.contig/{chromosome}.contig.png", -# outdir=directory("output/chr.contig/{chromosome}") -# threads: 1 -# params: -# app_path=config["app.path"] -# shell: -# """ -# mkdir {output.outdir} -# base="{output.outdir}" -# hapID=$(echo {wildcards.chromosome} | sed 's/.hap/#/') +""" +Pre-processing section : preparing pggb inputs --------------------------------------------------- +""" +rule ragtag_scaffolding: + # Scaffold input haplotype against the reference to infer chromosome scale sequences + input: + ref="data/haplotypes/"+config['reference'], + reffai="data/haplotypes/"+config['reference']+".fai", + refgzi="data/haplotypes/"+config['reference']+".gzi", + fa="data/haplotypes/{haplotype}.fa.gz" + output: + fa="data/hap.ragtagged/{haplotype}.ragtagged.fa.gz", + tar="data/hap.ragtagged/{haplotype}.tar.gz" + threads: 8 + retries: 1 + params: + app_path=config["app.path"] + shell: + """ + bash scripts/ragtagChromInfer.sh \ + -d $(dirname {output.fa})/{wildcards.haplotype} \ + -a {params.app_path} \ + -t {threads} \ + -r {input.ref} \ + -q {input.fa} \ + -o {output.fa} + + #if [[ -z $(zgrep '[^[:space:]]' {output.fa}) ]] ; then + # echo "Error : Empty final fasta" + # exit 1 + #fi + """ + +rule quast_stats: + # Run Quast on ragtagged genomes + input: + fas=expand("data/haplotypes/{haplotype}.fa.gz", haplotype=SAMPLES_NOREF), + ref="data/haplotypes/"+config['reference'] + output: + report="output/quast/"+config['name']+".quast.report.html" + threads: 8 + params: + app_path=config["app.path"], + pan_name=config["name"] + shell: + """ + apptainer run {params.app_path}/pan1c-env.sif quast.py \ + -t {threads} \ + -o "$(dirname {output.report})" \ + -r {input.ref} \ + --plots-format png \ + --no-read-stats \ + --no-icarus \ + -s \ + --large \ + {input.fas} + + # Compressing temporary files + apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ + -@ {threads} $(dirname {output.report})/contigs_reports/*.fa + apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ + -@ {threads} $(dirname {output.report})/contigs_reports/minimap_output/* + + mv $(dirname {output.report})/report.html {output.report} + """ + +rule contig_position: + # Produce figures with contig positions + input: + fa="data/chrInputs/"+config["name"]+".{chromosome}.fa.gz", + fai="data/chrInputs/"+config["name"]+".{chromosome}.fa.gz.fai" + output: + fig="output/chr.contig/{chromosome}.contig.png", + outdir=directory("output/chr.contig/{chromosome}") + threads: 1 + params: + app_path=config["app.path"] + shell: + """ + mkdir {output.outdir} + base="{output.outdir}" + hapID=$(echo {wildcards.chromosome} | sed 's/.hap/#/') -# # Creating .bands file -# apptainer run {params.app_path}/pan1c-env.sif python scripts/fasta_not_NNN_gff3.py \ -# -s 5 \ -# -f {input.fa} \ -# > $base/{wildcards.chromosome}.gff - -# #cat $base/{wildcards.chromosome}.gff - -# awk '{{a++; if ((a/2)==int(a/2)){{b="gpos25"}}else{{b="gpos75"}}; print $1"\\t"$4"\\t"$5"\\tcontig_"a"\\t"b}}' $base/{wildcards.chromosome}.gff | \ -# sed '1ichr\\tstart\\tend\\tname\\tgieStain' \ -# > $base/{wildcards.chromosome}.bands - -# #cat $base/{wildcards.chromosome}.bands - -# # Creating bed like file -# cat {input.fai} | \ -# cut -f1,2 | tr '\\t' ',' | \ -# sed "s/,/,1,/" | \ -# sed "1i chr\\tstart\\tend" | \ -# tr ',' '\\t' \ -# > $base/{wildcards.chromosome}.tsv - -# #cat $base/{wildcards.chromosome}.tsv - -# # Creating figure -# apptainer run --app Renv {params.app_path}/pan1c-env.sif Rscript scripts/contig_position.R \ -# -b $base/{wildcards.chromosome}.bands \ -# -t $base/{wildcards.chromosome}.tsv \ -# -o {output.fig} -# """ - -# rule clustering: -# # Read ragtagged fastas and split chromosome sequences into according bins -# input: -# expand('data/hap.ragtagged/{haplotype}.ragtagged.fa.gz', haplotype=SAMPLES) -# output: -# expand('data/chrInputs/'+config['name']+'.{chromosome}.fa.gz', chromosome=CHRLIST) -# threads: workflow.cores -# params: -# app_path=config["app.path"], -# pan_name=config["name"] -# shell: -# """ -# mkdir -p $(dirname {output[0]}) -# apptainer run {params.app_path}/pan1c-env.sif python scripts/inputClustering.py \ -# --fasta {input} --output $(dirname {output[0]}) --panname {params.pan_name} -# for file in $(dirname {output[0]})/*.fa; do -# apptainer run --app bgzip {params.app_path}/PanGeTools.sif -@ {threads} $file -# done -# """ - -# rule create_allAsm_syri_fig: -# # Create a SyRI figure containing all input assemblies -# input: -# ref="data/hap.ragtagged/"+config['reference'][:-5]+"ragtagged.fa.gz", -# qry=expand('data/hap.ragtagged/{haplotype}.ragtagged.fa.gz', haplotype=SAMPLES) -# output: -# fig="output/asm.syri.figs/pan1c."+config['name']+".allAsm.syri.png", -# wrkdir=directory('data/asm.syri/all') -# threads: 8 -# params: -# app_path=config["app.path"] -# shell: -# """ -# mkdir {output.wrkdir} - -# bash scripts/getSyriFigs.sh \ -# -a {params.app_path} \ -# -t {threads} \ -# -d {output.wrkdir} \ -# -o $(basename {output.fig}) \ -# -r {input.ref} \ -# -q "{input.qry}" -# mv {output.wrkdir}/$(basename {output.fig}) {output.fig} -# """ - -# rule create_sglAsm_syri_fig: -# # Create a SyRI figure for a single input assembly -# input: -# ref="data/hap.ragtagged/"+config['reference'][:-5]+"ragtagged.fa.gz", -# qry="data/hap.ragtagged/{haplotype}.ragtagged.fa.gz" -# output: -# fig="output/asm.syri.figs/pan1c."+config['name']+".{haplotype}.syri.png", -# wrkdir=directory('data/asm.syri/{haplotype}') -# threads: 4 -# retries: 1 -# resources: -# mem_gb=48 -# params: -# app_path=config["app.path"] -# shell: -# """ -# mkdir -p {output.wrkdir} -# bash scripts/getSyriFigs.sh \ -# -a {params.app_path} \ -# -t {threads} \ -# -d {output.wrkdir} \ -# -o $(basename {output.fig}) \ -# -r {input.ref} \ -# -q "{input.qry}" -# mv {output.wrkdir}/$(basename {output.fig}) {output.fig} -# """ + # Creating .bands file + apptainer run {params.app_path}/pan1c-env.sif python scripts/fasta_not_NNN_gff3.py \ + -s 5 \ + -f {input.fa} \ + > $base/{wildcards.chromosome}.gff + + #cat $base/{wildcards.chromosome}.gff + + awk '{{a++; if ((a/2)==int(a/2)){{b="gpos25"}}else{{b="gpos75"}}; print $1"\\t"$4"\\t"$5"\\tcontig_"a"\\t"b}}' $base/{wildcards.chromosome}.gff | \ + sed '1ichr\\tstart\\tend\\tname\\tgieStain' \ + > $base/{wildcards.chromosome}.bands + + #cat $base/{wildcards.chromosome}.bands + + # Creating bed like file + cat {input.fai} | \ + cut -f1,2 | tr '\\t' ',' | \ + sed "s/,/,1,/" | \ + sed "1i chr\\tstart\\tend" | \ + tr ',' '\\t' \ + > $base/{wildcards.chromosome}.tsv + + #cat $base/{wildcards.chromosome}.tsv + + # Creating figure + apptainer run --app Renv {params.app_path}/pan1c-env.sif Rscript scripts/contig_position.R \ + -b $base/{wildcards.chromosome}.bands \ + -t $base/{wildcards.chromosome}.tsv \ + -o {output.fig} + """ + +rule clustering: + # Read ragtagged fastas and split chromosome sequences into according bins + input: + expand('data/hap.ragtagged/{haplotype}.ragtagged.fa.gz', haplotype=SAMPLES) + output: + expand('data/chrInputs/'+config['name']+'.{chromosome}.fa.gz', chromosome=CHRLIST) + threads: workflow.cores + params: + app_path=config["app.path"], + pan_name=config["name"] + shell: + """ + mkdir -p $(dirname {output[0]}) + apptainer run {params.app_path}/pan1c-env.sif python scripts/inputClustering.py \ + --fasta {input} --output $(dirname {output[0]}) --panname {params.pan_name} + for file in $(dirname {output[0]})/*.fa; do + apptainer run --app bgzip {params.app_path}/PanGeTools.sif -@ {threads} $file + done + """ + +rule create_allAsm_syri_fig: + # Create a SyRI figure containing all input assemblies + input: + ref="data/hap.ragtagged/"+config['reference'][:-5]+"ragtagged.fa.gz", + qry=expand('data/hap.ragtagged/{haplotype}.ragtagged.fa.gz', haplotype=SAMPLES) + output: + fig="output/asm.syri.figs/pan1c."+config['name']+".allAsm.syri.png", + wrkdir=directory('data/asm.syri/all') + threads: 8 + params: + app_path=config["app.path"] + shell: + """ + mkdir {output.wrkdir} + + bash scripts/getSyriFigs.sh \ + -a {params.app_path} \ + -t {threads} \ + -d {output.wrkdir} \ + -o $(basename {output.fig}) \ + -r {input.ref} \ + -q "{input.qry}" + mv {output.wrkdir}/$(basename {output.fig}) {output.fig} + """ + +rule create_sglAsm_syri_fig: + # Create a SyRI figure for a single input assembly + input: + ref="data/hap.ragtagged/"+config['reference'][:-5]+"ragtagged.fa.gz", + qry="data/hap.ragtagged/{haplotype}.ragtagged.fa.gz" + output: + fig="output/asm.syri.figs/pan1c."+config['name']+".{haplotype}.syri.png", + wrkdir=directory('data/asm.syri/{haplotype}') + threads: 4 + retries: 1 + resources: + mem_gb=48 + params: + app_path=config["app.path"] + shell: + """ + mkdir -p {output.wrkdir} + bash scripts/getSyriFigs.sh \ + -a {params.app_path} \ + -t {threads} \ + -d {output.wrkdir} \ + -o $(basename {output.fig}) \ + -r {input.ref} \ + -q "{input.qry}" + mv {output.wrkdir}/$(basename {output.fig}) {output.fig} + """ # """ # Core section : Running PGGB -- GitLab From 97b12904ca91a4400f58768558b83fc92a6f50e8 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Wed, 26 Jun 2024 16:59:15 +0200 Subject: [PATCH 37/86] Revert "Update Snakefile" This reverts commit 8b78a0262b616a724a1b13074dcab1ac45515c0d. --- Snakefile | 953 +++++++++++++++++++-------------------- rules/core.smk | 346 -------------- rules/post-analysis.smk | 118 ----- rules/pre-processing.smk | 185 -------- 4 files changed, 469 insertions(+), 1133 deletions(-) delete mode 100644 rules/core.smk delete mode 100644 rules/post-analysis.smk delete mode 100644 rules/pre-processing.smk diff --git a/Snakefile b/Snakefile index 2f20878..e3818f2 100644 --- a/Snakefile +++ b/Snakefile @@ -1,9 +1,7 @@ configfile: "config.yaml" include: "rules/tools.smk" -#include: "rules/pre-processing.smk" -include: "rules/core.smk" -include: "rules/post-analysis.smk" + ## Modules import os @@ -82,23 +80,6 @@ rule all: "output/pan1c."+config['name']+".gfa.metadata", # Metadata for the final (also in top of gfa files as # line) which_analysis() -# """ -# Tool rules --------------------------------------------------------------------------------------- -# """ -# rule samtools_index: -# # Samtools faidx to index compressed fasta -# input: -# fa="{sample}.fa.gz" -# output: -# "{sample}.fa.gz.fai", -# "{sample}.fa.gz.gzi" -# threads: 1 -# params: -# app_path=config["app.path"] -# shell: -# "apptainer run --app samtools {params.app_path}/PanGeTools.sif " -# "faidx {input.fa}" - """ Pre-processing section : preparing pggb inputs --------------------------------------------------- """ @@ -285,471 +266,475 @@ rule create_sglAsm_syri_fig: mv {output.wrkdir}/$(basename {output.fig}) {output.fig} """ -# """ -# Core section : Running PGGB -# """ - -# rule create_pggb_input_syri_fig: -# input: -# fasta='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz' -# output: -# fig="output/chrInput.syri.figs/"+config['name']+".{chromosome}.asm.syri.png", -# wrkdir=directory('data/chrInput/syri/{chromosome}') -# threads: 8 -# params: -# app_path=config["app.path"], -# ref=config['reference'] -# shell: -# """ -# mkdir {output.wrkdir} -# refname=$(basename {params.ref} .fa.gz | cut -d'.' -f1) - -# ## Creating single fasta from multifasta -# zcat {input.fasta} | awk -F"#" \ -# '/^>/ {OUT="{output.wrkdir}" substr($0,2) ".fa"}; {print >> OUT; close(OUT)}' +""" + +Core section : Running PGGB + +""" + +rule create_pggb_input_syri_fig: + input: + fasta='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz' + output: + fig="output/chrInput.syri.figs/"+config['name']+".{chromosome}.asm.syri.png", + wrkdir=directory('data/chrInput/syri/{chromosome}') + threads: 8 + params: + app_path=config["app.path"], + ref=config['reference'] + shell: + """ + mkdir {output.wrkdir} + refname=$(basename {params.ref} .fa.gz | cut -d'.' -f1) + + ## Creating single fasta from multifasta + zcat {input.fasta} | awk -F"#" \ + '/^>/ {OUT="{output.wrkdir}" substr($0,2) ".fa"}; {print >> OUT; close(OUT)}' -# ## Getting the list of sequences -# asmList=() -# for file in {output.wrkdir}/*.fa; do -# asm="$(basename $file .fa | cut -f1 -d"#").fa" -# mv $file "$(dirname $file)/$asm" -# asmList+=("$asm") -# done - -# bash scripts/getSyriFigs.sh \ -# -a {params.app_path} \ -# -t {threads} \ -# -d {output.wrkdir} \ -# -o $(basename {output.fig}) \ -# -r {output.wrkdir}/"${refname}.fa" \ -# -q "${asmList[@]}" -# mv {output.wrkdir}/$(basename {output.fig}) {output.fig} -# rm {output.wrkdir}/*.fa -# """ - -# rule wfmash_on_chr: -# # Run wfmash on a specific chromosome input -# input: -# fa='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz', -# fai='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz.fai' -# output: -# mapping=temp("data/chrGraphs/{chromosome}/{chromosome}.wfmash.mapping.paf"), -# aln=temp("data/chrGraphs/{chromosome}/{chromosome}.wfmash.aln.paf") -# threads: 16 -# params: -# app_path=config['app.path'], -# segment_length=config['wfmash.segment_length'], -# mapping_id=config['wfmash.mapping_id'], -# wfmash_sec=config['wfmash.secondary'] -# log: -# cmd_map="logs/pggb/{chromosome}.wfmash_mapping.cmd.log", -# time_map="logs/pggb/{chromosome}.wfmash_mapping.time.log", -# cmd_aln="logs/pggb/{chromosome}.wfmash_aln.cmd.log", -# time_aln="logs/pggb/{chromosome}.wfmash_aln.time.log", -# shell: -# """ -# ## Mapping -# /usr/bin/time -v -o {log.time_map} \ -# apptainer exec {params.app_path}/pggb.sif wfmash \ -# -s {params.segment_length} -l $(( {params.segment_length} * 5 )) -p {params.mapping_id} \ -# -n $(cat {input.fai} | wc -l) {params.wfmash_sec} -t {threads} \ -# --tmp-base $(dirname {output.aln}) \ -# {input.fa} --approx-map \ -# 1> {output.mapping} \ -# 2> >(tee {log.cmd_map} >&2) - -# ## Aligning -# /usr/bin/time -v -o {log.time_aln} \ -# apptainer exec {params.app_path}/pggb.sif wfmash \ -# -s {params.segment_length} -l $(( {params.segment_length} * 5 )) -p {params.mapping_id} \ -# -n $(cat {input.fai} | wc -l) {params.wfmash_sec} -t {threads} \ -# --tmp-base $(dirname {output.aln}) \ -# {input.fa} -i {output.mapping} \ -# --invert-filtering \ -# 1> {output.aln} \ -# 2> >(tee {log.cmd_aln} >&2) - -# # Compressing -# apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ -# -@ {threads} -k {output.mapping} - -# apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ -# -@ {threads} -k {output.aln} -# """ - -# rule seqwish: -# # Run seqwish on alignement produced by wfmash -# input: -# fa='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz', -# aln=rules.wfmash_on_chr.output.aln -# output: -# temp("data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfa") -# threads: 8 -# params: -# app_path=config['app.path'], -# seqwish=config['seqwish.params'] -# log: -# cmd="logs/pggb/{chromosome}.seqwish.cmd.log", -# time="logs/pggb/{chromosome}.seqwish.time.log" -# shell: -# """ -# /usr/bin/time -v -o {log.time} \ -# apptainer exec {params.app_path}/pggb.sif seqwish \ -# -s {input.fa} -p {input.aln} -g {output} \ -# {params.seqwish} -t {threads} \ -# --temp-dir $(dirname {output}) -P 2>&1 | \ -# tee {log.cmd} - -# # Compressing -# apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ -# -@ {threads} -k {output} -# """ - -# rule gfaffix_on_chr: -# # Run gfaffix on seqwish graph -# input: -# rules.seqwish.output -# output: -# gfa=temp("data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfaffixD.gfa"), -# transform="data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfaffixD.transform.txt" -# threads: 1 -# params: -# app_path=config['app.path'] -# log: -# time="logs/pggb/{chromosome}.gfaffix.time.log" -# shell: -# """ -# /usr/bin/time -v -o {log.time} \ -# apptainer exec {params.app_path}/pggb.sif gfaffix \ -# {input} -o {output.gfa} -t {output.transform} \ -# > /dev/null - -# # Compressing -# apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ -# -@ {threads} -k {output.gfa} -# """ - -# rule odgi_postprocessing: -# # Running pggb's postprocessing (mainly odgi) steps with gfaffix graph -# input: -# rules.gfaffix_on_chr.output.gfa -# output: -# gfa='data/chrGraphs/'+config['name']+'.{chromosome}.gfa' -# threads: 8 -# params: -# app_path=config['app.path'] -# log: -# cmd_build="logs/pggb/{chromosome}.odgi_build.cmd.log", -# time_build="logs/pggb/{chromosome}.odgi_build.time.log", -# cmd_unchop="logs/pggb/{chromosome}.odgi_unchop.cmd.log", -# time_unchop="logs/pggb/{chromosome}.odgi_unchop.time.log", -# cmd_sort="logs/pggb/{chromosome}.odgi_sort.cmd.log", -# time_sort="logs/pggb/{chromosome}.odgi_sort.time.log", -# cmd_view="logs/pggb/{chromosome}.odgi_view.cmd.log", -# time_view="logs/pggb/{chromosome}.odgi_view.time.log" -# shell: -# """ -# OGfile="$(dirname {input})/$(basename {input} .gfa)" - -# ## Converting to odgi format and optimizing namespace -# /usr/bin/time -v -o {log.time_build} \ -# apptainer run --app odgi {params.app_path}/PanGeTools.sif \ -# build -t {threads} -P -g {input} -o $OGfile.og -O 2>&1 | \ -# tee {log.cmd_build} + ## Getting the list of sequences + asmList=() + for file in {output.wrkdir}/*.fa; do + asm="$(basename $file .fa | cut -f1 -d"#").fa" + mv $file "$(dirname $file)/$asm" + asmList+=("$asm") + done + + bash scripts/getSyriFigs.sh \ + -a {params.app_path} \ + -t {threads} \ + -d {output.wrkdir} \ + -o $(basename {output.fig}) \ + -r {output.wrkdir}/"${refname}.fa" \ + -q "${asmList[@]}" + mv {output.wrkdir}/$(basename {output.fig}) {output.fig} + rm {output.wrkdir}/*.fa + """ + +rule wfmash_on_chr: + # Run wfmash on a specific chromosome input + input: + fa='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz', + fai='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz.fai' + output: + mapping=temp("data/chrGraphs/{chromosome}/{chromosome}.wfmash.mapping.paf"), + aln=temp("data/chrGraphs/{chromosome}/{chromosome}.wfmash.aln.paf") + threads: 16 + params: + app_path=config['app.path'], + segment_length=config['wfmash.segment_length'], + mapping_id=config['wfmash.mapping_id'], + wfmash_sec=config['wfmash.secondary'] + log: + cmd_map="logs/pggb/{chromosome}.wfmash_mapping.cmd.log", + time_map="logs/pggb/{chromosome}.wfmash_mapping.time.log", + cmd_aln="logs/pggb/{chromosome}.wfmash_aln.cmd.log", + time_aln="logs/pggb/{chromosome}.wfmash_aln.time.log", + shell: + """ + ## Mapping + /usr/bin/time -v -o {log.time_map} \ + apptainer exec {params.app_path}/pggb.sif wfmash \ + -s {params.segment_length} -l $(( {params.segment_length} * 5 )) -p {params.mapping_id} \ + -n $(cat {input.fai} | wc -l) {params.wfmash_sec} -t {threads} \ + --tmp-base $(dirname {output.aln}) \ + {input.fa} --approx-map \ + 1> {output.mapping} \ + 2> >(tee {log.cmd_map} >&2) + + ## Aligning + /usr/bin/time -v -o {log.time_aln} \ + apptainer exec {params.app_path}/pggb.sif wfmash \ + -s {params.segment_length} -l $(( {params.segment_length} * 5 )) -p {params.mapping_id} \ + -n $(cat {input.fai} | wc -l) {params.wfmash_sec} -t {threads} \ + --tmp-base $(dirname {output.aln}) \ + {input.fa} -i {output.mapping} \ + --invert-filtering \ + 1> {output.aln} \ + 2> >(tee {log.cmd_aln} >&2) + + # Compressing + apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ + -@ {threads} -k {output.mapping} + + apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ + -@ {threads} -k {output.aln} + """ + +rule seqwish: + # Run seqwish on alignement produced by wfmash + input: + fa='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz', + aln=rules.wfmash_on_chr.output.aln + output: + temp("data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfa") + threads: 8 + params: + app_path=config['app.path'], + seqwish=config['seqwish.params'] + log: + cmd="logs/pggb/{chromosome}.seqwish.cmd.log", + time="logs/pggb/{chromosome}.seqwish.time.log" + shell: + """ + /usr/bin/time -v -o {log.time} \ + apptainer exec {params.app_path}/pggb.sif seqwish \ + -s {input.fa} -p {input.aln} -g {output} \ + {params.seqwish} -t {threads} \ + --temp-dir $(dirname {output}) -P 2>&1 | \ + tee {log.cmd} + + # Compressing + apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ + -@ {threads} -k {output} + """ + +rule gfaffix_on_chr: + # Run gfaffix on seqwish graph + input: + rules.seqwish.output + output: + gfa=temp("data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfaffixD.gfa"), + transform="data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfaffixD.transform.txt" + threads: 1 + params: + app_path=config['app.path'] + log: + time="logs/pggb/{chromosome}.gfaffix.time.log" + shell: + """ + /usr/bin/time -v -o {log.time} \ + apptainer exec {params.app_path}/pggb.sif gfaffix \ + {input} -o {output.gfa} -t {output.transform} \ + > /dev/null + + # Compressing + apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ + -@ {threads} -k {output.gfa} + """ + +rule odgi_postprocessing: + # Running pggb's postprocessing (mainly odgi) steps with gfaffix graph + input: + rules.gfaffix_on_chr.output.gfa + output: + gfa='data/chrGraphs/'+config['name']+'.{chromosome}.gfa' + threads: 8 + params: + app_path=config['app.path'] + log: + cmd_build="logs/pggb/{chromosome}.odgi_build.cmd.log", + time_build="logs/pggb/{chromosome}.odgi_build.time.log", + cmd_unchop="logs/pggb/{chromosome}.odgi_unchop.cmd.log", + time_unchop="logs/pggb/{chromosome}.odgi_unchop.time.log", + cmd_sort="logs/pggb/{chromosome}.odgi_sort.cmd.log", + time_sort="logs/pggb/{chromosome}.odgi_sort.time.log", + cmd_view="logs/pggb/{chromosome}.odgi_view.cmd.log", + time_view="logs/pggb/{chromosome}.odgi_view.time.log" + shell: + """ + OGfile="$(dirname {input})/$(basename {input} .gfa)" + + ## Converting to odgi format and optimizing namespace + /usr/bin/time -v -o {log.time_build} \ + apptainer run --app odgi {params.app_path}/PanGeTools.sif \ + build -t {threads} -P -g {input} -o $OGfile.og -O 2>&1 | \ + tee {log.cmd_build} -# ## Unchoping the nodes (merges unitigs into single nodes) -# /usr/bin/time -v -o {log.time_unchop} \ -# apptainer run --app odgi {params.app_path}/PanGeTools.sif \ -# unchop -P -t {threads} -i $OGfile.og -o $OGfile.unchoped.og 2>&1 | \ -# tee {log.cmd_unchop} - -# ## Sorting the nodes -# /usr/bin/time -v -o {log.time_sort} \ -# apptainer run --app odgi {params.app_path}/PanGeTools.sif \ -# sort -P -p Ygs --temp-dir $(dirname $OGfile) -t {threads} \ -# -i $OGfile.unchoped.og -o $OGfile.unchoped.sorted.og 2>&1 | \ -# tee {log.cmd_sort} - -# ## Exporting to gfa -# /usr/bin/time -v -o {log.time_view} \ -# apptainer run --app odgi {params.app_path}/PanGeTools.sif \ -# view -i $OGfile.unchoped.sorted.og -g \ -# 1> {output.gfa} \ -# 2> >(tee {log.cmd_view} >&2) - -# ## Removing .og files for space savings -# rm $(dirname {input})/*.og - -# ## Adding metadata -# python scripts/getTags.py \ -# --appdir {params.app_path} --config-file config.yaml \ -# > "$(dirname {input})/metadata.txt" -# sed -i "/^H/r $(dirname {input})/metadata.txt" {output.gfa} - -# # Compressing -# # apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ -# # -@ {threads} -k {output.gfa} -# """ - -# rule generate_graph_list: -# # Generate a text file containing all created graphs -# input: -# expand('data/chrGraphs/'+config['name']+'.{chromosome}.gfa', chromosome=CHRLIST) -# output: -# "data/chrGraphs/graphsList.txt" -# threads: 1 -# run: -# with open(output[0], "w") as handle: -# for file in input: -# handle.write(file+"\n") - -# rule graph_squeeze: -# # Using odgi to merge every subgraphs into a final one -# input: -# glist="data/chrGraphs/graphsList.txt", -# graphs=expand('data/chrGraphs/'+config['name']+'.{chromosome}.gfa', chromosome=CHRLIST) -# output: -# "output/pan1c."+config['name']+".gfa" -# threads: 16 -# params: -# app_path=config['app.path'] -# shell: -# """ -# apptainer run --app odgi {params.app_path}/PanGeTools.sif \ -# squeeze -t {threads} -O -P -f {input.glist} -o {output}.og -# apptainer run --app odgi {params.app_path}/PanGeTools.sif \ -# view -t {threads} -P -i {output}.og -g > {output} -# rm {output}.og -# """ - -# rule graph_stats: -# # Using GFAstats to produce stats on every chromosome graphs -# input: -# graph='data/chrGraphs/'+config['name']+'.{chromosome}.gfa' -# output: -# genstats="output/stats/chrGraphs/"+config['name']+".{chromosome}.general.stats.tsv", -# pathstats="output/stats/chrGraphs/"+config['name']+".{chromosome}.path.stats.tsv" -# threads: 4 -# params: -# app_path=config['app.path'], -# pan_name=config['name'] -# shell: -# """ -# apptainer run --app gfastats {params.app_path}/PanGeTools.sif \ -# -g {input.graph} -P \ -# -o $(dirname {output.genstats})/{params.pan_name}.{wildcards.chromosome} \ -# -t {threads} -# """ - -# rule graph_figs: -# # Creating figures using odgi viz -# input: -# graph='data/chrGraphs/'+config['name']+'.{chromosome}.gfa' -# output: -# oneDviz="output/chrGraphs.figs/"+config['name']+".{chromosome}.1Dviz.png", -# pcov="output/chrGraphs.figs/"+config['name']+".{chromosome}.pcov.png" -# threads: 4 -# params: -# app_path=config['app.path'], -# oneDviz=config['odgi.1Dviz.params'], -# pcov=config['odgi.pcov.params'] -# shell: -# """ -# ## 1D viz -# apptainer run --app odgi {params.app_path}/PanGeTools.sif \ -# viz -i {input.graph} -o {output.oneDviz} {params.oneDviz} -t {threads} -P - -# ## Pcov viz -# apptainer run --app odgi {params.app_path}/PanGeTools.sif \ -# viz -i {input.graph} -o {output.pcov} {params.pcov} -t {threads} -P -# """ - -# rule aggregate_graphs_stats: -# # Reading and merging all stats files from chromosome graphs into a .tsv. -# input: -# genstats=expand("output/stats/chrGraphs/"+config['name']+".{chromosome}.general.stats.tsv", chromosome=CHRLIST) -# output: -# genstats="output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv", -# pathstats="output/stats/pan1c."+config['name']+".chrGraph.path.stats.tsv" -# params: -# app_path=config['app.path'], -# pan_name=config['name'] -# shell: -# """ -# apptainer run {params.app_path}/pan1c-env.sif python scripts/chrStatsAggregation.py \ -# --input $(dirname {input[0]}) --outputGeneral {output.genstats} \ -# --outputPaths {output.pathstats} --panname {params.pan_name} -# """ - -# rule final_graph_tagging: -# # Add metadata to the final GFA -# input: -# graph="output/pan1c."+config['name']+".gfa", -# output: -# "output/pan1c."+config['name']+".gfa.metadata" -# threads: 1 -# params: -# app_path=config['app.path'], -# pan_name=config['name'] -# shell: -# """ -# python scripts/getTags.py \ -# --appdir {params.app_path} --config-file config.yaml > {output} -# sed -i '/^H/r {output}' {input.graph} -# """ - -# rule pggb_input_stats: -# # Produces statistics on pggb input sequences -# input: -# flag="output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv" -# output: -# "output/stats/pan1c."+config['name']+".chrInput.stats.tsv" -# params: -# app_path=config['app.path'], -# pan_name=config['name'] -# shell: -# """ -# apptainer run {params.app_path}/pan1c-env.sif python scripts/chrInputStats.py \ -# -f data/chrInputs/*.fa.gz -o {output} -p {params.pan_name} -# """ - -# rule core_statistics: -# # Aggregate chrInput, chrGraph and pggb statistics into a single tsv -# input: -# chrInputStats="output/stats/pan1c."+config['name']+".chrInput.stats.tsv", -# chrGraphStats="output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv" -# output: -# tsv="output/stats/pan1c."+config['name']+".core.stats.tsv", -# dir=directory("output/pggb.usage.figs") -# params: -# app_path=config['app.path'], -# pan_name=config['name'] -# shell: -# """ -# mkdir -p {output.dir} -# apptainer run {params.app_path}/pan1c-env.sif python scripts/coreStats.py \ -# --pggbStats logs/pggb --chrInputStats {input.chrInputStats} \ -# --chrGraphStats {input.chrGraphStats} -o {output.tsv} -f {output.dir} -p {params.pan_name} -# """ - -# """ -# Post-processing section : -# The graph for each chromosome are made as well as some basic statistics. -# In this section, more stats are produced but more specifics ones requiring dedicated tools (Panacus, PAVs for Panache ...). -# It also contains rules to use the graph itself. -# """ -# rule get_pav: -# # Create PAV matrix readable by panache for a given chromosome scale graph -# input: -# "data/chrGraphs/graphsList.txt" -# output: -# directory("output/pav.matrices") -# threads: 16 -# params: -# app_path=config['app.path'] -# run: -# shell("mkdir {output}") -# # Getting the list of graphs -# with open(input[0]) as handle: -# graphList = [graph.rstrip("\n") for graph in handle.readlines()] -# # Iterating over graphs -# for graph in graphList: -# shell("bash scripts/getPanachePAV.sh -g {graph} -d data/chrGraphs/$(basename {graph} .gfa) -o {output}/$(basename {graph} .gfa).pav.matrix.tsv -a {params.app_path} -t {threads}") - -# rule panacus_stats: -# # Produces panacus reports for a chromosome graph -# input: -# graph='data/chrGraphs/'+config['name']+'.{chromosome}.gfa' -# output: -# html='output/panacus.reports/'+config['name']+'.{chromosome}.histgrowth.html' -# params: -# app_path=config['app.path'], -# pan_name=config['name'], -# refname=config['reference'] -# threads: 8 -# shell: -# """ -# bash scripts/getPanacusHG.sh \ -# -g {input.graph} \ -# -r $(basename {params.refname} .fa.gz) \ -# -d data/chrGraphs/{wildcards.chromosome} \ -# -o {output.html} \ -# -a {params.app_path} \ -# -t {threads} -# """ - -# rule create_pan1c_report_fig: -# # Produces a markdown report figure of chromosomes graphs -# input: -# graph='data/chrGraphs/'+config['name']+'.{chromosome}.gfa', -# contigfig="output/chr.contig/{chromosome}.contig.png", -# output: -# odgifig=temp("tmp/{chromosome}.odgi.png"), -# namefig=temp("tmp/{chromosome}.name.png"), -# reportfig="output/report/"+config['name']+".{chromosome}.report.fig.png" -# threads: 4 -# params: -# app_path=config['app.path'] -# shell: -# """ -# ## Odgi 1D viz -# apptainer run --app odgi {params.app_path}/PanGeTools.sif \ -# viz -i {input.graph} -o {output.odgifig} -x 2500 -a 80 -b -H -t {threads} -P - -# ## Getting legend from contig figure -# convert {input.contigfig} -crop 790x+0+0 +repage {output.namefig} - -# odgheight=$(identify -ping -format '%h' {output.odgifig}) -# ctgheight=$(identify -ping -format '%h' {input.contigfig}) - -# combinedheight=$(($odgheight+$ctgheight)) - -# echo -e "[Debug::Report fig]\t$odgheight\t$ctgheight\t$combinedheight" - -# ## Creating empty canvas ad adding other figures to it -# convert -size "3300x$combinedheight" xc:white {output.reportfig} -# composite -geometry +0+0 {input.contigfig} {output.reportfig} {output.reportfig} -# composite -geometry "+0+$ctgheight" {output.namefig} {output.reportfig} {output.reportfig} -# composite -geometry "+790+$ctgheight" {output.odgifig} {output.reportfig} {output.reportfig} -# """ - -# rule create_pan1c_report: -# # Produces a markdown report of chromosomes graphs -# input: -# figs=expand("output/report/"+config['name']+".{chromosome}.report.fig.png", chromosome=CHRLIST), -# genstats="output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv", -# pathstats="output/stats/pan1c."+config['name']+".chrGraph.path.stats.tsv" -# output: -# reportfig="output/pan1c."+config['name']+".report.md" -# threads: 4 -# params: -# app_path=config['app.path'] -# run: -# shell("touch {output.reportfig}") - -# # Adding Summary -# ## To Do - -# # Adding General stats -# shell("echo '# General stats' >> {output.reportfig}") -# shell("cat {input.genstats} | csv2md -d $'\\t' >> {output.reportfig}") -# shell("echo '' >> {output.reportfig}") - -# # Adding Path stats -# shell("echo '# Path stats' >> {output.reportfig}") -# shell("cat {input.pathstats} | csv2md -d $'\\t' >> {output.reportfig}") -# shell("echo '' >> {output.reportfig}") - -# # Adding chromosomes figures -# fig_list = [fig for fig in input.figs] -# print(fig_list) -# fig_list.sort() + ## Unchoping the nodes (merges unitigs into single nodes) + /usr/bin/time -v -o {log.time_unchop} \ + apptainer run --app odgi {params.app_path}/PanGeTools.sif \ + unchop -P -t {threads} -i $OGfile.og -o $OGfile.unchoped.og 2>&1 | \ + tee {log.cmd_unchop} + + ## Sorting the nodes + /usr/bin/time -v -o {log.time_sort} \ + apptainer run --app odgi {params.app_path}/PanGeTools.sif \ + sort -P -p Ygs --temp-dir $(dirname $OGfile) -t {threads} \ + -i $OGfile.unchoped.og -o $OGfile.unchoped.sorted.og 2>&1 | \ + tee {log.cmd_sort} + + ## Exporting to gfa + /usr/bin/time -v -o {log.time_view} \ + apptainer run --app odgi {params.app_path}/PanGeTools.sif \ + view -i $OGfile.unchoped.sorted.og -g \ + 1> {output.gfa} \ + 2> >(tee {log.cmd_view} >&2) + + ## Removing .og files for space savings + rm $(dirname {input})/*.og + + ## Adding metadata + python scripts/getTags.py \ + --appdir {params.app_path} --config-file config.yaml \ + > "$(dirname {input})/metadata.txt" + sed -i "/^H/r $(dirname {input})/metadata.txt" {output.gfa} + + # Compressing + # apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ + # -@ {threads} -k {output.gfa} + """ + +rule generate_graph_list: + # Generate a text file containing all created graphs + input: + expand('data/chrGraphs/'+config['name']+'.{chromosome}.gfa', chromosome=CHRLIST) + output: + "data/chrGraphs/graphsList.txt" + threads: 1 + run: + with open(output[0], "w") as handle: + for file in input: + handle.write(file+"\n") + +rule graph_squeeze: + # Using odgi to merge every subgraphs into a final one + input: + glist="data/chrGraphs/graphsList.txt", + graphs=expand('data/chrGraphs/'+config['name']+'.{chromosome}.gfa', chromosome=CHRLIST) + output: + "output/pan1c."+config['name']+".gfa" + threads: 16 + params: + app_path=config['app.path'] + shell: + """ + apptainer run --app odgi {params.app_path}/PanGeTools.sif \ + squeeze -t {threads} -O -P -f {input.glist} -o {output}.og + apptainer run --app odgi {params.app_path}/PanGeTools.sif \ + view -t {threads} -P -i {output}.og -g > {output} + rm {output}.og + """ + +rule graph_stats: + # Using GFAstats to produce stats on every chromosome graphs + input: + graph='data/chrGraphs/'+config['name']+'.{chromosome}.gfa' + output: + genstats="output/stats/chrGraphs/"+config['name']+".{chromosome}.general.stats.tsv", + pathstats="output/stats/chrGraphs/"+config['name']+".{chromosome}.path.stats.tsv" + threads: 4 + params: + app_path=config['app.path'], + pan_name=config['name'] + shell: + """ + apptainer run --app gfastats {params.app_path}/PanGeTools.sif \ + -g {input.graph} -P \ + -o $(dirname {output.genstats})/{params.pan_name}.{wildcards.chromosome} \ + -t {threads} + """ + +rule graph_figs: + # Creating figures using odgi viz + input: + graph='data/chrGraphs/'+config['name']+'.{chromosome}.gfa' + output: + oneDviz="output/chrGraphs.figs/"+config['name']+".{chromosome}.1Dviz.png", + pcov="output/chrGraphs.figs/"+config['name']+".{chromosome}.pcov.png" + threads: 4 + params: + app_path=config['app.path'], + oneDviz=config['odgi.1Dviz.params'], + pcov=config['odgi.pcov.params'] + shell: + """ + ## 1D viz + apptainer run --app odgi {params.app_path}/PanGeTools.sif \ + viz -i {input.graph} -o {output.oneDviz} {params.oneDviz} -t {threads} -P + + ## Pcov viz + apptainer run --app odgi {params.app_path}/PanGeTools.sif \ + viz -i {input.graph} -o {output.pcov} {params.pcov} -t {threads} -P + """ + +rule aggregate_graphs_stats: + # Reading and merging all stats files from chromosome graphs into a .tsv. + input: + genstats=expand("output/stats/chrGraphs/"+config['name']+".{chromosome}.general.stats.tsv", chromosome=CHRLIST) + output: + genstats="output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv", + pathstats="output/stats/pan1c."+config['name']+".chrGraph.path.stats.tsv" + params: + app_path=config['app.path'], + pan_name=config['name'] + shell: + """ + apptainer run {params.app_path}/pan1c-env.sif python scripts/chrStatsAggregation.py \ + --input $(dirname {input[0]}) --outputGeneral {output.genstats} \ + --outputPaths {output.pathstats} --panname {params.pan_name} + """ + +rule final_graph_tagging: + # Add metadata to the final GFA + input: + graph="output/pan1c."+config['name']+".gfa", + output: + "output/pan1c."+config['name']+".gfa.metadata" + threads: 1 + params: + app_path=config['app.path'], + pan_name=config['name'] + shell: + """ + python scripts/getTags.py \ + --appdir {params.app_path} --config-file config.yaml > {output} + sed -i '/^H/r {output}' {input.graph} + """ + +rule pggb_input_stats: + # Produces statistics on pggb input sequences + input: + flag="output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv" + output: + "output/stats/pan1c."+config['name']+".chrInput.stats.tsv" + params: + app_path=config['app.path'], + pan_name=config['name'] + shell: + """ + apptainer run {params.app_path}/pan1c-env.sif python scripts/chrInputStats.py \ + -f data/chrInputs/*.fa.gz -o {output} -p {params.pan_name} + """ + +rule core_statistics: + # Aggregate chrInput, chrGraph and pggb statistics into a single tsv + input: + chrInputStats="output/stats/pan1c."+config['name']+".chrInput.stats.tsv", + chrGraphStats="output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv" + output: + tsv="output/stats/pan1c."+config['name']+".core.stats.tsv", + dir=directory("output/pggb.usage.figs") + params: + app_path=config['app.path'], + pan_name=config['name'] + shell: + """ + mkdir -p {output.dir} + apptainer run {params.app_path}/pan1c-env.sif python scripts/coreStats.py \ + --pggbStats logs/pggb --chrInputStats {input.chrInputStats} \ + --chrGraphStats {input.chrGraphStats} -o {output.tsv} -f {output.dir} -p {params.pan_name} + """ + +""" + +Post-processing section : + The graph for each chromosome are made as well as some basic statistics. + In this section, more stats are produced but more specifics ones requiring dedicated tools (Panacus, PAVs for Panache ...). + It also contains rules to use the graph itself. + +""" +rule get_pav: + # Create PAV matrix readable by panache for a given chromosome scale graph + input: + "data/chrGraphs/graphsList.txt" + output: + directory("output/pav.matrices") + threads: 16 + params: + app_path=config['app.path'] + run: + shell("mkdir {output}") + # Getting the list of graphs + with open(input[0]) as handle: + graphList = [graph.rstrip("\n") for graph in handle.readlines()] + # Iterating over graphs + for graph in graphList: + shell("bash scripts/getPanachePAV.sh -g {graph} -d data/chrGraphs/$(basename {graph} .gfa) -o {output}/$(basename {graph} .gfa).pav.matrix.tsv -a {params.app_path} -t {threads}") + +rule panacus_stats: + # Produces panacus reports for a chromosome graph + input: + graph='data/chrGraphs/'+config['name']+'.{chromosome}.gfa' + output: + html='output/panacus.reports/'+config['name']+'.{chromosome}.histgrowth.html' + params: + app_path=config['app.path'], + pan_name=config['name'], + refname=config['reference'] + threads: 8 + shell: + """ + bash scripts/getPanacusHG.sh \ + -g {input.graph} \ + -r $(basename {params.refname} .fa.gz) \ + -d data/chrGraphs/{wildcards.chromosome} \ + -o {output.html} \ + -a {params.app_path} \ + -t {threads} + """ + +rule create_pan1c_report_fig: + # Produces a markdown report figure of chromosomes graphs + input: + graph='data/chrGraphs/'+config['name']+'.{chromosome}.gfa', + contigfig="output/chr.contig/{chromosome}.contig.png", + output: + odgifig=temp("tmp/{chromosome}.odgi.png"), + namefig=temp("tmp/{chromosome}.name.png"), + reportfig="output/report/"+config['name']+".{chromosome}.report.fig.png" + threads: 4 + params: + app_path=config['app.path'] + shell: + """ + ## Odgi 1D viz + apptainer run --app odgi {params.app_path}/PanGeTools.sif \ + viz -i {input.graph} -o {output.odgifig} -x 2500 -a 80 -b -H -t {threads} -P + + ## Getting legend from contig figure + convert {input.contigfig} -crop 790x+0+0 +repage {output.namefig} + + odgheight=$(identify -ping -format '%h' {output.odgifig}) + ctgheight=$(identify -ping -format '%h' {input.contigfig}) + + combinedheight=$(($odgheight+$ctgheight)) + + echo -e "[Debug::Report fig]\t$odgheight\t$ctgheight\t$combinedheight" + + ## Creating empty canvas ad adding other figures to it + convert -size "3300x$combinedheight" xc:white {output.reportfig} + composite -geometry +0+0 {input.contigfig} {output.reportfig} {output.reportfig} + composite -geometry "+0+$ctgheight" {output.namefig} {output.reportfig} {output.reportfig} + composite -geometry "+790+$ctgheight" {output.odgifig} {output.reportfig} {output.reportfig} + """ + +rule create_pan1c_report: + # Produces a markdown report of chromosomes graphs + input: + figs=expand("output/report/"+config['name']+".{chromosome}.report.fig.png", chromosome=CHRLIST), + genstats="output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv", + pathstats="output/stats/pan1c."+config['name']+".chrGraph.path.stats.tsv" + output: + reportfig="output/pan1c."+config['name']+".report.md" + threads: 4 + params: + app_path=config['app.path'] + run: + shell("touch {output.reportfig}") + + # Adding Summary + ## To Do + + # Adding General stats + shell("echo '# General stats' >> {output.reportfig}") + shell("cat {input.genstats} | csv2md -d $'\\t' >> {output.reportfig}") + shell("echo '' >> {output.reportfig}") + + # Adding Path stats + shell("echo '# Path stats' >> {output.reportfig}") + shell("cat {input.pathstats} | csv2md -d $'\\t' >> {output.reportfig}") + shell("echo '' >> {output.reportfig}") + + # Adding chromosomes figures + fig_list = [fig for fig in input.figs] + print(fig_list) + fig_list.sort() -# shell("echo '# Chromosome-scale odgi graphs' >> {output.reportfig}") -# for fig in fig_list: -# basename=os.path.basename(fig) -# chr_name=basename.split('.')[1] + shell("echo '# Chromosome-scale odgi graphs' >> {output.reportfig}") + for fig in fig_list: + basename=os.path.basename(fig) + chr_name=basename.split('.')[1] -# shell("echo '## {chr_name}' >> {output.reportfig}") -# shell("echo '' >> {output.reportfig}") -# shell("echo '' >> {output.reportfig}") \ No newline at end of file + shell("echo '## {chr_name}' >> {output.reportfig}") + shell("echo '' >> {output.reportfig}") + shell("echo '' >> {output.reportfig}") \ No newline at end of file diff --git a/rules/core.smk b/rules/core.smk deleted file mode 100644 index 5473943..0000000 --- a/rules/core.smk +++ /dev/null @@ -1,346 +0,0 @@ -""" -Core section : Running PGGB on each chromosome and generating whole pangenome ------------------- -""" - -rule create_pggb_input_syri_fig: - input: - fasta='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz' - output: - fig="output/chrInput.syri.figs/"+config['name']+".{chromosome}.asm.syri.png", - wrkdir=directory('data/chrInput/syri/{chromosome}') - threads: 8 - params: - app_path=config["app.path"], - ref=config['reference'] - shell: - """ - mkdir {output.wrkdir} - refname=$(basename {params.ref} .fa.gz | cut -d'.' -f1) - - ## Creating single fasta from multifasta - zcat {input.fasta} | awk -F"#" \ - '/^>/ {OUT="{output.wrkdir}" substr($0,2) ".fa"}; {print >> OUT; close(OUT)}' - - ## Getting the list of sequences - asmList=() - for file in {output.wrkdir}/*.fa; do - asm="$(basename $file .fa | cut -f1 -d"#").fa" - mv $file "$(dirname $file)/$asm" - asmList+=("$asm") - done - - bash scripts/getSyriFigs.sh \ - -a {params.app_path} \ - -t {threads} \ - -d {output.wrkdir} \ - -o $(basename {output.fig}) \ - -r {output.wrkdir}/"${refname}.fa" \ - -q "${asmList[@]}" - mv {output.wrkdir}/$(basename {output.fig}) {output.fig} - rm {output.wrkdir}/*.fa - """ - -rule wfmash_on_chr: - # Run wfmash on a specific chromosome input - input: - fa='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz', - fai='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz.fai' - output: - mapping=temp("data/chrGraphs/{chromosome}/{chromosome}.wfmash.mapping.paf"), - aln=temp("data/chrGraphs/{chromosome}/{chromosome}.wfmash.aln.paf") - threads: 16 - params: - app_path=config['app.path'], - segment_length=config['wfmash.segment_length'], - mapping_id=config['wfmash.mapping_id'], - wfmash_sec=config['wfmash.secondary'] - log: - cmd_map="logs/pggb/{chromosome}.wfmash_mapping.cmd.log", - time_map="logs/pggb/{chromosome}.wfmash_mapping.time.log", - cmd_aln="logs/pggb/{chromosome}.wfmash_aln.cmd.log", - time_aln="logs/pggb/{chromosome}.wfmash_aln.time.log", - shell: - """ - ## Mapping - /usr/bin/time -v -o {log.time_map} \ - apptainer exec {params.app_path}/pggb.sif wfmash \ - -s {params.segment_length} -l $(( {params.segment_length} * 5 )) -p {params.mapping_id} \ - -n $(cat {input.fai} | wc -l) {params.wfmash_sec} -t {threads} \ - --tmp-base $(dirname {output.aln}) \ - {input.fa} --approx-map \ - 1> {output.mapping} \ - 2> >(tee {log.cmd_map} >&2) - - ## Aligning - /usr/bin/time -v -o {log.time_aln} \ - apptainer exec {params.app_path}/pggb.sif wfmash \ - -s {params.segment_length} -l $(( {params.segment_length} * 5 )) -p {params.mapping_id} \ - -n $(cat {input.fai} | wc -l) {params.wfmash_sec} -t {threads} \ - --tmp-base $(dirname {output.aln}) \ - {input.fa} -i {output.mapping} \ - --invert-filtering \ - 1> {output.aln} \ - 2> >(tee {log.cmd_aln} >&2) - - # Compressing - apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ - -@ {threads} -k {output.mapping} - - apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ - -@ {threads} -k {output.aln} - """ - -rule seqwish: - # Run seqwish on alignement produced by wfmash - input: - fa='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz', - aln=rules.wfmash_on_chr.output.aln - output: - temp("data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfa") - threads: 8 - params: - app_path=config['app.path'], - seqwish=config['seqwish.params'] - log: - cmd="logs/pggb/{chromosome}.seqwish.cmd.log", - time="logs/pggb/{chromosome}.seqwish.time.log" - shell: - """ - /usr/bin/time -v -o {log.time} \ - apptainer exec {params.app_path}/pggb.sif seqwish \ - -s {input.fa} -p {input.aln} -g {output} \ - {params.seqwish} -t {threads} \ - --temp-dir $(dirname {output}) -P 2>&1 | \ - tee {log.cmd} - - # Compressing - apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ - -@ {threads} -k {output} - """ - -rule gfaffix_on_chr: - # Run gfaffix on seqwish graph - input: - rules.seqwish.output - output: - gfa=temp("data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfaffixD.gfa"), - transform="data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfaffixD.transform.txt" - threads: 1 - params: - app_path=config['app.path'] - log: - time="logs/pggb/{chromosome}.gfaffix.time.log" - shell: - """ - /usr/bin/time -v -o {log.time} \ - apptainer exec {params.app_path}/pggb.sif gfaffix \ - {input} -o {output.gfa} -t {output.transform} \ - > /dev/null - - # Compressing - apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ - -@ {threads} -k {output.gfa} - """ - -rule odgi_postprocessing: - # Running pggb's postprocessing (mainly odgi) steps with gfaffix graph - input: - rules.gfaffix_on_chr.output.gfa - output: - gfa='data/chrGraphs/'+config['name']+'.{chromosome}.gfa' - threads: 8 - params: - app_path=config['app.path'] - log: - cmd_build="logs/pggb/{chromosome}.odgi_build.cmd.log", - time_build="logs/pggb/{chromosome}.odgi_build.time.log", - cmd_unchop="logs/pggb/{chromosome}.odgi_unchop.cmd.log", - time_unchop="logs/pggb/{chromosome}.odgi_unchop.time.log", - cmd_sort="logs/pggb/{chromosome}.odgi_sort.cmd.log", - time_sort="logs/pggb/{chromosome}.odgi_sort.time.log", - cmd_view="logs/pggb/{chromosome}.odgi_view.cmd.log", - time_view="logs/pggb/{chromosome}.odgi_view.time.log" - shell: - """ - OGfile="$(dirname {input})/$(basename {input} .gfa)" - - ## Converting to odgi format and optimizing namespace - /usr/bin/time -v -o {log.time_build} \ - apptainer run --app odgi {params.app_path}/PanGeTools.sif \ - build -t {threads} -P -g {input} -o $OGfile.og -O 2>&1 | \ - tee {log.cmd_build} - - ## Unchoping the nodes (merges unitigs into single nodes) - /usr/bin/time -v -o {log.time_unchop} \ - apptainer run --app odgi {params.app_path}/PanGeTools.sif \ - unchop -P -t {threads} -i $OGfile.og -o $OGfile.unchoped.og 2>&1 | \ - tee {log.cmd_unchop} - - ## Sorting the nodes - /usr/bin/time -v -o {log.time_sort} \ - apptainer run --app odgi {params.app_path}/PanGeTools.sif \ - sort -P -p Ygs --temp-dir $(dirname $OGfile) -t {threads} \ - -i $OGfile.unchoped.og -o $OGfile.unchoped.sorted.og 2>&1 | \ - tee {log.cmd_sort} - - ## Exporting to gfa - /usr/bin/time -v -o {log.time_view} \ - apptainer run --app odgi {params.app_path}/PanGeTools.sif \ - view -i $OGfile.unchoped.sorted.og -g \ - 1> {output.gfa} \ - 2> >(tee {log.cmd_view} >&2) - - ## Removing .og files for space savings - rm $(dirname {input})/*.og - - ## Adding metadata - python scripts/getTags.py \ - --appdir {params.app_path} --config-file config.yaml \ - > "$(dirname {input})/metadata.txt" - sed -i "/^H/r $(dirname {input})/metadata.txt" {output.gfa} - - # Compressing - # apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ - # -@ {threads} -k {output.gfa} - """ - -rule generate_graph_list: - # Generate a text file containing all created graphs - input: - expand('data/chrGraphs/'+config['name']+'.{chromosome}.gfa', chromosome=CHRLIST) - output: - "data/chrGraphs/graphsList.txt" - threads: 1 - run: - with open(output[0], "w") as handle: - for file in input: - handle.write(file+"\n") - -rule graph_squeeze: - # Using odgi to merge every subgraphs into a final one - input: - glist="data/chrGraphs/graphsList.txt", - graphs=expand('data/chrGraphs/'+config['name']+'.{chromosome}.gfa', chromosome=CHRLIST) - output: - "output/pan1c."+config['name']+".gfa" - threads: 16 - params: - app_path=config['app.path'] - shell: - """ - apptainer run --app odgi {params.app_path}/PanGeTools.sif \ - squeeze -t {threads} -O -P -f {input.glist} -o {output}.og - apptainer run --app odgi {params.app_path}/PanGeTools.sif \ - view -t {threads} -P -i {output}.og -g > {output} - rm {output}.og - """ - -rule graph_stats: - # Using GFAstats to produce stats on every chromosome graphs - input: - graph='data/chrGraphs/'+config['name']+'.{chromosome}.gfa' - output: - genstats="output/stats/chrGraphs/"+config['name']+".{chromosome}.general.stats.tsv", - pathstats="output/stats/chrGraphs/"+config['name']+".{chromosome}.path.stats.tsv" - threads: 4 - params: - app_path=config['app.path'], - pan_name=config['name'] - shell: - """ - apptainer run --app gfastats {params.app_path}/PanGeTools.sif \ - -g {input.graph} -P \ - -o $(dirname {output.genstats})/{params.pan_name}.{wildcards.chromosome} \ - -t {threads} - """ - -rule graph_figs: - # Creating figures using odgi viz - input: - graph='data/chrGraphs/'+config['name']+'.{chromosome}.gfa' - output: - oneDviz="output/chrGraphs.figs/"+config['name']+".{chromosome}.1Dviz.png", - pcov="output/chrGraphs.figs/"+config['name']+".{chromosome}.pcov.png" - threads: 4 - params: - app_path=config['app.path'], - oneDviz=config['odgi.1Dviz.params'], - pcov=config['odgi.pcov.params'] - shell: - """ - ## 1D viz - apptainer run --app odgi {params.app_path}/PanGeTools.sif \ - viz -i {input.graph} -o {output.oneDviz} {params.oneDviz} -t {threads} -P - - ## Pcov viz - apptainer run --app odgi {params.app_path}/PanGeTools.sif \ - viz -i {input.graph} -o {output.pcov} {params.pcov} -t {threads} -P - """ - -rule aggregate_graphs_stats: - # Reading and merging all stats files from chromosome graphs into a .tsv. - input: - genstats=expand("output/stats/chrGraphs/"+config['name']+".{chromosome}.general.stats.tsv", chromosome=CHRLIST) - output: - genstats="output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv", - pathstats="output/stats/pan1c."+config['name']+".chrGraph.path.stats.tsv" - params: - app_path=config['app.path'], - pan_name=config['name'] - shell: - """ - apptainer run {params.app_path}/pan1c-env.sif python scripts/chrStatsAggregation.py \ - --input $(dirname {input[0]}) --outputGeneral {output.genstats} \ - --outputPaths {output.pathstats} --panname {params.pan_name} - """ - -rule final_graph_tagging: - # Add metadata to the final GFA - input: - graph="output/pan1c."+config['name']+".gfa", - output: - "output/pan1c."+config['name']+".gfa.metadata" - threads: 1 - params: - app_path=config['app.path'], - pan_name=config['name'] - shell: - """ - python scripts/getTags.py \ - --appdir {params.app_path} --config-file config.yaml > {output} - sed -i '/^H/r {output}' {input.graph} - """ - -rule pggb_input_stats: - # Produces statistics on pggb input sequences - input: - flag="output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv" - output: - "output/stats/pan1c."+config['name']+".chrInput.stats.tsv" - params: - app_path=config['app.path'], - pan_name=config['name'] - shell: - """ - apptainer run {params.app_path}/pan1c-env.sif python scripts/chrInputStats.py \ - -f data/chrInputs/*.fa.gz -o {output} -p {params.pan_name} - """ - -rule core_statistics: - # Aggregate chrInput, chrGraph and pggb statistics into a single tsv - input: - chrInputStats="output/stats/pan1c."+config['name']+".chrInput.stats.tsv", - chrGraphStats="output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv" - output: - tsv="output/stats/pan1c."+config['name']+".core.stats.tsv", - dir=directory("output/pggb.usage.figs") - params: - app_path=config['app.path'], - pan_name=config['name'] - shell: - """ - mkdir -p {output.dir} - apptainer run {params.app_path}/pan1c-env.sif python scripts/coreStats.py \ - --pggbStats logs/pggb --chrInputStats {input.chrInputStats} \ - --chrGraphStats {input.chrGraphStats} -o {output.tsv} -f {output.dir} -p {params.pan_name} - """ \ No newline at end of file diff --git a/rules/post-analysis.smk b/rules/post-analysis.smk deleted file mode 100644 index 3041501..0000000 --- a/rules/post-analysis.smk +++ /dev/null @@ -1,118 +0,0 @@ -""" -Post-analysis section --------------------------------------------------------------------------- -""" -rule get_pav: - # Create PAV matrix readable by panache for a given chromosome scale graph - input: - "data/chrGraphs/graphsList.txt" - output: - directory("output/pav.matrices") - threads: 16 - params: - app_path=config['app.path'] - run: - shell("mkdir {output}") - # Getting the list of graphs - with open(input[0]) as handle: - graphList = [graph.rstrip("\n") for graph in handle.readlines()] - # Iterating over graphs - for graph in graphList: - shell("bash scripts/getPanachePAV.sh -g {graph} -d data/chrGraphs/$(basename {graph} .gfa) -o {output}/$(basename {graph} .gfa).pav.matrix.tsv -a {params.app_path} -t {threads}") - -rule panacus_stats: - # Produces panacus reports for a chromosome graph - input: - graph='data/chrGraphs/'+config['name']+'.{chromosome}.gfa' - output: - html='output/panacus.reports/'+config['name']+'.{chromosome}.histgrowth.html' - params: - app_path=config['app.path'], - pan_name=config['name'], - refname=config['reference'] - threads: 8 - shell: - """ - bash scripts/getPanacusHG.sh \ - -g {input.graph} \ - -r $(basename {params.refname} .fa.gz) \ - -d data/chrGraphs/{wildcards.chromosome} \ - -o {output.html} \ - -a {params.app_path} \ - -t {threads} - """ - -rule create_pan1c_report_fig: - # Produces a markdown report figure of chromosomes graphs - input: - graph='data/chrGraphs/'+config['name']+'.{chromosome}.gfa', - contigfig="output/chr.contig/{chromosome}.contig.png", - output: - odgifig=temp("tmp/{chromosome}.odgi.png"), - namefig=temp("tmp/{chromosome}.name.png"), - reportfig="output/report/"+config['name']+".{chromosome}.report.fig.png" - threads: 4 - params: - app_path=config['app.path'] - shell: - """ - ## Odgi 1D viz - apptainer run --app odgi {params.app_path}/PanGeTools.sif \ - viz -i {input.graph} -o {output.odgifig} -x 2500 -a 80 -b -H -t {threads} -P - - ## Getting legend from contig figure - convert {input.contigfig} -crop 790x+0+0 +repage {output.namefig} - - odgheight=$(identify -ping -format '%h' {output.odgifig}) - ctgheight=$(identify -ping -format '%h' {input.contigfig}) - - combinedheight=$(($odgheight+$ctgheight)) - - echo -e "[Debug::Report fig]\t$odgheight\t$ctgheight\t$combinedheight" - - ## Creating empty canvas ad adding other figures to it - convert -size "3300x$combinedheight" xc:white {output.reportfig} - composite -geometry +0+0 {input.contigfig} {output.reportfig} {output.reportfig} - composite -geometry "+0+$ctgheight" {output.namefig} {output.reportfig} {output.reportfig} - composite -geometry "+790+$ctgheight" {output.odgifig} {output.reportfig} {output.reportfig} - """ - -rule create_pan1c_report: - # Produces a markdown report of chromosomes graphs - input: - figs=expand("output/report/"+config['name']+".{chromosome}.report.fig.png", chromosome=CHRLIST), - genstats="output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv", - pathstats="output/stats/pan1c."+config['name']+".chrGraph.path.stats.tsv" - output: - reportfig="output/pan1c."+config['name']+".report.md" - threads: 4 - params: - app_path=config['app.path'] - run: - shell("touch {output.reportfig}") - - # Adding Summary - ## To Do - - # Adding General stats - shell("echo '# General stats' >> {output.reportfig}") - shell("cat {input.genstats} | csv2md -d $'\\t' >> {output.reportfig}") - shell("echo '' >> {output.reportfig}") - - # Adding Path stats - shell("echo '# Path stats' >> {output.reportfig}") - shell("cat {input.pathstats} | csv2md -d $'\\t' >> {output.reportfig}") - shell("echo '' >> {output.reportfig}") - - # Adding chromosomes figures - fig_list = [fig for fig in input.figs] - print(fig_list) - fig_list.sort() - - shell("echo '# Chromosome-scale odgi graphs' >> {output.reportfig}") - for fig in fig_list: - basename=os.path.basename(fig) - chr_name=basename.split('.')[1] - - shell("echo '## {chr_name}' >> {output.reportfig}") - shell("echo '' >> {output.reportfig}") - shell("echo '' >> {output.reportfig}") \ No newline at end of file diff --git a/rules/pre-processing.smk b/rules/pre-processing.smk deleted file mode 100644 index 3580108..0000000 --- a/rules/pre-processing.smk +++ /dev/null @@ -1,185 +0,0 @@ -""" -Pre-processing section : preparing haplotypes ---------------------------------------------------- -""" -rule ragtag_scaffolding: - # Scaffold input haplotype against the reference to infer chromosome scale sequences - input: - ref="data/haplotypes/"+config['reference'], - reffai="data/haplotypes/"+config['reference']+".fai", - refgzi="data/haplotypes/"+config['reference']+".gzi", - fa="data/haplotypes/{haplotype}.fa.gz" - output: - fa="data/hap.ragtagged/{haplotype}.ragtagged.fa.gz", - tar="data/hap.ragtagged/{haplotype}.tar.gz" - threads: 8 - retries: 1 - params: - app_path=config["app.path"] - shell: - """ - bash scripts/ragtagChromInfer.sh \ - -d $(dirname {output.fa})/{wildcards.haplotype} \ - -a {params.app_path} \ - -t {threads} \ - -r {input.ref} \ - -q {input.fa} \ - -o {output.fa} - - #if [[ -z $(zgrep '[^[:space:]]' {output.fa}) ]] ; then - # echo "Error : Empty final fasta" - # exit 1 - #fi - """ - -rule quast_stats: - # Run Quast on ragtagged genomes - input: - fas=expand("data/haplotypes/{haplotype}.fa.gz", haplotype=SAMPLES_NOREF), - ref="data/haplotypes/"+config['reference'] - output: - report="output/quast/"+config['name']+".quast.report.html" - threads: 8 - params: - app_path=config["app.path"], - pan_name=config["name"] - shell: - """ - apptainer run {params.app_path}/pan1c-env.sif quast.py \ - -t {threads} \ - -o "$(dirname {output.report})" \ - -r {input.ref} \ - --plots-format png \ - --no-read-stats \ - --no-icarus \ - -s \ - --large \ - {input.fas} - - # Compressing temporary files - apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ - -@ {threads} $(dirname {output.report})/contigs_reports/*.fa - apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ - -@ {threads} $(dirname {output.report})/contigs_reports/minimap_output/* - - mv $(dirname {output.report})/report.html {output.report} - """ - -rule contig_position: - # Produce figures with contig positions - input: - fa="data/chrInputs/"+config["name"]+".{chromosome}.fa.gz", - fai="data/chrInputs/"+config["name"]+".{chromosome}.fa.gz.fai" - output: - fig="output/chr.contig/{chromosome}.contig.png", - outdir=directory("output/chr.contig/{chromosome}") - threads: 1 - params: - app_path=config["app.path"] - shell: - """ - mkdir {output.outdir} - base="{output.outdir}" - hapID=$(echo {wildcards.chromosome} | sed 's/.hap/#/') - - # Creating .bands file - apptainer run {params.app_path}/pan1c-env.sif python scripts/fasta_not_NNN_gff3.py \ - -s 5 \ - -f {input.fa} \ - > $base/{wildcards.chromosome}.gff - - #cat $base/{wildcards.chromosome}.gff - - awk '{{a++; if ((a/2)==int(a/2)){{b="gpos25"}}else{{b="gpos75"}}; print $1"\\t"$4"\\t"$5"\\tcontig_"a"\\t"b}}' $base/{wildcards.chromosome}.gff | \ - sed '1ichr\\tstart\\tend\\tname\\tgieStain' \ - > $base/{wildcards.chromosome}.bands - - #cat $base/{wildcards.chromosome}.bands - - # Creating bed like file - cat {input.fai} | \ - cut -f1,2 | tr '\\t' ',' | \ - sed "s/,/,1,/" | \ - sed "1i chr\\tstart\\tend" | \ - tr ',' '\\t' \ - > $base/{wildcards.chromosome}.tsv - - #cat $base/{wildcards.chromosome}.tsv - - # Creating figure - apptainer run --app Renv {params.app_path}/pan1c-env.sif Rscript scripts/contig_position.R \ - -b $base/{wildcards.chromosome}.bands \ - -t $base/{wildcards.chromosome}.tsv \ - -o {output.fig} - """ - -rule clustering: - # Read ragtagged fastas and split chromosome sequences into according bins - input: - expand('data/hap.ragtagged/{haplotype}.ragtagged.fa.gz', haplotype=SAMPLES) - output: - expand('data/chrInputs/'+config['name']+'.{chromosome}.fa.gz', chromosome=CHRLIST) - threads: workflow.cores - params: - app_path=config["app.path"], - pan_name=config["name"] - shell: - """ - mkdir -p $(dirname {output[0]}) - apptainer run {params.app_path}/pan1c-env.sif python scripts/inputClustering.py \ - --fasta {input} --output $(dirname {output[0]}) --panname {params.pan_name} - for file in $(dirname {output[0]})/*.fa; do - apptainer run --app bgzip {params.app_path}/PanGeTools.sif -@ {threads} $file - done - """ - -rule create_allAsm_syri_fig: - # Create a SyRI figure containing all input assemblies - input: - ref="data/hap.ragtagged/"+config['reference'][:-5]+"ragtagged.fa.gz", - qry=expand('data/hap.ragtagged/{haplotype}.ragtagged.fa.gz', haplotype=SAMPLES) - output: - fig="output/asm.syri.figs/pan1c."+config['name']+".allAsm.syri.png", - wrkdir=directory('data/asm.syri/all') - threads: 8 - params: - app_path=config["app.path"] - shell: - """ - mkdir {output.wrkdir} - - bash scripts/getSyriFigs.sh \ - -a {params.app_path} \ - -t {threads} \ - -d {output.wrkdir} \ - -o $(basename {output.fig}) \ - -r {input.ref} \ - -q "{input.qry}" - mv {output.wrkdir}/$(basename {output.fig}) {output.fig} - """ - -rule create_sglAsm_syri_fig: - # Create a SyRI figure for a single input assembly - input: - ref="data/hap.ragtagged/"+config['reference'][:-5]+"ragtagged.fa.gz", - qry="data/hap.ragtagged/{haplotype}.ragtagged.fa.gz" - output: - fig="output/asm.syri.figs/pan1c."+config['name']+".{haplotype}.syri.png", - wrkdir=directory('data/asm.syri/{haplotype}') - threads: 4 - retries: 1 - resources: - mem_gb=48 - params: - app_path=config["app.path"] - shell: - """ - mkdir -p {output.wrkdir} - bash scripts/getSyriFigs.sh \ - -a {params.app_path} \ - -t {threads} \ - -d {output.wrkdir} \ - -o $(basename {output.fig}) \ - -r {input.ref} \ - -q "{input.qry}" - mv {output.wrkdir}/$(basename {output.fig}) {output.fig} - """ \ No newline at end of file -- GitLab From bedb03d62a3269ca46ff09bddbddfda40bd40bf3 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Wed, 26 Jun 2024 17:05:11 +0200 Subject: [PATCH 38/86] Contact & default wfmash parameters --- README.md | 3 ++- config.yaml | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index e2f9df2..04529fb 100644 --- a/README.md +++ b/README.md @@ -114,5 +114,6 @@ Pan1c-06AT-v3 This DAG shows the worflow for a pangenome of `Saccharomyces cerevisiae` using the `R64` reference.  - +# Contact +[alexis.mergez@inrae.fr](mailto:alexis.mergez@inrae.fr) diff --git a/config.yaml b/config.yaml index 3c09fb6..37caa44 100644 --- a/config.yaml +++ b/config.yaml @@ -8,8 +8,8 @@ app.path: '<path>' # Core parameters # Wfmash alignement parameters : -wfmash.segment_length: 5000 -wfmash.mapping_id: 90 +wfmash.segment_length: 10000 +wfmash.mapping_id: 95 wfmash.secondary: '-k 19 -H 0.001 -X' # Seqwish parameters -- GitLab From 74593b33bbd6cea30642986d906b2983bbb82b5c Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Wed, 26 Jun 2024 17:10:17 +0200 Subject: [PATCH 39/86] Changed Odgi viz default parameters --- config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/config.yaml b/config.yaml index 37caa44..5fe91c1 100644 --- a/config.yaml +++ b/config.yaml @@ -16,8 +16,8 @@ wfmash.secondary: '-k 19 -H 0.001 -X' seqwish.params: '-B 10000000 -k 19 -f 0' # Odgi 1D and path coverage viz parameters -odgi.1Dviz.params: '-x 2000 -a 150 -b' -odgi.pcov.params: '-x 2000 -a 150 -O' +odgi.1Dviz.params: '-x 2000 -b' +odgi.pcov.params: '-x 2000 -O' ## Optional parts of the workflow # Running Quast to get statistics on input haplotypes -- GitLab From cf4ca5e948545e498ac3a4f27f10b114489dffcf Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Wed, 26 Jun 2024 17:10:32 +0200 Subject: [PATCH 40/86] Added hiddenfile to keep data folder --- data/.gitkeep | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 data/.gitkeep diff --git a/data/.gitkeep b/data/.gitkeep new file mode 100644 index 0000000..e69de29 -- GitLab From f1ca7b8a10f8b7551db9d7244f375802c0c726e4 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Wed, 26 Jun 2024 17:25:13 +0200 Subject: [PATCH 41/86] Moving bgzipping to dedicated rule --- Snakefile | 4 +++- rules/tools.smk | 17 ++++++++++++++++- scripts/ragtagChromInfer.sh | 6 +++--- 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/Snakefile b/Snakefile index e3818f2..fc342e4 100644 --- a/Snakefile +++ b/Snakefile @@ -73,6 +73,7 @@ def which_analysis(): """ Rules ------------------------------------------------------------------------------------------- """ + # Main target rule rule all: input: @@ -83,6 +84,7 @@ rule all: """ Pre-processing section : preparing pggb inputs --------------------------------------------------- """ + rule ragtag_scaffolding: # Scaffold input haplotype against the reference to infer chromosome scale sequences input: @@ -91,7 +93,7 @@ rule ragtag_scaffolding: refgzi="data/haplotypes/"+config['reference']+".gzi", fa="data/haplotypes/{haplotype}.fa.gz" output: - fa="data/hap.ragtagged/{haplotype}.ragtagged.fa.gz", + fa=temp("data/hap.ragtagged/{haplotype}.ragtagged.fa"), tar="data/hap.ragtagged/{haplotype}.tar.gz" threads: 8 retries: 1 diff --git a/rules/tools.smk b/rules/tools.smk index 7528154..5aab858 100644 --- a/rules/tools.smk +++ b/rules/tools.smk @@ -13,4 +13,19 @@ rule samtools_index: app_path=config["app.path"] shell: "apptainer run --app samtools {params.app_path}/PanGeTools.sif " - "faidx {input.fa}" \ No newline at end of file + "faidx {input.fa}" + +rule run_bgzip: + # Run BGZIP on the file + input: + "{file}" + output: + "{file}.gz" + threads: 4 + params: + app_path=config["app.path"] + shell: + """ + apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ + -@ {threads} + """ \ No newline at end of file diff --git a/scripts/ragtagChromInfer.sh b/scripts/ragtagChromInfer.sh index 9afa538..b8dc8ea 100755 --- a/scripts/ragtagChromInfer.sh +++ b/scripts/ragtagChromInfer.sh @@ -41,11 +41,11 @@ grep ">${ref}#chr*" -A1 $tmpdir/ragtag.scaffold.fasta | \ sed "s/${ref}#chr\([^_]*\)_RagTag/${hapID}#chr\1/g" > $tmpdir/${sample}.ragtagged.fa # Compressing fasta -apptainer run --app bgzip $appdir/PanGeTools.sif \ - -@ $threads $tmpdir/${sample}.ragtagged.fa +# apptainer run --app bgzip $appdir/PanGeTools.sif \ +# -@ $threads $tmpdir/${sample}.ragtagged.fa # Moving fa.gz to output dir -mv $tmpdir/${sample}.ragtagged.fa.gz $output +mv $tmpdir/${sample}.ragtagged.fa $output # Compressing temporary files tar --remove-files -cf $tmpdir.tar $tmpdir -- GitLab From 194a1d59c03af6f5211561118da7c91eb985b1ac Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Thu, 27 Jun 2024 10:36:18 +0200 Subject: [PATCH 42/86] Update Snakefile Offloading chromosome FASTA compression from clustering rule --- Snakefile | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Snakefile b/Snakefile index fc342e4..028e807 100644 --- a/Snakefile +++ b/Snakefile @@ -201,8 +201,8 @@ rule clustering: input: expand('data/hap.ragtagged/{haplotype}.ragtagged.fa.gz', haplotype=SAMPLES) output: - expand('data/chrInputs/'+config['name']+'.{chromosome}.fa.gz', chromosome=CHRLIST) - threads: workflow.cores + temp(expand('data/chrInputs/'+config['name']+'.{chromosome}.fa', chromosome=CHRLIST)) + threads: 2 params: app_path=config["app.path"], pan_name=config["name"] @@ -211,9 +211,9 @@ rule clustering: mkdir -p $(dirname {output[0]}) apptainer run {params.app_path}/pan1c-env.sif python scripts/inputClustering.py \ --fasta {input} --output $(dirname {output[0]}) --panname {params.pan_name} - for file in $(dirname {output[0]})/*.fa; do - apptainer run --app bgzip {params.app_path}/PanGeTools.sif -@ {threads} $file - done + #for file in $(dirname {output[0]})/*.fa; do + # apptainer run --app bgzip {params.app_path}/PanGeTools.sif -@ {threads} $file + #done """ rule create_allAsm_syri_fig: -- GitLab From 50f491896dfd9066cefdb1230891d1d2c9e9f7a2 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Thu, 27 Jun 2024 11:13:21 +0200 Subject: [PATCH 43/86] Update Snakefile - Removed SyrI rule to get an All assembly figure ( will be replaced with a run of SyRI on chromosomes only instead of all assemblies) - Renamed some rules for clarity - Adding more elements to the final report --- Snakefile | 105 +++++++++++++++++++++++++++++------------------------- 1 file changed, 56 insertions(+), 49 deletions(-) diff --git a/Snakefile b/Snakefile index 028e807..b8f2cdb 100644 --- a/Snakefile +++ b/Snakefile @@ -196,8 +196,8 @@ rule contig_position: -o {output.fig} """ -rule clustering: - # Read ragtagged fastas and split chromosome sequences into according bins +rule chromosome_clustering: + # Read ragtagged fastas and split chromosome sequences into according FASTA files input: expand('data/hap.ragtagged/{haplotype}.ragtagged.fa.gz', haplotype=SAMPLES) output: @@ -216,33 +216,9 @@ rule clustering: #done """ -rule create_allAsm_syri_fig: - # Create a SyRI figure containing all input assemblies - input: - ref="data/hap.ragtagged/"+config['reference'][:-5]+"ragtagged.fa.gz", - qry=expand('data/hap.ragtagged/{haplotype}.ragtagged.fa.gz', haplotype=SAMPLES) - output: - fig="output/asm.syri.figs/pan1c."+config['name']+".allAsm.syri.png", - wrkdir=directory('data/asm.syri/all') - threads: 8 - params: - app_path=config["app.path"] - shell: - """ - mkdir {output.wrkdir} - - bash scripts/getSyriFigs.sh \ - -a {params.app_path} \ - -t {threads} \ - -d {output.wrkdir} \ - -o $(basename {output.fig}) \ - -r {input.ref} \ - -q "{input.qry}" - mv {output.wrkdir}/$(basename {output.fig}) {output.fig} - """ - -rule create_sglAsm_syri_fig: - # Create a SyRI figure for a single input assembly +rule SyRI_on_ASM: + # Run SyRI on a single assembly. + # The assembly is mapped on the 'reference' and SyRI search for SV. input: ref="data/hap.ragtagged/"+config['reference'][:-5]+"ragtagged.fa.gz", qry="data/hap.ragtagged/{haplotype}.ragtagged.fa.gz" @@ -269,12 +245,11 @@ rule create_sglAsm_syri_fig: """ """ - Core section : Running PGGB - """ -rule create_pggb_input_syri_fig: +rule SyRI_on_chrInput: + # WIP input: fasta='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz' output: @@ -700,43 +675,75 @@ rule create_pan1c_report_fig: composite -geometry "+790+$ctgheight" {output.odgifig} {output.reportfig} {output.reportfig} """ +def get_report_section(): + """ + Return 'create_pan1c_report' optional inputs to add them to the final report. + For example : + - SyRI figures on Assemblies + """ + sections = dict() + + sections["metadata"] = "output/pan1c."+config['name']+".gfa.metadata" + + if config["get_ASMs_SyRI"] == "True": + sections["SyRI_on_ASMs_figs"] = expand( + "output/asm.syri.figs/pan1c."+config['name']+".{haplotype}.syri.png", + haplotype=SAMPLES_NOREF + ) + + return sections + + rule create_pan1c_report: # Produces a markdown report of chromosomes graphs input: - figs=expand("output/report/"+config['name']+".{chromosome}.report.fig.png", chromosome=CHRLIST), + odgifigs=expand("output/report/"+config['name']+".{chromosome}.report.fig.png", chromosome=CHRLIST), genstats="output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv", - pathstats="output/stats/pan1c."+config['name']+".chrGraph.path.stats.tsv" + pathstats="output/stats/pan1c."+config['name']+".chrGraph.path.stats.tsv", + unpack(get_report_section) output: - reportfig="output/pan1c."+config['name']+".report.md" + report="output/pan1c."+config['name']+".report.md" threads: 4 params: app_path=config['app.path'] run: - shell("touch {output.reportfig}") + shell("touch {output.report}") + + # Adding Summary (made for importing in Joplin) + shell("echo '# Summary' >> {output.report}") + shell("echo '[TOC]' >> {output.report}") + shell("echo '' >> {output.report}") + + # Adding graph construction info + ## WIP - # Adding Summary - ## To Do + # Adding metadata + shell("echo '# Graph metadata' >> {output.report}") + shell("cat {input.metadata} >> {output.report}") + shell("echo '' >> {output.report}") # Adding General stats - shell("echo '# General stats' >> {output.reportfig}") - shell("cat {input.genstats} | csv2md -d $'\\t' >> {output.reportfig}") - shell("echo '' >> {output.reportfig}") + shell("echo '# General stats' >> {output.report}") + shell("cat {input.genstats} | csv2md -d $'\\t' >> {output.report}") + shell("echo '' >> {output.report}") # Adding Path stats - shell("echo '# Path stats' >> {output.reportfig}") - shell("cat {input.pathstats} | csv2md -d $'\\t' >> {output.reportfig}") - shell("echo '' >> {output.reportfig}") + shell("echo '# Path stats' >> {output.report}") + shell("cat {input.pathstats} | csv2md -d $'\\t' >> {output.report}") + shell("echo '' >> {output.report}") # Adding chromosomes figures - fig_list = [fig for fig in input.figs] + fig_list = [fig for fig in input.odgifigs] print(fig_list) fig_list.sort() - shell("echo '# Chromosome-scale odgi graphs' >> {output.reportfig}") + shell("echo '# Chromosome-scale odgi graphs' >> {output.report}") for fig in fig_list: basename=os.path.basename(fig) chr_name=basename.split('.')[1] - shell("echo '## {chr_name}' >> {output.reportfig}") - shell("echo '' >> {output.reportfig}") - shell("echo '' >> {output.reportfig}") \ No newline at end of file + shell("echo '## {chr_name}' >> {output.report}") + shell("echo '' >> {output.report}") + shell("echo '' >> {output.report}") + + # Adding SyRI figs if produced \ No newline at end of file -- GitLab From ef1345e67bd8da52e07b0b48ee0e5f46f09b80de Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Thu, 27 Jun 2024 11:21:23 +0200 Subject: [PATCH 44/86] Update Snakefile Added addition of SyRI figs in the final report if wanted --- Snakefile | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/Snakefile b/Snakefile index b8f2cdb..fc795e0 100644 --- a/Snakefile +++ b/Snakefile @@ -705,7 +705,8 @@ rule create_pan1c_report: report="output/pan1c."+config['name']+".report.md" threads: 4 params: - app_path=config['app.path'] + app_path=config['app.path'], + add_SyRI=config['get_ASMs_SyRI'] run: shell("touch {output.report}") @@ -746,4 +747,18 @@ rule create_pan1c_report: shell("echo '' >> {output.report}") shell("echo '' >> {output.report}") - # Adding SyRI figs if produced \ No newline at end of file + # Adding SyRI figs if produced + if params.add_SyRI: + + fig_list = [fig for fig in input.SyRI_on_ASMs_figs] + fig_list.sort() + + shell("echo '# SyRI on input assemblies' >> {output.report}") + for fig in fig_list: + basename = os.path.basename(fig) + hap_name = basename.split('.')[2:4] + + shell("echo '## {hap_name[0]}, {hap_name[1]}' >> {output.report}") + shell("echo '' >> {output.report}") + + shell("echo '' >> {output.report}") \ No newline at end of file -- GitLab From 040bfb9b579aaec0ff9093b4be44fa3d29e6ecc3 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Thu, 27 Jun 2024 11:23:45 +0200 Subject: [PATCH 45/86] Update Snakefile --- Snakefile | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Snakefile b/Snakefile index fc795e0..e14cd2d 100644 --- a/Snakefile +++ b/Snakefile @@ -675,7 +675,7 @@ rule create_pan1c_report_fig: composite -geometry "+790+$ctgheight" {output.odgifig} {output.reportfig} {output.reportfig} """ -def get_report_section(): +def get_report_sections(): """ Return 'create_pan1c_report' optional inputs to add them to the final report. For example : @@ -684,6 +684,9 @@ def get_report_section(): sections = dict() sections["metadata"] = "output/pan1c."+config['name']+".gfa.metadata" + sections["odgifigs"] = expand("output/report/"+config['name']+".{chromosome}.report.fig.png", chromosome=CHRLIST) + sections["genstats"] = "output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv" + sections["pathstats"] = "output/stats/pan1c."+config['name']+".chrGraph.path.stats.tsv" if config["get_ASMs_SyRI"] == "True": sections["SyRI_on_ASMs_figs"] = expand( @@ -697,10 +700,7 @@ def get_report_section(): rule create_pan1c_report: # Produces a markdown report of chromosomes graphs input: - odgifigs=expand("output/report/"+config['name']+".{chromosome}.report.fig.png", chromosome=CHRLIST), - genstats="output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv", - pathstats="output/stats/pan1c."+config['name']+".chrGraph.path.stats.tsv", - unpack(get_report_section) + unpack(get_report_sections) output: report="output/pan1c."+config['name']+".report.md" threads: 4 -- GitLab From b87480f4c9c5fa6bc8e63ae2fe05fdda5207b25b Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Thu, 27 Jun 2024 11:24:25 +0200 Subject: [PATCH 46/86] Update Snakefile --- Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index e14cd2d..5372649 100644 --- a/Snakefile +++ b/Snakefile @@ -675,7 +675,7 @@ rule create_pan1c_report_fig: composite -geometry "+790+$ctgheight" {output.odgifig} {output.reportfig} {output.reportfig} """ -def get_report_sections(): +def get_report_sections(wildcards): """ Return 'create_pan1c_report' optional inputs to add them to the final report. For example : -- GitLab From 6266a20a58ead08340ffc2cf5e81af09ee12be68 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Thu, 27 Jun 2024 16:37:14 +0200 Subject: [PATCH 47/86] Update Snakefile Added threads to stats jobs --- Snakefile | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/Snakefile b/Snakefile index 5372649..1610dd6 100644 --- a/Snakefile +++ b/Snakefile @@ -202,7 +202,7 @@ rule chromosome_clustering: expand('data/hap.ragtagged/{haplotype}.ragtagged.fa.gz', haplotype=SAMPLES) output: temp(expand('data/chrInputs/'+config['name']+'.{chromosome}.fa', chromosome=CHRLIST)) - threads: 2 + threads: 1 params: app_path=config["app.path"], pan_name=config["name"] @@ -531,6 +531,7 @@ rule aggregate_graphs_stats: output: genstats="output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv", pathstats="output/stats/pan1c."+config['name']+".chrGraph.path.stats.tsv" + threads: 1 params: app_path=config['app.path'], pan_name=config['name'] @@ -564,6 +565,7 @@ rule pggb_input_stats: flag="output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv" output: "output/stats/pan1c."+config['name']+".chrInput.stats.tsv" + threads: 1 params: app_path=config['app.path'], pan_name=config['name'] @@ -576,11 +578,12 @@ rule pggb_input_stats: rule core_statistics: # Aggregate chrInput, chrGraph and pggb statistics into a single tsv input: - chrInputStats="output/stats/pan1c."+config['name']+".chrInput.stats.tsv", - chrGraphStats="output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv" + chrInputStats = "output/stats/pan1c."+config['name']+".chrInput.stats.tsv", + chrGraphStats = "output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv" output: - tsv="output/stats/pan1c."+config['name']+".core.stats.tsv", - dir=directory("output/pggb.usage.figs") + tsv = "output/stats/pan1c."+config['name']+".core.stats.tsv", + dir = directory("output/pggb.usage.figs") + threads: 1 params: app_path=config['app.path'], pan_name=config['name'] @@ -593,12 +596,7 @@ rule core_statistics: """ """ - -Post-processing section : - The graph for each chromosome are made as well as some basic statistics. - In this section, more stats are produced but more specifics ones requiring dedicated tools (Panacus, PAVs for Panache ...). - It also contains rules to use the graph itself. - +Post-processing section """ rule get_pav: # Create PAV matrix readable by panache for a given chromosome scale graph @@ -694,8 +692,7 @@ def get_report_sections(wildcards): haplotype=SAMPLES_NOREF ) - return sections - + return sections rule create_pan1c_report: # Produces a markdown report of chromosomes graphs -- GitLab From fb1938c8f1b2a79b4f100f5bcba6d4ead6d3b46a Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Wed, 3 Jul 2024 09:26:26 +0200 Subject: [PATCH 48/86] Update Snakefile Added logs to more steps : - Panacus - Squeeze - SyRI - Quast - RagTag --- Snakefile | 51 ++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 38 insertions(+), 13 deletions(-) diff --git a/Snakefile b/Snakefile index 1610dd6..7e5773a 100644 --- a/Snakefile +++ b/Snakefile @@ -99,15 +99,20 @@ rule ragtag_scaffolding: retries: 1 params: app_path=config["app.path"] + log: + cmd="logs/ragtag/{haplotype}.ragtag.cmd.log", + time="logs/ragtag/{haplotype}.ragtag.time.log" shell: """ - bash scripts/ragtagChromInfer.sh \ + /usr/bin/time -v -o {log.time} \ + bash scripts/ragtagChromInfer.sh \ -d $(dirname {output.fa})/{wildcards.haplotype} \ -a {params.app_path} \ -t {threads} \ -r {input.ref} \ -q {input.fa} \ - -o {output.fa} + -o {output.fa} 2>&1 | \ + tee {log.cmd} #if [[ -z $(zgrep '[^[:space:]]' {output.fa}) ]] ; then # echo "Error : Empty final fasta" @@ -126,9 +131,13 @@ rule quast_stats: params: app_path=config["app.path"], pan_name=config["name"] + log: + cmd="logs/quast/{haplotype}.quast.cmd.log", + time="logs/quast/{haplotype}.quast.time.log" shell: """ - apptainer run {params.app_path}/pan1c-env.sif quast.py \ + /usr/bin/time -v -o {log.time} \ + apptainer run {params.app_path}/pan1c-env.sif quast.py \ -t {threads} \ -o "$(dirname {output.report})" \ -r {input.ref} \ @@ -137,7 +146,8 @@ rule quast_stats: --no-icarus \ -s \ --large \ - {input.fas} + {input.fas} 2>&1 | \ + tee {log.cmd} # Compressing temporary files apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ @@ -211,9 +221,6 @@ rule chromosome_clustering: mkdir -p $(dirname {output[0]}) apptainer run {params.app_path}/pan1c-env.sif python scripts/inputClustering.py \ --fasta {input} --output $(dirname {output[0]}) --panname {params.pan_name} - #for file in $(dirname {output[0]})/*.fa; do - # apptainer run --app bgzip {params.app_path}/PanGeTools.sif -@ {threads} $file - #done """ rule SyRI_on_ASM: @@ -225,6 +232,9 @@ rule SyRI_on_ASM: output: fig="output/asm.syri.figs/pan1c."+config['name']+".{haplotype}.syri.png", wrkdir=directory('data/asm.syri/{haplotype}') + log: + cmd="logs/SyRI_ASM/{haplotype}.SyRI_ASM.cmd.log", + time="logs/SyRI_ASM/{haplotype}.SyRI_ASM.time.log" threads: 4 retries: 1 resources: @@ -234,13 +244,16 @@ rule SyRI_on_ASM: shell: """ mkdir -p {output.wrkdir} - bash scripts/getSyriFigs.sh \ + /usr/bin/time -v -o {log.time} \ + bash scripts/getSyriFigs.sh \ -a {params.app_path} \ -t {threads} \ -d {output.wrkdir} \ -o $(basename {output.fig}) \ -r {input.ref} \ - -q "{input.qry}" + -q "{input.qry}" 2>&1 | \ + tee {log.cmd} + mv {output.wrkdir}/$(basename {output.fig}) {output.fig} """ @@ -470,15 +483,22 @@ rule graph_squeeze: graphs=expand('data/chrGraphs/'+config['name']+'.{chromosome}.gfa', chromosome=CHRLIST) output: "output/pan1c."+config['name']+".gfa" + log: + cmd="logs/squeeze/"+config['name']+".squeeze.cmd.log", + time="logs/squeeze/"+config['name']+".squeeze.time.log", threads: 16 params: app_path=config['app.path'] shell: """ - apptainer run --app odgi {params.app_path}/PanGeTools.sif \ - squeeze -t {threads} -O -P -f {input.glist} -o {output}.og + /usr/bin/time -v -o {log.time} \ + apptainer run --app odgi {params.app_path}/PanGeTools.sif \ + squeeze -t {threads} -O -P -f {input.glist} -o {output}.og 2>&1 | \ + tee {log.cmd} + apptainer run --app odgi {params.app_path}/PanGeTools.sif \ view -t {threads} -P -i {output}.og -g > {output} + rm {output}.og """ @@ -622,6 +642,9 @@ rule panacus_stats: graph='data/chrGraphs/'+config['name']+'.{chromosome}.gfa' output: html='output/panacus.reports/'+config['name']+'.{chromosome}.histgrowth.html' + log: + cmd="logs/panacus/{chromosome}.panacus.cmd.log", + time="logs/panacus/{chromosome}.panacus.time.log" params: app_path=config['app.path'], pan_name=config['name'], @@ -629,13 +652,15 @@ rule panacus_stats: threads: 8 shell: """ - bash scripts/getPanacusHG.sh \ + /usr/bin/time -v -o {log.time} \ + bash scripts/getPanacusHG.sh \ -g {input.graph} \ -r $(basename {params.refname} .fa.gz) \ -d data/chrGraphs/{wildcards.chromosome} \ -o {output.html} \ -a {params.app_path} \ - -t {threads} + -t {threads} 2>&1 | \ + tee {log.cmd} """ rule create_pan1c_report_fig: -- GitLab From c6c377917d516431653b6d489b00826a51f04630 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Wed, 3 Jul 2024 09:28:29 +0200 Subject: [PATCH 49/86] Update Snakefile --- Snakefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Snakefile b/Snakefile index 7e5773a..5839de7 100644 --- a/Snakefile +++ b/Snakefile @@ -132,8 +132,8 @@ rule quast_stats: app_path=config["app.path"], pan_name=config["name"] log: - cmd="logs/quast/{haplotype}.quast.cmd.log", - time="logs/quast/{haplotype}.quast.time.log" + cmd="logs/quast/quast.cmd.log", + time="logs/quast/quast.time.log" shell: """ /usr/bin/time -v -o {log.time} \ -- GitLab From 61f7281cf117f239d2af017a223011a68e599285 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Thu, 4 Jul 2024 09:10:09 +0200 Subject: [PATCH 50/86] Update tools.smk --- rules/tools.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rules/tools.smk b/rules/tools.smk index 5aab858..53fe1ff 100644 --- a/rules/tools.smk +++ b/rules/tools.smk @@ -27,5 +27,5 @@ rule run_bgzip: shell: """ apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ - -@ {threads} + -@ {threads} {input} """ \ No newline at end of file -- GitLab From 2ca1fc4e14a6e75a4f671ffe5b1cd4fed4a3f518 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Thu, 4 Jul 2024 15:30:37 +0200 Subject: [PATCH 51/86] Update Snakefile --- Snakefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Snakefile b/Snakefile index 5839de7..1abfc61 100644 --- a/Snakefile +++ b/Snakefile @@ -742,7 +742,9 @@ rule create_pan1c_report: # Adding metadata shell("echo '# Graph metadata' >> {output.report}") + shell("echo '```' >> {output.report}") shell("cat {input.metadata} >> {output.report}") + shell("echo '```' >> {output.report}") shell("echo '' >> {output.report}") # Adding General stats -- GitLab From a7593a90e16c94afdf7baf21526441fde10f100e Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Fri, 5 Jul 2024 11:14:29 +0200 Subject: [PATCH 52/86] Update Snakefile --- Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index 1abfc61..ac6421f 100644 --- a/Snakefile +++ b/Snakefile @@ -279,7 +279,7 @@ rule SyRI_on_chrInput: ## Creating single fasta from multifasta zcat {input.fasta} | awk -F"#" \ - '/^>/ {OUT="{output.wrkdir}" substr($0,2) ".fa"}; {print >> OUT; close(OUT)}' + '/^>/ {{OUT="{output.wrkdir}" substr($0,2) ".fa"}}; {{print >> OUT; close(OUT)}}' ## Getting the list of sequences asmList=() -- GitLab From 33802127ef3fed52872ba2709c92dfdad977b891 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Fri, 5 Jul 2024 11:16:07 +0200 Subject: [PATCH 53/86] Update Snakefile --- Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index ac6421f..793aef4 100644 --- a/Snakefile +++ b/Snakefile @@ -294,7 +294,7 @@ rule SyRI_on_chrInput: -t {threads} \ -d {output.wrkdir} \ -o $(basename {output.fig}) \ - -r {output.wrkdir}/"${refname}.fa" \ + -r {output.wrkdir}/"${{refname}}.fa" \ -q "${asmList[@]}" mv {output.wrkdir}/$(basename {output.fig}) {output.fig} rm {output.wrkdir}/*.fa -- GitLab From 2e8e05af134a7d7e231257267a7f65aaaa929ba1 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Fri, 5 Jul 2024 11:16:51 +0200 Subject: [PATCH 54/86] Update Snakefile --- Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index 793aef4..499b8dd 100644 --- a/Snakefile +++ b/Snakefile @@ -295,7 +295,7 @@ rule SyRI_on_chrInput: -d {output.wrkdir} \ -o $(basename {output.fig}) \ -r {output.wrkdir}/"${{refname}}.fa" \ - -q "${asmList[@]}" + -q "${{asmList[@]}}" mv {output.wrkdir}/$(basename {output.fig}) {output.fig} rm {output.wrkdir}/*.fa """ -- GitLab From 02dcafaeed179468e840d1d6878c67c1decd7704 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Fri, 5 Jul 2024 11:18:48 +0200 Subject: [PATCH 55/86] Update Snakefile --- Snakefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Snakefile b/Snakefile index 499b8dd..2b49ce5 100644 --- a/Snakefile +++ b/Snakefile @@ -280,6 +280,8 @@ rule SyRI_on_chrInput: ## Creating single fasta from multifasta zcat {input.fasta} | awk -F"#" \ '/^>/ {{OUT="{output.wrkdir}" substr($0,2) ".fa"}}; {{print >> OUT; close(OUT)}}' + + grep '>' {output.wrkdir}/*.fa ## Getting the list of sequences asmList=() -- GitLab From 13721c88cdee86ad13bf4d83537897387bd7a059 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Fri, 5 Jul 2024 11:21:06 +0200 Subject: [PATCH 56/86] Update Snakefile --- Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index 2b49ce5..26a0210 100644 --- a/Snakefile +++ b/Snakefile @@ -267,7 +267,7 @@ rule SyRI_on_chrInput: fasta='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz' output: fig="output/chrInput.syri.figs/"+config['name']+".{chromosome}.asm.syri.png", - wrkdir=directory('data/chrInput/syri/{chromosome}') + wrkdir=directory('data/chrInputs/syri/{chromosome}/') threads: 8 params: app_path=config["app.path"], -- GitLab From f3a90a07bb9f4676d882cf79efefb7fbfd2ef28b Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Fri, 5 Jul 2024 11:23:50 +0200 Subject: [PATCH 57/86] Update Snakefile --- Snakefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Snakefile b/Snakefile index 26a0210..9460fbf 100644 --- a/Snakefile +++ b/Snakefile @@ -267,7 +267,7 @@ rule SyRI_on_chrInput: fasta='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz' output: fig="output/chrInput.syri.figs/"+config['name']+".{chromosome}.asm.syri.png", - wrkdir=directory('data/chrInputs/syri/{chromosome}/') + wrkdir=directory('data/chrInputs/syri/{chromosome}') threads: 8 params: app_path=config["app.path"], @@ -279,7 +279,7 @@ rule SyRI_on_chrInput: ## Creating single fasta from multifasta zcat {input.fasta} | awk -F"#" \ - '/^>/ {{OUT="{output.wrkdir}" substr($0,2) ".fa"}}; {{print >> OUT; close(OUT)}}' + '/^>/ {{OUT="{output.wrkdir}/" substr($0,2) ".fa"}}; {{print >> OUT; close(OUT)}}' grep '>' {output.wrkdir}/*.fa -- GitLab From be9cbe44ae911968e43e54fa8ab4755342c73977 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Fri, 5 Jul 2024 11:28:34 +0200 Subject: [PATCH 58/86] Update Snakefile --- Snakefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index 9460fbf..84f47f6 100644 --- a/Snakefile +++ b/Snakefile @@ -268,7 +268,7 @@ rule SyRI_on_chrInput: output: fig="output/chrInput.syri.figs/"+config['name']+".{chromosome}.asm.syri.png", wrkdir=directory('data/chrInputs/syri/{chromosome}') - threads: 8 + threads: workflow.cores params: app_path=config["app.path"], ref=config['reference'] @@ -282,6 +282,8 @@ rule SyRI_on_chrInput: '/^>/ {{OUT="{output.wrkdir}/" substr($0,2) ".fa"}}; {{print >> OUT; close(OUT)}}' grep '>' {output.wrkdir}/*.fa + apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ + -@ {threads} {output.wrkdir}/*.fa ## Getting the list of sequences asmList=() -- GitLab From 7ce85183a84af0db35d88c1d054d74ffd2353241 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Fri, 5 Jul 2024 11:29:46 +0200 Subject: [PATCH 59/86] Update Snakefile --- Snakefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Snakefile b/Snakefile index 84f47f6..0820f13 100644 --- a/Snakefile +++ b/Snakefile @@ -287,8 +287,8 @@ rule SyRI_on_chrInput: ## Getting the list of sequences asmList=() - for file in {output.wrkdir}/*.fa; do - asm="$(basename $file .fa | cut -f1 -d"#").fa" + for file in {output.wrkdir}/*.fa.gz; do + asm="$(basename $file .fa.gz | cut -f1 -d"#").fa.gz" mv $file "$(dirname $file)/$asm" asmList+=("$asm") done -- GitLab From a02ff604b899b089858b0d0db5241fb836405f45 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Fri, 5 Jul 2024 11:30:59 +0200 Subject: [PATCH 60/86] Update Snakefile --- Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index 0820f13..c6dea3b 100644 --- a/Snakefile +++ b/Snakefile @@ -298,7 +298,7 @@ rule SyRI_on_chrInput: -t {threads} \ -d {output.wrkdir} \ -o $(basename {output.fig}) \ - -r {output.wrkdir}/"${{refname}}.fa" \ + -r {output.wrkdir}/"${{refname}}.fa.gz" \ -q "${{asmList[@]}}" mv {output.wrkdir}/$(basename {output.fig}) {output.fig} rm {output.wrkdir}/*.fa -- GitLab From 16a9997590055486ddf2fdae9cf4c93ec6c69748 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Fri, 5 Jul 2024 11:32:25 +0200 Subject: [PATCH 61/86] Update Snakefile --- Snakefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Snakefile b/Snakefile index c6dea3b..eafe179 100644 --- a/Snakefile +++ b/Snakefile @@ -281,10 +281,10 @@ rule SyRI_on_chrInput: zcat {input.fasta} | awk -F"#" \ '/^>/ {{OUT="{output.wrkdir}/" substr($0,2) ".fa"}}; {{print >> OUT; close(OUT)}}' - grep '>' {output.wrkdir}/*.fa apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ -@ {threads} {output.wrkdir}/*.fa - + zgrep '>' {output.wrkdir}/*.fa + ## Getting the list of sequences asmList=() for file in {output.wrkdir}/*.fa.gz; do -- GitLab From 701dd233cc9e506944941ae965bc755d8639aab3 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Fri, 5 Jul 2024 11:33:48 +0200 Subject: [PATCH 62/86] Update Snakefile --- Snakefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index eafe179..e3977c5 100644 --- a/Snakefile +++ b/Snakefile @@ -281,9 +281,10 @@ rule SyRI_on_chrInput: zcat {input.fasta} | awk -F"#" \ '/^>/ {{OUT="{output.wrkdir}/" substr($0,2) ".fa"}}; {{print >> OUT; close(OUT)}}' + ls apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ -@ {threads} {output.wrkdir}/*.fa - zgrep '>' {output.wrkdir}/*.fa + zgrep '>' {output.wrkdir}/*.fa.gz ## Getting the list of sequences asmList=() -- GitLab From 4e32c41d2e119d15819408954b17bdca064e4cdf Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Fri, 5 Jul 2024 11:37:29 +0200 Subject: [PATCH 63/86] Update Snakefile --- Snakefile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Snakefile b/Snakefile index e3977c5..f3d98c9 100644 --- a/Snakefile +++ b/Snakefile @@ -281,7 +281,6 @@ rule SyRI_on_chrInput: zcat {input.fasta} | awk -F"#" \ '/^>/ {{OUT="{output.wrkdir}/" substr($0,2) ".fa"}}; {{print >> OUT; close(OUT)}}' - ls apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ -@ {threads} {output.wrkdir}/*.fa zgrep '>' {output.wrkdir}/*.fa.gz @@ -291,7 +290,7 @@ rule SyRI_on_chrInput: for file in {output.wrkdir}/*.fa.gz; do asm="$(basename $file .fa.gz | cut -f1 -d"#").fa.gz" mv $file "$(dirname $file)/$asm" - asmList+=("$asm") + asmList+=("$(dirname $file)/$asm") done bash scripts/getSyriFigs.sh \ -- GitLab From 4892e9bc9d170ddd8ebfbb159903ec5d5add4eab Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Fri, 5 Jul 2024 11:53:38 +0200 Subject: [PATCH 64/86] Update getSyriFigs.sh --- scripts/getSyriFigs.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/getSyriFigs.sh b/scripts/getSyriFigs.sh index 3baf716..dc4b0f8 100755 --- a/scripts/getSyriFigs.sh +++ b/scripts/getSyriFigs.sh @@ -36,6 +36,8 @@ for item in "${temp[@]}"; do fi done +echo "${asmList[@]}" + # Array to store the created syri files syriFileList=() @@ -45,8 +47,8 @@ for ((i = 0; i < ${#asmList[@]} - 1; i++)); do # Setting filepaths for later bamFile="$(basename ${asmList[i]} .fa.gz)_$(basename ${asmList[i + 1]} .fa.gz).bam" syriFile="$(basename ${asmList[i]} .fa.gz)_$(basename ${asmList[i + 1]} .fa.gz).syri.out" - #hapID=$(basename ${asmList[i + 1]} .ragtagged.fa.gz | sed 's/.hap/#/') - #refID=$(basename ${asmList[i]} .ragtagged.fa.gz | sed 's/.hap/#/') + hapID=$(basename ${asmList[i + 1]} .ragtagged.fa.gz | sed 's/.hap/#/') + refID=$(basename ${asmList[i]} .ragtagged.fa.gz | sed 's/.hap/#/') # Debug echo -e "[Debug::SyRI_script::$hapID] hapID: $hapID\trefID: $refID" -- GitLab From 6a09682cf95f923893caef5a4fad3cc2cd034dc7 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Fri, 5 Jul 2024 13:28:42 +0200 Subject: [PATCH 65/86] Update getSyriFigs.sh --- scripts/getSyriFigs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/getSyriFigs.sh b/scripts/getSyriFigs.sh index dc4b0f8..5e1ed5a 100755 --- a/scripts/getSyriFigs.sh +++ b/scripts/getSyriFigs.sh @@ -36,7 +36,7 @@ for item in "${temp[@]}"; do fi done -echo "${asmList[@]}" +echo -e "${asmList[@]}" # Array to store the created syri files syriFileList=() -- GitLab From a263bd73bbada19908622df4c4cabed1c1adb87f Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Fri, 5 Jul 2024 13:33:19 +0200 Subject: [PATCH 66/86] Update getSyriFigs.sh --- scripts/getSyriFigs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/getSyriFigs.sh b/scripts/getSyriFigs.sh index 5e1ed5a..724c1fd 100755 --- a/scripts/getSyriFigs.sh +++ b/scripts/getSyriFigs.sh @@ -36,7 +36,7 @@ for item in "${temp[@]}"; do fi done -echo -e "${asmList[@]}" +echo "The array : ${asmList[@]}" # Array to store the created syri files syriFileList=() -- GitLab From bbe34e4ed6eb8e7c7c4aaab324ce126b336425d4 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Fri, 5 Jul 2024 13:36:58 +0200 Subject: [PATCH 67/86] Update Snakefile --- Snakefile | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/Snakefile b/Snakefile index f3d98c9..188937d 100644 --- a/Snakefile +++ b/Snakefile @@ -286,12 +286,14 @@ rule SyRI_on_chrInput: zgrep '>' {output.wrkdir}/*.fa.gz ## Getting the list of sequences - asmList=() + AllAsmList=() for file in {output.wrkdir}/*.fa.gz; do asm="$(basename $file .fa.gz | cut -f1 -d"#").fa.gz" mv $file "$(dirname $file)/$asm" - asmList+=("$(dirname $file)/$asm") + AllAsmList+=("$(dirname $file)/$asm") done + + echo "The ASM Array : ${{AllAsmList[@]}}" bash scripts/getSyriFigs.sh \ -a {params.app_path} \ @@ -299,7 +301,7 @@ rule SyRI_on_chrInput: -d {output.wrkdir} \ -o $(basename {output.fig}) \ -r {output.wrkdir}/"${{refname}}.fa.gz" \ - -q "${{asmList[@]}}" + -q "${{AllAsmList[@]}}" mv {output.wrkdir}/$(basename {output.fig}) {output.fig} rm {output.wrkdir}/*.fa """ -- GitLab From 02d0e4b733af1958ab221caab1effab9f4b6f2ae Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Fri, 5 Jul 2024 13:44:59 +0200 Subject: [PATCH 68/86] Update getSyriFigs.sh --- scripts/getSyriFigs.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/getSyriFigs.sh b/scripts/getSyriFigs.sh index 724c1fd..dc70828 100755 --- a/scripts/getSyriFigs.sh +++ b/scripts/getSyriFigs.sh @@ -28,7 +28,10 @@ done # Reading query argument and creating an array containing the path to query fasta files IFS=' ' read -r -a temp <<< "$qry" +echo "tempArr : ${temp[@]}" + asmList=("$ref") + # Sorting the array to put the reference in first for item in "${temp[@]}"; do if [[ $item != "$ref" ]]; then -- GitLab From 0b15696608296670320bf294490400581d54a41c Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Fri, 5 Jul 2024 13:46:52 +0200 Subject: [PATCH 69/86] Update getSyriFigs.sh --- scripts/getSyriFigs.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/getSyriFigs.sh b/scripts/getSyriFigs.sh index dc70828..fe81ce8 100755 --- a/scripts/getSyriFigs.sh +++ b/scripts/getSyriFigs.sh @@ -28,6 +28,7 @@ done # Reading query argument and creating an array containing the path to query fasta files IFS=' ' read -r -a temp <<< "$qry" +echo "Query : $qry" echo "tempArr : ${temp[@]}" asmList=("$ref") -- GitLab From bb6529350be5e348fbdd3a8342c795582c6f6fd2 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Fri, 5 Jul 2024 13:55:22 +0200 Subject: [PATCH 70/86] Update Snakefile --- Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index 188937d..ee9cc90 100644 --- a/Snakefile +++ b/Snakefile @@ -301,7 +301,7 @@ rule SyRI_on_chrInput: -d {output.wrkdir} \ -o $(basename {output.fig}) \ -r {output.wrkdir}/"${{refname}}.fa.gz" \ - -q "${{AllAsmList[@]}}" + -q "${{AllAsmList[*]}}" mv {output.wrkdir}/$(basename {output.fig}) {output.fig} rm {output.wrkdir}/*.fa """ -- GitLab From 119d8e1d880e6fac876738e2cabe8aee8e304b09 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Fri, 5 Jul 2024 14:09:09 +0200 Subject: [PATCH 71/86] Update Snakefile --- Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index ee9cc90..e3491b4 100644 --- a/Snakefile +++ b/Snakefile @@ -279,7 +279,7 @@ rule SyRI_on_chrInput: ## Creating single fasta from multifasta zcat {input.fasta} | awk -F"#" \ - '/^>/ {{OUT="{output.wrkdir}/" substr($0,2) ".fa"}}; {{print >> OUT; close(OUT)}}' + '/^>/ {{OUT="{output.wrkdir}/" substr($0,2) ".hap" substr($0,3) ".fa"}}; {{print >> OUT; close(OUT)}}' apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ -@ {threads} {output.wrkdir}/*.fa -- GitLab From 67e49a0c1e995e74646658f824d173d96be0b133 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Fri, 5 Jul 2024 14:19:04 +0200 Subject: [PATCH 72/86] Update Snakefile --- Snakefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Snakefile b/Snakefile index e3491b4..46ff6a2 100644 --- a/Snakefile +++ b/Snakefile @@ -279,16 +279,16 @@ rule SyRI_on_chrInput: ## Creating single fasta from multifasta zcat {input.fasta} | awk -F"#" \ - '/^>/ {{OUT="{output.wrkdir}/" substr($0,2) ".hap" substr($0,3) ".fa"}}; {{print >> OUT; close(OUT)}}' + '/^>/ {{OUT="{output.wrkdir}/" substr($0,2) ".fa"}}; {{print >> OUT; close(OUT)}}' apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ -@ {threads} {output.wrkdir}/*.fa - zgrep '>' {output.wrkdir}/*.fa.gz + #zgrep '>' {output.wrkdir}/*.fa.gz ## Getting the list of sequences AllAsmList=() for file in {output.wrkdir}/*.fa.gz; do - asm="$(basename $file .fa.gz | cut -f1 -d"#").fa.gz" + asm="$(basename $file .fa.gz | cut -f1,2 -d"#" | sed 's/#/\.hap/').fa.gz" mv $file "$(dirname $file)/$asm" AllAsmList+=("$(dirname $file)/$asm") done -- GitLab From b3cac8129237d833412e9b3e1c88b0e85c9f19cf Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Fri, 5 Jul 2024 14:19:53 +0200 Subject: [PATCH 73/86] Update Snakefile --- Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index 46ff6a2..68c535d 100644 --- a/Snakefile +++ b/Snakefile @@ -288,7 +288,7 @@ rule SyRI_on_chrInput: ## Getting the list of sequences AllAsmList=() for file in {output.wrkdir}/*.fa.gz; do - asm="$(basename $file .fa.gz | cut -f1,2 -d"#" | sed 's/#/\.hap/').fa.gz" + asm="$(basename $file .fa.gz | cut -f1,2 -d"#" | sed 's/#/\\.hap/').fa.gz" mv $file "$(dirname $file)/$asm" AllAsmList+=("$(dirname $file)/$asm") done -- GitLab From 93238a63287d70f5338bbaa97d5c16142d3cb743 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Fri, 5 Jul 2024 14:29:21 +0200 Subject: [PATCH 74/86] Update Snakefile --- Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index 68c535d..c03ec33 100644 --- a/Snakefile +++ b/Snakefile @@ -275,7 +275,7 @@ rule SyRI_on_chrInput: shell: """ mkdir {output.wrkdir} - refname=$(basename {params.ref} .fa.gz | cut -d'.' -f1) + refname=$(basename {params.ref} .fa.gz | cut -d'.' -f1,2) ## Creating single fasta from multifasta zcat {input.fasta} | awk -F"#" \ -- GitLab From 1ac7f9875caf20319afc040f1509c15de53b4439 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Fri, 5 Jul 2024 14:42:54 +0200 Subject: [PATCH 75/86] Update getSyriFigs.sh --- scripts/getSyriFigs.sh | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/scripts/getSyriFigs.sh b/scripts/getSyriFigs.sh index fe81ce8..51ccfa5 100755 --- a/scripts/getSyriFigs.sh +++ b/scripts/getSyriFigs.sh @@ -27,20 +27,21 @@ done ## Main script # Reading query argument and creating an array containing the path to query fasta files IFS=' ' read -r -a temp <<< "$qry" +readarray -td '' temp_sorted < <(printf '%s\0' "${temp[@]}" | sort -z) -echo "Query : $qry" -echo "tempArr : ${temp[@]}" +#echo "Query : $qry" +#echo "tempArr : ${temp[@]}" asmList=("$ref") # Sorting the array to put the reference in first -for item in "${temp[@]}"; do +for item in "${temp_sorted[@]}"; do if [[ $item != "$ref" ]]; then asmList+=($item) fi done -echo "The array : ${asmList[@]}" +#echo "The array : ${asmList[@]}" # Array to store the created syri files syriFileList=() @@ -51,11 +52,11 @@ for ((i = 0; i < ${#asmList[@]} - 1; i++)); do # Setting filepaths for later bamFile="$(basename ${asmList[i]} .fa.gz)_$(basename ${asmList[i + 1]} .fa.gz).bam" syriFile="$(basename ${asmList[i]} .fa.gz)_$(basename ${asmList[i + 1]} .fa.gz).syri.out" - hapID=$(basename ${asmList[i + 1]} .ragtagged.fa.gz | sed 's/.hap/#/') - refID=$(basename ${asmList[i]} .ragtagged.fa.gz | sed 's/.hap/#/') + hapID=$(basename ${asmList[i + 1]}) + refID=$(basename ${asmList[i]}) # Debug - echo -e "[Debug::SyRI_script::$hapID] hapID: $hapID\trefID: $refID" + echo -e "\n[Debug::SyRI_script::$hapID] hapID: $hapID\trefID: $refID\n" # Adding the output syri file to the array syriFileList+=($syriFile) -- GitLab From 13b49a01a058ac488f5e309a3f51deba49b9ae14 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Fri, 5 Jul 2024 18:05:13 +0200 Subject: [PATCH 76/86] Updated SyRI script Added width and height to SyRI script --- Snakefile | 3 ++- scripts/getSyriFigs.sh | 10 +++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/Snakefile b/Snakefile index c03ec33..e38f585 100644 --- a/Snakefile +++ b/Snakefile @@ -301,7 +301,8 @@ rule SyRI_on_chrInput: -d {output.wrkdir} \ -o $(basename {output.fig}) \ -r {output.wrkdir}/"${{refname}}.fa.gz" \ - -q "${{AllAsmList[*]}}" + -q "${{AllAsmList[*]}}" \ + -h 9 -w 16 mv {output.wrkdir}/$(basename {output.fig}) {output.fig} rm {output.wrkdir}/*.fa """ diff --git a/scripts/getSyriFigs.sh b/scripts/getSyriFigs.sh index 51ccfa5..848e2e4 100755 --- a/scripts/getSyriFigs.sh +++ b/scripts/getSyriFigs.sh @@ -9,9 +9,11 @@ appdir="" # Directory containing apptainer images threads="" wrkdir="" # Working directory (directory used by pggb to store step files like .paf, etc...) output="" # Output Syri figure(s) +height=16 # Figure height +width=9 # Figure width ## Getting arguments -while getopts "r:q:a:t:d:o:" option; do +while getopts "r:q:a:t:d:o:h:w:" option; do case "$option" in r) ref="$OPTARG";; q) qry="$OPTARG";; @@ -19,7 +21,9 @@ while getopts "r:q:a:t:d:o:" option; do t) threads="$OPTARG";; d) wrkdir="$OPTARG";; o) output="$OPTARG";; - \?) echo "Usage: $0 [-r ref] [-q query] [-a appdir] [-t threads] [-d wrkdir] [-o output]" >&2 + h) height="$OPTARG";; + w) width="$OPTARG";; + \?) echo "Usage: $0 [-r ref] [-q query] [-a appdir] [-t threads] [-d wrkdir] [-o output] [-h height] [-w width]" >&2 exit 1;; esac done @@ -93,7 +97,7 @@ for asm in "${asmList[@]}"; do done # Generating the plotsr command -command="--genomes $wrkdir/genomes.txt -o $wrkdir/$output -f 12 -H 16 -W 9 -d 600 " +command="--genomes $wrkdir/genomes.txt -o $wrkdir/$output -f 12 -H $height -W $width -d 600 " # Adding syri files to the command as each needs to be specified using "--sr" argument for file in "${syriFileList[@]}"; do -- GitLab From b59f1f312c49d5fa0af2fc035ef1b564d9a10589 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Mon, 8 Jul 2024 11:01:14 +0200 Subject: [PATCH 77/86] Added Syri on chrInputs --- Snakefile | 9 +++++++-- config.yaml | 2 +- example/config_CICD.yaml | 3 ++- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/Snakefile b/Snakefile index e38f585..c493eea 100644 --- a/Snakefile +++ b/Snakefile @@ -50,9 +50,13 @@ def which_analysis(): if config["get_PAV"] == "True": # Adding PAV matrix creation analysis_inputs.append("output/pav.matrices") - if config["get_ASMs_SyRI"] == "True": # Creating SyRI for each figures + if config["get_ASMs_SyRI"] == "True": # Creating SyRI for each input assembly analysis_inputs.append( - expand("output/asm.syri.figs/pan1c."+config['name']+".{haplotype}.syri.png", haplotype=SAMPLES_NOREF), + expand("output/asm.syri.figs/pan1c."+config['name']+".{haplotype}.syri.png", haplotype=SAMPLES_NOREF) + ) + if config["get_chrInputs_SyRI"] == "True": # Creating SyRI figures for each PGGB input + analysis_input.append( + expand("output/chrInput.syri.figs/"+config['name']+".{chromosome}.asm.syri.png", chromosome=CHRLIST) ) if config["run_Quast"] == "True": # Running Quast on input haplotypes analysis_inputs.append( @@ -576,6 +580,7 @@ rule final_graph_tagging: output: "output/pan1c."+config['name']+".gfa.metadata" threads: 1 + priority: 100 params: app_path=config['app.path'], pan_name=config['name'] diff --git a/config.yaml b/config.yaml index 5fe91c1..0a8502a 100644 --- a/config.yaml +++ b/config.yaml @@ -27,7 +27,7 @@ get_contig_pos: 'True' # Computes Presence Absence Variant matrices for Panache (not recommended; very long) get_PAV: 'False' # Computes SyRI figures for haplotypes -#get_allASM_SyRI: 'False' # All vs all get_ASMs_SyRI: 'False' # Haplotype vs Reference +get_chrInputs_SyRI: 'False' # SyRI on chrInputs # Creating final report create_report: 'True' diff --git a/example/config_CICD.yaml b/example/config_CICD.yaml index 648e686..cc0c279 100644 --- a/example/config_CICD.yaml +++ b/example/config_CICD.yaml @@ -28,6 +28,7 @@ get_contig_pos: 'True' get_PAV: 'False' # Computes SyRI figures for haplotypes #get_allASM_SyRI: 'False' # All vs all -get_ASMs_SyRI: 'False' # Haplotype vs Reference +get_ASMs_SyRI: 'True' # Haplotype vs Reference +get_chrInputs_SyRI: 'True' # SyRI on chrInputs # Creating final report create_report: 'True' -- GitLab From b4779c67f09da8ce25b0996a60aeadd37e33fe71 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Mon, 8 Jul 2024 11:06:41 +0200 Subject: [PATCH 78/86] Update Snakefile --- Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index c493eea..7420323 100644 --- a/Snakefile +++ b/Snakefile @@ -55,7 +55,7 @@ def which_analysis(): expand("output/asm.syri.figs/pan1c."+config['name']+".{haplotype}.syri.png", haplotype=SAMPLES_NOREF) ) if config["get_chrInputs_SyRI"] == "True": # Creating SyRI figures for each PGGB input - analysis_input.append( + analysis_inputs.append( expand("output/chrInput.syri.figs/"+config['name']+".{chromosome}.asm.syri.png", chromosome=CHRLIST) ) if config["run_Quast"] == "True": # Running Quast on input haplotypes -- GitLab From 867217fd644f7c180e3187f4754d34b3a87c426c Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Tue, 9 Jul 2024 10:40:14 +0200 Subject: [PATCH 79/86] Update Snakefile --- Snakefile | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Snakefile b/Snakefile index e38f585..a5d3272 100644 --- a/Snakefile +++ b/Snakefile @@ -97,6 +97,7 @@ rule ragtag_scaffolding: tar="data/hap.ragtagged/{haplotype}.tar.gz" threads: 8 retries: 1 + priority: 100 params: app_path=config["app.path"] log: @@ -213,6 +214,7 @@ rule chromosome_clustering: output: temp(expand('data/chrInputs/'+config['name']+'.{chromosome}.fa', chromosome=CHRLIST)) threads: 1 + priority: 100 params: app_path=config["app.path"], pan_name=config["name"] @@ -316,6 +318,7 @@ rule wfmash_on_chr: mapping=temp("data/chrGraphs/{chromosome}/{chromosome}.wfmash.mapping.paf"), aln=temp("data/chrGraphs/{chromosome}/{chromosome}.wfmash.aln.paf") threads: 16 + priority: 100 params: app_path=config['app.path'], segment_length=config['wfmash.segment_length'], @@ -365,6 +368,7 @@ rule seqwish: output: temp("data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfa") threads: 8 + priority: 100 params: app_path=config['app.path'], seqwish=config['seqwish.params'] @@ -393,6 +397,7 @@ rule gfaffix_on_chr: gfa=temp("data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfaffixD.gfa"), transform="data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfaffixD.transform.txt" threads: 1 + priority: 100 params: app_path=config['app.path'] log: @@ -416,6 +421,7 @@ rule odgi_postprocessing: output: gfa='data/chrGraphs/'+config['name']+'.{chromosome}.gfa' threads: 8 + priority: 100 params: app_path=config['app.path'] log: @@ -478,6 +484,7 @@ rule generate_graph_list: output: "data/chrGraphs/graphsList.txt" threads: 1 + priority: 100 run: with open(output[0], "w") as handle: for file in input: @@ -494,6 +501,7 @@ rule graph_squeeze: cmd="logs/squeeze/"+config['name']+".squeeze.cmd.log", time="logs/squeeze/"+config['name']+".squeeze.time.log", threads: 16 + priority: 100 params: app_path=config['app.path'] shell: -- GitLab From 1ccf65891b89b9b983c4fc30d016e94d302a9242 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Tue, 9 Jul 2024 10:55:17 +0200 Subject: [PATCH 80/86] Update syri_pruning.py Unifying chromosome sequence record's name to prevent SyRI from erroring out when input chromosomes can be matched with 2 query chromosomes --- scripts/syri_pruning.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/syri_pruning.py b/scripts/syri_pruning.py index 811e2b9..2482cb2 100644 --- a/scripts/syri_pruning.py +++ b/scripts/syri_pruning.py @@ -62,6 +62,7 @@ with gzip.open(args.ref, 'rt') as handle: chrname = record.id.split("#")[-1] chrList.append(chrname) seqDict[name][chrname] = record + seqDict[name][chrname].id = chrname # Parsing fasta files for filename in args.fastafiles: @@ -74,6 +75,7 @@ for filename in args.fastafiles: for record in sequences: chrname = record.id.split("#")[-1] seqDict[name][chrname] = record + seqDict[name][chrname].id = chrname # Getting the list of common chromosomes for hap, chromosomes in seqDict.items(): -- GitLab From ea67debd266fb5851d34c29973443d7f04b71e43 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Tue, 9 Jul 2024 12:15:38 +0200 Subject: [PATCH 81/86] Update getSyriFigs.sh Using prunned FASTAs instead of input FASTAs --- scripts/getSyriFigs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/getSyriFigs.sh b/scripts/getSyriFigs.sh index 848e2e4..07cdec7 100755 --- a/scripts/getSyriFigs.sh +++ b/scripts/getSyriFigs.sh @@ -84,7 +84,7 @@ for ((i = 0; i < ${#asmList[@]} - 1; i++)); do # Syri on previous alignment apptainer run $appdir/pan1c-env.sif \ - syri -c $wrkdir/$bamFile -r ${asmList[i]} -q ${asmList[i + 1]} -k -F B \ + syri -c $wrkdir/$bamFile -r $wrkdir/$(basename ${asmList[i]} .gz) -q $wrkdir/$(basename ${asmList[i + 1]} .gz) -k -F B \ --nc $threads \ --dir $wrkdir --prefix "$(basename ${asmList[i]} .fa.gz)_$(basename ${asmList[i + 1]} .fa.gz)." done -- GitLab From 204d3b62b6932029f4664554fbee0a7dfc59984a Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Tue, 9 Jul 2024 15:53:30 +0200 Subject: [PATCH 82/86] Update getSyriFigs.sh --- scripts/getSyriFigs.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/scripts/getSyriFigs.sh b/scripts/getSyriFigs.sh index 07cdec7..92b1cdb 100755 --- a/scripts/getSyriFigs.sh +++ b/scripts/getSyriFigs.sh @@ -69,6 +69,11 @@ for ((i = 0; i < ${#asmList[@]} - 1; i++)); do apptainer run $appdir/pan1c-env.sif python scripts/syri_pruning.py \ -r ${asmList[i]} -f ${asmList[i + 1]} -o $wrkdir + echo "\n[Debug::SyRI_script::$hapID] REF" + grep '>' ${asmList[i]} + echo "\n[Debug::SyRI_script::$hapID] QRY" + grep '>' ${asmList[i + 1]} + # Renaming chromosomes with same ids as the reference (Not working) #sed -i "s/${hapID}#chr/${refID}#chr/g" $wrkdir/$(basename ${asmList[i + 1]} .gz) -- GitLab From ba890499fe98c030df6542da90dbe262994796de Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Tue, 9 Jul 2024 15:55:57 +0200 Subject: [PATCH 83/86] Update getSyriFigs.sh --- scripts/getSyriFigs.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/getSyriFigs.sh b/scripts/getSyriFigs.sh index 92b1cdb..972d2dc 100755 --- a/scripts/getSyriFigs.sh +++ b/scripts/getSyriFigs.sh @@ -70,10 +70,10 @@ for ((i = 0; i < ${#asmList[@]} - 1; i++)); do -r ${asmList[i]} -f ${asmList[i + 1]} -o $wrkdir echo "\n[Debug::SyRI_script::$hapID] REF" - grep '>' ${asmList[i]} + grep '>' $wrkdir/$(basename ${asmList[i]} .gz) echo "\n[Debug::SyRI_script::$hapID] QRY" - grep '>' ${asmList[i + 1]} - + grep '>' $wrkdir/$(basename ${asmList[i + 1]} .gz) + # Renaming chromosomes with same ids as the reference (Not working) #sed -i "s/${hapID}#chr/${refID}#chr/g" $wrkdir/$(basename ${asmList[i + 1]} .gz) -- GitLab From 2cad655b30c1917c30bdda9000eaf6cabb9ea1b1 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Tue, 9 Jul 2024 15:57:56 +0200 Subject: [PATCH 84/86] Update getSyriFigs.sh --- scripts/getSyriFigs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/getSyriFigs.sh b/scripts/getSyriFigs.sh index 972d2dc..e85ea25 100755 --- a/scripts/getSyriFigs.sh +++ b/scripts/getSyriFigs.sh @@ -98,7 +98,7 @@ done # Each line contains 2 columns : fasta filepath and its simpler name echo -e "#files\tname" > $wrkdir/genomes.txt for asm in "${asmList[@]}"; do - echo -e "${asm}\t$(basename $asm .fa.gz | cut -d'.' -f1)" >> $wrkdir/genomes.txt + echo -e "$wrkdir/$(basename ${asm} .gz)\t$(basename $asm .fa.gz | cut -d'.' -f1)" >> $wrkdir/genomes.txt done # Generating the plotsr command -- GitLab From 059b5f346bece0dcfc40bca951de04c7711d14bb Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Tue, 9 Jul 2024 16:06:07 +0200 Subject: [PATCH 85/86] Update Snakefile --- Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index 94cb1f6..a86dcef 100644 --- a/Snakefile +++ b/Snakefile @@ -274,7 +274,7 @@ rule SyRI_on_chrInput: output: fig="output/chrInput.syri.figs/"+config['name']+".{chromosome}.asm.syri.png", wrkdir=directory('data/chrInputs/syri/{chromosome}') - threads: workflow.cores + threads: 8 params: app_path=config["app.path"], ref=config['reference'] -- GitLab From 249b23d0fd2492d4d43581c8d9b7a00d986249fc Mon Sep 17 00:00:00 2001 From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr> Date: Thu, 11 Jul 2024 12:15:39 +0200 Subject: [PATCH 86/86] Silented some debug lines --- Snakefile | 2 +- scripts/getSyriFigs.sh | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Snakefile b/Snakefile index a86dcef..71c2bb3 100644 --- a/Snakefile +++ b/Snakefile @@ -299,7 +299,7 @@ rule SyRI_on_chrInput: AllAsmList+=("$(dirname $file)/$asm") done - echo "The ASM Array : ${{AllAsmList[@]}}" + #echo "The ASM Array : ${{AllAsmList[@]}}" bash scripts/getSyriFigs.sh \ -a {params.app_path} \ diff --git a/scripts/getSyriFigs.sh b/scripts/getSyriFigs.sh index e85ea25..84f93c2 100755 --- a/scripts/getSyriFigs.sh +++ b/scripts/getSyriFigs.sh @@ -67,12 +67,12 @@ for ((i = 0; i < ${#asmList[@]} - 1; i++)); do # Prunning fasta files based of common chromosomes apptainer run $appdir/pan1c-env.sif python scripts/syri_pruning.py \ - -r ${asmList[i]} -f ${asmList[i + 1]} -o $wrkdir + -r ${asmList[i]} -f ${asmList[i + 1]} -o $wrkdir -d - echo "\n[Debug::SyRI_script::$hapID] REF" - grep '>' $wrkdir/$(basename ${asmList[i]} .gz) - echo "\n[Debug::SyRI_script::$hapID] QRY" - grep '>' $wrkdir/$(basename ${asmList[i + 1]} .gz) + #echo "\n[Debug::SyRI_script::$hapID] REF" + #grep '>' $wrkdir/$(basename ${asmList[i]} .gz) + #echo "\n[Debug::SyRI_script::$hapID] QRY" + #grep '>' $wrkdir/$(basename ${asmList[i + 1]} .gz) # Renaming chromosomes with same ids as the reference (Not working) #sed -i "s/${hapID}#chr/${refID}#chr/g" $wrkdir/$(basename ${asmList[i + 1]} .gz) -- GitLab