pySeqRNA
About Tutorial Download Help

Heat and drought induced transcriptomic changes in barley varieties with contrasting stress response phenotypes¶

Load All the Required Modules¶

In [ ]:
import matplotlib.pyplot as plt
  from matplotlib.transforms import Bbox
  from pyseqrna import arg_parser
  from pyseqrna import pyseqrna_utils as pu
  from pyseqrna import quality_check as qc
  from pyseqrna import quality_trimming as qt
  from pyseqrna import  aligners as al
  from pyseqrna import pyseqrna_stats as ps
  from pyseqrna import quantification as quant
  from pyseqrna import differential_expression as de
  from pyseqrna import pyseqrna_plots as pp
  from pyseqrna import ribosomal as ribo
  from pyseqrna import clustering as cl
  from pyseqrna import multimapped_groups as mmg
  from pyseqrna.gene_ontology import GeneOntology
  from pyseqrna.pathway import Pathway
  from pyseqrna.normalize_counts import Normalization
  from pyseqrna import report
  
  import math
  import pyseqrna.version
  import pandas as pd
  import numpy as np
  import time
  import os, sys
  
  from waiting import wait
  

Initialize a Logger¶

In [ ]:
log = pu.PyseqrnaLogger(mode="w", log="analysis")
  
  log.info("Analysis started")
  

Options¶

In [ ]:
slurm = True
  memory = 100
  threads = 30
  paired= False
  fold = 2
  fdr = 0.05
  species='hvulgare'
  speciestype = 'plants'
  
In [ ]:
outdir = pu.make_directory("pyseqrna_paper_results")
  

Genome and Gene Feature File¶

In [ ]:
reference_genome="./Hordeum_vulgare.MorexV3_pseudomolecules_assembly.dna.toplevel.fa"
  feature_file = "./Hordeum_vulgare.MorexV3_pseudomolecules_assembly.52.gff3"
  
In [ ]:
combination = ["GA-GE", "GB-GF", "GC-GG", "GD-GH", "GI-GM", "GJ-GN", "GK-GO", "GL-GP", "GQ-GU", "GR-GV", "GS-GW", "GT-GX",
                 "OA-OE", "OB-OF", "OC-OG", "OD-OH", "OI-OM", "OJ-ON", "OK-OO", "OL-OP", "OQ-OU", "OR-OV", "OS-OW", "OT-OX"]
  

Read Input Sample File¶

In [ ]:
input_data = pu.read_input_file(infile="targets_pyseqrna.txt", inpath="./input/")
  
In [ ]:
samples = input_data['samples']
  targets = input_data['targets']
  combination = input_data['combinations']
  

Check FASTQ Files Quality¶

In [ ]:
qualitydir = pu.make_directory(os.path.join(outdir, "1_Quality"))
      
  jobid, fastqcout = qc.fastqcRun(sampleDict=samples,outDir=qualitydir, slurm=slurm, mem=memory, cpu=threads, pairedEND=paired)
  
  for job in jobid:
      wait(lambda: pu.check_status(job), waiting_for="quality to finish")
      log.info(f"Quality check completed for job {job}")
  
  log.info("Read quality check completed succesfully")
  

Trim FASTQ Files¶

In [ ]:
outtrim, jobidt = qt.trim_galoreRun(sampleDict=samples,slurm=slurm,mem=memory,cpu=threads, outDir=qualitydir,paired=paired)
  
  for job in jobidt:
      wait(lambda: pu.check_status(job), waiting_for="trimming to finish")
      log.info(f"Trimming completed for job {job}")
  
  log.info("Read trimming completed succesfully")
  

Align Reads on Reference Genome¶

In [ ]:
log.info("Starting Alignment process")
  
  aligndir = pu.make_directory(os.path.join(outdir, "2_Alignment"))
  
In [ ]:
aligner= al.STAR_Aligner(genome=reference_genome,  slurm=slurm,  outDir=aligndir)
  
  log.info("Genome indexing started")
  
  jobida = aligner.build_index(mem=memory,cpu=threads)
  
  wait(lambda: pu.check_status(jobida), waiting_for="genome indexing to finish")
  log.info("Genome index completed succesfully")
  
  if aligner.check_index():
      log.info("Genome index is valid")
      
  
  outalign, jobalign = aligner.run_Alignment(outtrim, pairedEND=paired, mem=memory, cpu=threads)
  
  for job in jobalign:
      wait(lambda: pu.check_status(job), waiting_for="alignment to finish")
  
      log.info(f"Alignment completed for job {job}")
  
  log.info("Read alignment completed succesfully")
  
In [ ]:
align_stat = ps.align_stats(samples,outtrim, outalign,pairedEND=paired)
  
  align_stat.to_excel(aligndir+"/alignment_statistics.xlsx", index=False)
  
  log.info(f"alignment stats completed and written to {aligndir}/alignment_statistics.xlsx")
  

Feature counts¶

In [ ]:
log.info("Feature Count from aligned reads started")
  
  quantdir = pu.make_directory(os.path.join(outdir, "3_Quantification"))
  
  fjob = quant.featureCount(gff=feature_file, bamDict=outalign, outDir=quantdir)
  
  log.info("feature counts completed and written in %s/Raw_Counts.xlsx",quantdir)
  

Count Multi-mapped Gene Groups¶

In [ ]:
log.info("Counting multimapped read groups")
      
  mmg_count = mmg.countMMG(sampleDict=samples,bamDict=outalign,gff=feature_file, minCount=100, percentSample=0.5)
  
  mmg_count.to_excel(os.path.join('pyseqrna_paper_results/3_Quantification',"Raw_MMGcounts.xlsx" ),index=False)
  
  log.info("counting of multimapped read group finished")
  

Differential Gene Expression¶

In [ ]:
diffdir = pu.make_directory(os.path.join(outdir, "4_Differential_Expression"))
  
  count=pd.read_excel(os.path.join(quantdir,"Raw_Counts.xlsx"))
  
  result = de.runDESeq2(countDF=count,targetFile=targets,design='sample', combination=combination, subset=False)
  
  log.info("Differential expression analysis completed")
  
  ge = de.Gene_Description(species=species, type=speciestype, degFile=result, filtered=False)
  
  results = ge.add_names()
  
  results.to_excel(os.path.join(diffdir,"All_gene_expression.xlsx"), index=False)
  
  log.info(f"Filtering differential expressed genes based on logFC {fold} and FDR {fdr}")
  
  filtered_DEG= de.degFilter(degDF=results, CompareList=combination,FDR=fdr, FOLD=fold, extraColumns=True)
  
In [ ]:
log.info("filtering DEGs completed ")
  
  log.info("writting filter DEGs combination wise to excel sheets")
  # write up and down filtered genes together
  wa = pd.ExcelWriter(os.path.join(diffdir,"Filtered_DEGs.xlsx"))
  
  for key, value in filtered_DEG['filtered'].items():
      value.to_excel(wa,sheet_name=key, index=False)
      wa.save()
  wa.close()
  
  filtered_DEG['plot'].savefig(os.path.join(diffdir,"Filtered_DEG.png"),dpi=300, bbox_inches='tight')
  

Create Heatmap¶

In [ ]:
plotdir = pu.make_directory(os.path.join(outdir, "5_Visualization"))
  
In [ ]:
log.info("Creating heatmap of top 50 DEGs")
  
  heatmap, ax = pp.plotHeatmap(result,combination,num=50, type='degs')
  
  heatmap.savefig(os.path.join(plotdir,f"Heatmap_top50.png"), bbox_inches='tight')
  

Extract Differantial Gene IDs for Functional Annotation¶

In [ ]:
genesdir = pu.getGenes(os.path.join(diffdir,"Filtered_DEGs.xlsx"),combinations=combination, outDir=diffdir)
  
In [ ]:
annodir = pu.make_directory(os.path.join(outdir, "6_Functional_Annotation"))
  

Gene Ontology¶

In [ ]:
outgo = pu.make_directory(os.path.join(annodir,"Gene_Ontology"))
  
  gofiles = pu.make_directory(os.path.join(outgo,"GO_Files"))
  
  goplots = pu.make_directory(os.path.join(outgo,"GO_Plots"))
  
  go = GeneOntology(species=species, type=speciestype, keyType='ensembl')
  
  
  for c in combination:
  
      file_deg = f"{genesdir}/{c}.txt"
  
      ontology_results = go.enrichGO(file=file_deg)
  
      if ontology_results != "No Gene Ontology":
  
          go_results = ge.add_names_annotation(ontology_results['result'])
  
          go_results.to_excel(f"{gofiles}/{c}_gene_ontology.xlsx", index=False)
  
          ontology_results['plot'].savefig(f"{goplots}/{c}_go_dotplot.png", bbox_inches='tight')
          plt.close()
      else:
          log.info(f"No ontology found in {c}")
  
In [ ]:
annodir = os.path.join(outdir, "6_Functional_Annotation")
  

KEGG Pathway Analysis¶

In [ ]:
outkegg = pu.make_directory(os.path.join(annodir,"KEGG_Pathway"))
  
  keggfiles = pu.make_directory(os.path.join(outkegg,"KEGG_Files"))
  
  keggplots = pu.make_directory(os.path.join(outkegg,"KEGG_Plots"))
  
  pt = Pathway(species=species, keyType='ensembl')
  
  ge = de.Gene_Description(species=species, type=speciestype, filtered=False)
  
  for c in combination:
  
      file_deg = f"{genesdir}/{c}.txt"
  
      kegg_results = pt.enrichKEGG(file_deg)
  
      if kegg_results != "No Pathway":
          
          k_result = ge.add_names_annotation(kegg_results['result'])
  
          k_result.to_excel(os.path.join(keggfiles, f"{c}_kegg.xlsx"), index=False)
  
          kegg_results['plot'].savefig( f"{keggplots}/{c}_kegg_dotplot.png", bbox_inches='tight')
  
          plt.close()
  
      else:
          log.info(f"No pathway found in {c}")
  

© 2022