1. Fastqc 로 퀄리티 확인
https://www.bioinformatics.babraham.ac.uk/projects/fastqc/
위 홈페이지에서 fastqc를 다운로드 받은후,
"fastqc 샘플이름"
를 입력하면 아래와 같은 Read퀄리티 보고서를 볼 수 있다.
2. Trimmomatic
http://www.usadellab.org/cms/?page=trimmomatic
기본 옵션을 이용하여 돌려줄 때는 아래와 같습니다.
java -jar trimmomatic-0.39.jar PE input_forward.fq.gz input_reverse.fq.gz output_forward_paired.fq.gz output_forward_unpaired.fq.gz output_reverse_paired.fq.gz output_reverse_unpaired.fq.gz ILLUMINACLIP:TruSeq3-PE.fa:2:30:10:2:keepBothReads LEADING:3 TRAILING:3 MINLEN:36
아래는 여러 샘플을 돌릴때 쓰는 파이썬 스크립트입니다.
import sys,re,os,string,glob ################################# # main ################################# #Step 1: Trimmomatic to remove adapter sequences (in order to generate clean read) my_path = "/disk3/lant/" data_path = "1.Raw_data" os.chdir(my_path + data_path) file = glob.glob("*R1.fastq.gz") os.chdir(my_path + '2.Trimmomatic') for id_left in file: id_right = id_left.replace("R1", "R2") id_log = id_left.replace(".R1.fastq.gz", ".log") id_left_output_paired = id_left.replace("R1.fastq.gz", "R1.paired.fastq.gz") id_right_output_paired = id_right.replace("R2.fastq.gz", "R2.paired.fastq.gz") id_left_output_unpaired = id_left.replace("R1.fastq.gz", "R1.unpaired.fastq.gz") id_right_output_unpaired = id_right.replace("R2.fastq.gz", "R2.unpaired.fastq.gz") cmd = 'java -jar /home/Program/Trimmomatic-0.32/trimmomatic-0.32.jar PE -threads 16 -phred33 ../%s/%s ../%s/%s %s %s %s %s ILLUMINACLIP:/home/Program/Trimmomatic-0.32/adapters/TruSeq3-PE-2.fa:2:30:10 MINLEN:75 2> %s'%(data_path, id_left, data_path, id_right, id_left_output_paired, id_left_output_unpaired, id_right_output_paired, id_right_output_unpaired, id_log) print(cmd) #os.system(cmd)
3. CutAdapt
https://cutadapt.readthedocs.io/en/stable/
CutAdapt로 여러 샘플을 돌릴때 쓰는 파이썬 스크립트입니다.
import glob,os RawPath = "./1-2.Raw_data_Naming/" OutputPath = "./2.Trim/" for sFile in glob.glob(RawPath+"*_R1.fastq"): sFastqName = os.path.split(sFile)[1] print(sFastqName) Left = sFastqName Right = sFastqName.replace("_R1.fastq","_R2.fastq") Log = sFastqName.replace("_R1.fastq",".log") CMD = "~/.local/bin/cutadapt -m 10 -q 30,30 -b a{300} -b t{300} -B a{300} -B t{300} -o %s%s -p %s%s %s%s %s%s > %s%s"%(OutputPath,Left,OutputPath,Right,RawPath,Left,RawPath,Right,OutputPath,Log) print(CMD) os.system(CMD)