#!/usr/bin/perl # # CLOBB.pl - the 'new improved' clustering algorithm # Usage CLOBB.pl # Requisities - a directory named 'sequences' which # contains fasta files of individual sequences. Other # directories and files are created as needed by the # program. Note cluster name is typically a 3 letter # identifier (e.g. DRC - for danio rerio clusters). # # CLOBB makes a comma delimited list ('master') of the sequences # which is then used to cluster each sequence sequentially. # You may also have to change the location of the # ncbi blast executables in the lines : # # system "blastall -p blastn -d $DATABASE -i $seq_files/$file -G 2 -E 1 -v 100 -F F -o $rootdir/$file.out >> $log"; # system "formatdb -i $DATABASE_LOCATION -p F"; # # CLOBB produces the following output : # EST - a fasta database of the sequences # with cluster assignments - EST00001 -> EST99999 # A directory 'OUT' which contains a logfile detailing the treatment # of each sequence. # A file called 'merge' which contains a list of all the clusters # which have been merged into another cluster. # A file called 'superclusters' which lists related clusters # # use File::stat; $CLUSTERNAME = $ARGV[0]; $DATABASE = $CLUSTERNAME; $DATABASE .= "EST"; $DATABASE_LOCATION = "./$DATABASE"; $| = 1; # make STDOUT print buffer flush immediately $rootdir = "./OUT"; $seq_files = "./sequences"; # Directory to find the sequence files $seq_files1 = "./sequences_done"; # Directory to place used sequence files # Lets set up the directories and relevant files if not # already present $INDNUMBER=0; if(!stat(OUT)) { system("mkdir OUT"); } if(!stat($seq_files1)) { system("mkdir $seq_files1"); } if(!stat(superclusters)) { system("touch superclusters"); } if(!stat(merge)) { system("touch merge"); } if(!stat($DATABASE)) { system("touch $DATABASE"); } else # find the old INDNUMBER from the database { open(fh,"<$DATABASE"); while($line=) { if($line=~/$CLUSTERNAME(\d+)/ && $1 > $INDNUMBER) { $INDNUMBER=$1; $INDNUMBER=~s/^0+//; print "New Indnum is $INDNUMBER\n"; } } close(fh); } # Create the master list for cluster generation if(stat(master)) { system("rm master"); } opendir(FILEDIR, "$seq_files"); open(FH,">>master"); while (defined($file= readdir(FILEDIR))) { if($file!~/^\./) { $size=stat("$seq_files/$file")->size; if($size > 100) { print FH "$file,"; } # Minimum size of file for clustering } } close(FH); closedir(FILEDIR); # open the master file and use it to create a search list open (MASTER, "master"); @masterlist = split(',', ); @searchlist = @masterlist; # Open a log file for the session. Use the date to give each logfile a # distinctive name. Use a "tee" to allow blast information to go to the terminal # as well as to the logfile. Note: Error messages from "blastn" (below) go only # to the log file; # ($min,$hr,$mday,$yr) = (localtime)[1,2,3,5]; $min = "0" . $min if $min < 10; $hr = "0" . $hr if $hr < 10; $mon =(Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec)[(localtime)[4]]; $log = "$rootdir/logfile_$hr:${min}_$mday$mon$yr"; open(LOG, "| tee -a $log"); select(LOG); $| = 1; # Look at the sequence files, one file at a time. # You have to be in the correct source directory to do this. # $j = 0; SEARCH: while($file = shift(@searchlist)) { print LOG "Blasting $file\n"; # You may need to alter the line below to reflect the # whereabouts of the blast executable. system "blastall -p blastn -d $DATABASE -i $seq_files/$file -G 2 -E 1 -v 100 -F F -o $rootdir/$file.out >> $log"; open(OUTFILE,"<$rootdir/$file.out") || die "can't open $rootdir/$file.out\n"; $chimera=0; $e_flag=0; $s_flag=0; $sim=''; $name=''; $k = 0; $kk=0; $i_hit=0; $e_hit=0; ## Overlap variables for reclustering here $over_num=0; $subject_length[$over_num]= 0; $subject_start[$over_num]= 0; $subject_end[$over_num]= 0; $query_start[$over_num]= 0; $query_stop[$over_num]= 0; while () { # When we hit a new HSP we need to deal with the old HSP first # First check the sequence that was read in for low 'Identity' # High E value hits if($s_flag==1) { # The E value score is high enough to warrant # looking for a region of high % identity. If # a stretch of 30 bases with > 95% identity is # found then regard this HSP as a 'pukka' hit. if(/^>/ || /Score/ || /Database/) { $e_hit=0; $s_flag=0; $array=''; $n=0; while ($sim=~/(.)/g) { $array.=$1; $n++; if($n > 30) { $array=~s/.//; $match=0; $no_match=0; while($array=~/(.)/g) { if($1=~/\|/) { $match++; } if($1=~/\s/) { $no_match++; } } $percent=$match/($match+$no_match); if($percent> 0.94) { $e_hit=1; last; } } } } if(/\|/) { chomp; $_=~s/\s{11}//; $sim.=$_; } } # End of section checking high E value HSPs for %identity if((/^>/ || /Score/ || /Database/) && ($i_hit==1 || $e_hit==1)) { # We have a match of sorts now # check to see if seqs overlap without matching # properly. NB If accepted one HSP, must ignore # next HSPs which may be from the same subject. # This is acheived using the chimera flag. # An overlap of 10% of the query/subject length # is regarded as OK (since the start/ends of # reads tends to be low quality. If the overlap # is larger - then number of N's in the overlap # are counted. See count_ns sub routine for criteria # for regarding overlaps as pukka. if($chimera == 0) { $sl_flag=0; if($s_begin < $s_end) # plus strand { $o_length=$q_length-$q_end; $o_error=$q_length/10; if($s_length-$s_end < $o_length) { $o_length=$s_length-$s_end; $o_error=$s_length/10; } if($o_length > $o_error) { $sl_flag=1; count_ns($o_length,$s_end,$q_end,1,1); print("$sl_flag\n"); } $o_length=$q_begin; $o_error=$q_length/10; if($s_begin < $o_length) { $o_length=$s_begin; $o_error=$s_length/10; } if($o_length > $o_error) { $sl_flag=1; count_ns($o_length,$s_begin,$q_begin,-1,-1); print("$sl_flag\n"); } } if($s_begin > $s_end) # minus strand { $o_length=$q_length-$q_end; $o_error=$q_length/10; if($s_end < $o_length) { $o_length=$s_end; $o_error=$s_length/10; } if($o_length > $o_error) { $sl_flag=1; count_ns($o_length,$s_end,$q_end,-1,1); print("$sl_flag\n"); } $o_length=$q_begin; $o_error=$q_length/10; if($s_length-$s_begin < $o_length) { $o_length=$s_length-$s_begin; $o_error=$s_length/10;} if($o_length > $o_error) { $sl_flag=1; count_ns($o_length,$s_begin,$q_begin,1,-1); print("$sl_flag\n"); } } if($sl_flag==1) # Its the middle which overlaps # And it should be placed in a new cluster # Can also tag it to show it has similarity to.. # Reject other matches to the same cluster { print LOG ("$file is similar to $clustername ($subfile)\n"); print LOG ("qb=$q_begin qe=$q_end ql=$q_length\n"); print LOG ("sb=$s_begin se=$s_end sl=$s_length\n"); $chimname[$kk]=$clustername; $kk++; $chimera=1; } if($sl_flag==0) # The overlap looks genuine and should be assigned # to the same cluster { print LOG ("$file appears part of $clustername ($subfile)\n"); $chimera=2; $subject_length[$k]=$s_length; $subject_start[$k]= $s_begin; $subject_end[$k]= $s_end; $query_start[$k]= $q_begin; $query_stop[$k]= $q_end; $clustermatch[$k] = $clustername; $k++; } } $i_hit=0; $e_flag=0; $e_hit=0; } # End of loop dealing with a pukka sequence # This next section tries to identify pukka sequences on # basis of E-value and/or Identity. $clustername=$name; if(/(\d+)\sletters/) { $q_length=$1; } # Length of query # This next section finds and resets a number of variables for each # Subject - NB the OVs in the line below may need changing to # suit your sequences : if(/^>(\w+)/) { $subfile=$1; $chimera=0; } if(/($CLUSTERNAME\d{5,6})/) { $name = $1; } if(/\sLength\s\=\s(\d+)/) { $s_length = $1; } # This next section finds a number of variables for each # HSP to allow determination of overlaps : if(/\sIdentities\s\=\s\d+\/\d+\s\(\d+/) { $q_begin=''; $q_end=''; $s_begin=''; $s_end=''; } if(/Query:\s(\d+)/ && $q_begin == '') { $q_begin=$1; } if(/Sbjct:\s(\d+)/ && $s_begin == '') { $s_begin=$1; } if(/Query:\s\d+\s+.+\s+(\d+)/) { $q_end=$1; } if(/Sbjct:\s\d+\s+.+\s+(\d+)/) { $s_end=$1; } # Flag HSPs which have high e values but below # the identity cutoff to look for regions of # high identity if(/\sExpect\s\=\s(\w.*)/) { $score=$1; if(($score=~/e-(\d+)/ && $1 > $EXP) || $score=~/0\.0$/) { $e_flag=1; } } if(/\sIdentities\s\=\s(\d+)\/(\d+)\s\((\d+)/ && $1 > 30 && $3 < 96 && $e_flag==1) { ################# Do local identity checking ############# ################# But only if Identity wasn't high ############# $s_flag=1; $sim=''; $e_flag=0; } # If the HSP has a > 94% Identity if(/\sIdentities\s\=\s(\d+)\/(\d+)\s\((\d+)/ && $1 > 30 && $3 > 94) { $i_hit=1; $h_length=$1; } } # End of loop searching for pukka sequences close(OUTFILE); $merge_flag=1; if ($k == 0) { # never got any matches $rightstring = $CLUSTERNAME . &index(++$INDNUMBER); } elsif ($k == 1) { # got a single match $rightstring = $clustermatch[0]; if($kk>0) { $pukka_name_flag=1; for($nn=0;$nn<$kk;$nn++) { if($clustermatch[0] eq $chimname[$nn]) { $pukka_name_flag=0; last; } } if($pukka_name_flag==0) { $rightstring = $CLUSTERNAME . &index(++$INDNUMBER); } } } elsif($k > 1) # got more than one match { # First find lowest cluster number $num_clus=1; $cluster_num=10000000; for($n=0;$n<$k;$n++) { $clustermatch[$n]=~/$CLUSTERNAME(\d+)/; if($1 < $cluster_num) { $low_cluster=$n; $cluster_num=$1; } } # Need to find out if the two subject hits overlap # by > 30, if yes then reject the read -> problem read # Could actually treat this as a new cluster - or stick in # cluster with highest identity/E value ? # Otherwise merge the clusters for($n=0;$n<$k;$n++) { for($nn=$n+1;$nn<$k;$nn++) { if($clustermatch[$n] ne $clustermatch[$nn]) { $num_clus++; print ("Thinking about merging - $query_start[$n] $query_stop[$n] $query_start[$nn] $query_stop[$nn]\n"); # Also cases where start1=start2 & end1=end2 if($query_start[$n] == $query_start[$nn] || $query_stop[$n] == $query_stop[$nn]) { $merge_flag=0; } if($query_start[$n] > $query_start[$nn] && $query_start[$n]+30 < $query_stop[$nn]) { $merge_flag=0; } if($query_start[$nn] > $query_start[$n] && $query_start[$nn]+30 < $query_stop[$n]) { $merge_flag=0; } } } # The next section marks a cluster as a problem # since the read matched part of a cluster well # but another part of the same cluster badly # possibly an alternative splice ? # Also these clusters should not therefore be merged if($kk>0) { for($nn=0;$nn<$kk;$nn++) { if($clustermatch[$n] eq $chimname[$nn]) { $chimera=3; $merge_flag=0; } } } } if($merge_flag==0) # This read hits > 1 clusters which # are obviously different clusters # so we'll give this read the same # cluster number as the highest hit. # provided that there weren't any clashes { $rightstring = $CLUSTERNAME . &index(++$INDNUMBER); for($n=0;$n<$k;$n++) { $pukka_name_flag=1; for($nn=0;$nn<$kk;$nn++) { if($clustermatch[$n] eq $chimname[$nn]) { $pukka_name_flag=0; last; } } if($pukka_name_flag==1) { $rightstring = $clustermatch[$n]; last; } else { $clustermatch[$k]=$rightstring; $k++; } } if($pukka_name_flag==0) { $clustermatch[$k]=$rightstring; $k++; } &problem($file); print LOG "***** $file hits more than 1 cluster *****\n"; } elsif($merge_flag==1) # merge the clusters ! { for($n=0;$n<$k;$n++) { if($clustermatch[$n] ne $clustermatch[$low_cluster]) { system("sed 's/$clustermatch[$n]/$clustermatch[$low_cluster]/g' $DATABASE_LOCATION > temp_cluster"); system("mv temp_cluster $DATABASE_LOCATION"); print LOG "Merging $clustermatch[$n] into $clustermatch[$low_cluster]\n"; open(MERGE,">>merge"); print MERGE "$clustermatch[$n] => $clustermatch[$low_cluster]\n"; close(MERGE); } } $rightstring = $clustermatch[$low_cluster]; } if($chimera==3) { # if ($num_clus == 1) { # The same cluster was hit several times # # but was a 'bad' hit in at least one case # $rightstring = $CLUSTERNAME . &index(++$INDNUMBER); # $clustermatch[$k]=$rightstring; $k++; # } # The read assigned the cluster number from above # is added to the superclusters open(CHIM_FILE,"superclusters"); $chim_flag=0; while($line=) { if($line=~/>$clustermatch[$low_cluster]/) { $chim_flag=1; last; } $line=~/>(.+)/; $chim_cluster=$1; if($line=~/$clustermatch[$low_cluster]/) { $chim_flag=2; last; } } if($chim_flag==2) { close(CHIM_FILE); open(CHIM_FILE,"superclusters"); $chim_flag=0; while($line=) { if($line=~/>$chim_cluster/) { $chim_flag=1; last; } } } if($chim_flag==1) ### This cluster already recorded as part of a ### supercluster { $line_no=$.; $line=; $old_line=$line; chomp($line); $new_line=$line; for($n=0;$n<$k;$n++) { if($line!~/$clustermatch[$n]/) { if($line!~/$clustermatch[$n]/) { $new_line.=$clustermatch[$n]; $new_line.=' '; } $line=$new_line; } } close(CHIM_FILE); open(CHIM_FILE,"superclusters"); open(TMP_CHIM,"> tmpchim"); select(TMP_CHIM); while() { if($. == $line_no+1) { print TMP_CHIM "$new_line\n"; } else { print TMP_CHIM $_; } } close(TMP_CHIM); close(CHIM_FILE); system("mv -f tmpchim superclusters"); } if($chim_flag==0) { $line=''; $new_line=''; for($n=0;$n<$k;$n++) { if($line!~/$clustermatch[$n]/) { $new_line.=$clustermatch[$n]; $new_line.=' '; } $line=$new_line; } close(CHIM_FILE); system("echo '>$clustermatch[$low_cluster]\n$line' >> superclusters"); print LOG "***** $clustermatch[$low_cluster] appears to be a supercluster *****\n"; } } } # Assign the read to a cluster old or new # Also need to change location of formatdb executable &newfile($file, $rightstring); print LOG "$file assigned $rightstring\n"; system "cat $seq_files1/$file >> $DATABASE_LOCATION"; system "echo '\n' >> $DATABASE_LOCATION"; system "formatdb -i $DATABASE_LOCATION -p F"; $j++; } print "the current $CLUSTERNAME is $INDNUMBER"; ################################ subroutines ################################ # Put zeros in front of the index to make it four digits long. # Can add an extra line to enable clusters > 100000 sub index{ local($i) = @_; if ($i < 10) { $index = "0000$i"; } elsif ($i < 100) { $index = "000$i"; } elsif ($i < 1000) { $index = "00$i"; } elsif ($i < 10000) { $index = "0$i"; } elsif ($i < 100000) { $index = "$i";} else { die "index out of range\n"; } } # takes a file and copies it and its "out" file to the error directory sub problem { ($badfile) = @_; @badarray = split('/', $badfile); $badroot = pop( @badarray ); $badout = "$rootdir/$badroot.out"; system "cp $badout $rootdir/$CLUSTERNAMEerror"; print LOG "Copying problem file $file to error directory\n"; } # This is redundant but could be added to seperate different # types of problem reads sub problemchim { ($badfile) = @_; @badarray = split('/', $badfile); $badroot = pop( @badarray ); $badout = "$rootdir/$badroot.out"; system "mv $badfile $badout $rootdir/$CLUSTERNAMEchim"; print LOG "Moving problem file $file to Chim directory\n"; } sub newfile { ($file, $newstring) = @_; system("mv $seq_files/$file $seq_files1/$file.old"); open(OLDFILE,"$seq_files1/$file.old"); open(FILE,">$seq_files1/$file"); while() { print FILE unless /^>/; if (/^>/) { chop($original = $_); print FILE "$original $newstring\n"; } } close(OLDFILE); close(FILE); system("rm $rootdir/$file.out"); } sub count_ns { my $n_max=shift; my $sfile_start=shift; my $qfile_start=shift; my $s_add=shift; my $q_add=shift; print"$n_max $sfile_start $qfile_start $s_add $q_add\n"; open(SUBJECT_FILE,"<$seq_files/$subfile"); $n_seq=''; $n_num=0; while($line=) { chop$line; if(substr($line,0,1) ne ">") { $n_seq.=$line; } } close(SUBJECT_FILE); @seq_array=split('',$n_seq); for($n=0;$n<$n_max;$n++) { if($seq_array[$sfile_start+($n*$s_add)] eq "N") { $n_num++; } } if($n_num/$n_max > 0.05) { $sl_flag=0; print LOG "$subfile Overlap N's high ($n_num N's)\n"; } else { open(QUERY_FILE,"<$seq_files/$file"); $n_seq=''; $n_num=0; while($line=) { chop$line; if(substr($line,0,1) ne ">") { $n_seq.=$line; } } @seq_array=split('',$n_seq); for($n=0;$n<$n_max;$n++) { if($seq_array[$qfile_start+($n*$q_add)] eq "N") { $n_num++; } } if($n_num/$n_max > 0.05) { $sl_flag=0; print LOG "$file Overlap N's high ($n_num N's)\n"; } close(QUERY_FILE); } } =head1 NAME CLOBB.pl - A program for clustering sequences on the basis of BLAST similarity =head1 SYNOPSIS CLOBB.pl =head1 DESCRIPTION This program takes a set of DNA sequences and clusters them into groups which putatively derive from the same gene. In order to operate, the user must have NCBI's blastall executable in their path. The output is a blastable fasta file named EST which contains a list of the sequences and a cluster identifier EST00001 -> EST99999. A number of other files and directories are also created. =head1 HOW IT WORKS The program works by performing a BLAST for each sequence against the growing cluster database. The BLAST output is then examined for High Scoring Pairs (HSPs) from the BLAST output which demonstrate near identical regions of sequence similarity (>=95% sequence identity over >30 bases - these figures are modifiable in the source code to increase/decrease stringency) between the traget sequence and the growing cluster database. These matches are classified as type I matches. The next stage of the process goes through the list of type I matches to identify any problems associated with the match. This is achieved by parsing the beginning and end positions of the query and subject sequences from the blast output. If these positions overlap beyond the HSP by more than 30 bases (i.e. the HSP does not extend through the full overlap of the sequences), a further check is performed to ensure that this is not due to the presence of poor quality sequence (determined by the number of bases assigned `N' in the overlap regions). Type I matches which do not have high quality overlaps of more than 30 bases beyond the HSP are designated as type II matches. Other type I matches which possess high quality overlaps of greater than 30 bases which are not part of a HSP are designated as type III matches. The next stage of cluster assignment then involves checking through the lists of type II and type III matches to ensure that no conflicts arise. Given a cluster in which some members are type II matches, if there are other members of the same cluster which have been designated as type III matches, then this indicates that the query sequence matches some but not all members of a cluster and is therefore assigned a new cluster number. The inclusion of this feature in the algorithm prevents the rapid expansion of chimeric clusters and can result in a splitting of related sequences into many different (related) clusters. However, when such events occur, the program catalogues the clusters involved, identifying them as `similar to' the type III match for subsequent post-process analysis (typically performed by manual curation). Another complication occurs when two or more type II matches arise from different clusters. Firstly the blast output is reanalysed to determine whether the HSPs of the matches occur in overlapping regions. If they do not, the query effectively links the clusters and they are merged into the cluster with the lowest index - a separate note is recorded to indicate that such a merge operation has occurred. If they do overlap, this may indicate that they are either alternatively spliced variants of one gene or closely related members of a gene family, and the query sequence is assigned the cluster number of the type II match with which it had the highest blast score, providing that said cluster did not contain a type III match, and an annotation added to indicate that these clusters may be members of a `Supercluster'. Once the query sequence has been assigned to a cluster, it is added to the growing cluster database which is then reformatted to allow the next search. =head2 Created files/Directories =over 4 =item * supercluster This file contains a list of clusters, members of which were found to be similar to each other, however, other members of the cluster where found to be bad matches =item * merge This file contains a list of clusters which have been merged to form a single cluster. This occurs when a sequence matches well with two clusters which do not overlap. =item * master A file containing a comma delimited list of the sequences. =item * sequences_done A directory listing all sequences which have been processed by CLOBB. Two sequences are created for each original, the original sequence is renamed as .old, the new sequence file has the assigned cluster id annotated to its header. =item * OUT A directory containing a logfile detailing the CLOBB process for each sequence. =back =head1 AUTHOR John Parkinson (john.parkinson@ed.ac.uk) =head1 REFERENCE Parkinson J, Guiliano DB, Blaxter M. Making sense of EST sequences by CLOBBing them. BMC Bioinformatics. 2002 Oct 25;3(1):31. =head1 COPYRIGHT This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =head1 SEE ALSO perl(1). =cut