BioPHP - Random sequences (original)
Original code submitted by joseba. The code below is covered by the GNU GPL v2 license.
Description
Last change: 2006/10/10 15:20 | Recent Changes | A tool to generate random DNA or protein sequences. It can shuffle a sequence, generate a random sequence using the residue frequencies found in a sample sequence, or generate random sequences from user-supplied nucleotide or amino acid frequencies.
Code
Last change: 2006/10/10 15:20 | Download original | Recent Changes | Original code and<?php
// author Joseba Bikandi
// license GNU GPL v2
// source code available at biophp.org
// the code at the top processes the submitted input
// the form is located in the middle of the file
// the functions used in this script are located at the bottom
//############################################################################
//################# let's manipulate the sequence #################
//############################################################################
// ---------------------------------------------------------------------------
// Request handling.
//
// Reads the submitted form (if any), builds the random sequence and leaves the
// following variables for the HTML form further down in this file:
//   $procedure               selected method: "fromseq", "fromACGT" or "fromAA"
//   $seq, $result            input / output sequence, wrapped at 70 characters
//   $seqlen                  number of residues in $seq (before wrapping)
//   $length1..$length3       requested output length for each method (0 = none)
//   $a, $c, $g, $t           nucleotide counts/frequencies echoed back in the form
//   $A ... $Y                amino acid counts/frequencies echoed back in the form
//
// All request values are normalised before use: text fields must be plain
// strings and numeric fields must be finite, non-negative numbers. Anything
// else is treated as empty / zero instead of raising arithmetic errors.
// ---------------------------------------------------------------------------

// Returns the POST field $key as a string, or "" when the field is missing or
// is not a plain string (for example an array submitted as key[]=value).
function randomseq_post_text($key){
    return (isset($_POST[$key]) && is_string($_POST[$key])) ? $_POST[$key] : "";
}

// Returns the POST field $key as a finite, non-negative float.
// Missing, blank, non-numeric, negative or infinite input yields 0.0.
function randomseq_post_amount($key){
    $raw = trim(randomseq_post_text($key));
    if (!is_numeric($raw)) {
        return 0.0;
    }
    $amount = (float) $raw;
    return ($amount > 0 && is_finite($amount)) ? $amount : 0.0;
}

// Counts the occurrences in $sequence of every letter of $alphabet.
// Returns a plain list of counts in the same order as the letters of $alphabet.
function randomseq_count_residues($sequence, $alphabet){
    $counts = array();
    foreach (str_split($alphabet) as $residue) {
        $counts[] = substr_count($sequence, $residue);
    }
    return $counts;
}

// Converts a list of frequencies into a list of absolute residue counts.
// When $length is positive, every frequency is rescaled to its share of
// $length (each share is rounded independently, so the total may differ from
// $length by a few residues). Otherwise each frequency is simply rounded.
// If all frequencies are zero there is nothing to scale, so all counts are 0
// rather than the result of a division by zero.
function randomseq_scale_counts($frequencies, $length){
    $total = array_sum($frequencies);
    $counts = array();
    foreach ($frequencies as $frequency) {
        if ($length > 0) {
            $counts[] = ($total > 0) ? round($frequency * $length / $total) : 0;
        } else {
            $counts[] = round($frequency);
        }
    }
    return $counts;
}

// Letters counted for DNA and for protein sequences, in the argument order
// expected by randon_DNA() and randon_prot() respectively.
$dnaAlphabet = "ACGT";
$proteinAlphabet = "ACDEFGHIKLMNPQRSTVWY";

// Defaults for every variable the form reads, so that no code path leaves
// one of them undefined.
$seq = "";
$result = "";
$seqlen = 0;
$length1 = $length2 = $length3 = 0;
$a = $c = $g = $t = 0;
$A = $C = $D = $E = $F = $G = $H = $I = $K = $L = 0;
$M = $N = $P = $Q = $R = $S = $T = $V = $W = $Y = 0;

if ($_POST) {
    // get procedure
    $procedure = randomseq_post_text("procedure");

    if ($procedure == "fromseq") {
        // Keep only the accepted one-letter codes (works for DNA and protein).
        // Every other character, including digits and whitespace, is dropped.
        $seq = preg_replace("/[^ABCDEFGHIKLMNPQRSTVWXY]/", "", strtoupper(randomseq_post_text("seq")));
        // get length of output sequence
        $length1 = randomseq_post_amount("length1");
        if ($length1) {
            // The sequence is considered DNA if A+C+G+T make up more than 70% of it
            $seqACGT = preg_replace("/[^ACGT]/", "", $seq);
            if (strlen($seqACGT) > strlen($seq) * 0.7) {
                // DNA: count each nucleotide and rescale the counts to $length1
                $frequencies = randomseq_count_residues($seq, $dnaAlphabet);
                list($a, $c, $g, $t) = $frequencies;
                $result = call_user_func_array("randon_DNA", randomseq_scale_counts($frequencies, $length1));
            } else {
                // Protein: count each amino acid and rescale the counts to $length1.
                // An empty sequence (or one made only of B/X) has no countable
                // residues and therefore produces an empty result.
                $frequencies = randomseq_count_residues($seq, $proteinAlphabet);
                list($A, $C, $D, $E, $F, $G, $H, $I, $K, $L, $M, $N, $P, $Q, $R, $S, $T, $V, $W, $Y) = $frequencies;
                $result = call_user_func_array("randon_prot", randomseq_scale_counts($frequencies, $length1));
            }
        } else {
            // just shuffle the sequence when length is not provided
            $result = str_shuffle($seq);
        }
    } elseif ($procedure == "fromACGT") {
        // get the frequencies for each nucleotide
        $a = randomseq_post_amount("DnaA");
        $c = randomseq_post_amount("DnaC");
        $g = randomseq_post_amount("DnaG");
        $t = randomseq_post_amount("DnaT");
        // get length of output sequence (0 means: use the frequencies as counts)
        $length2 = randomseq_post_amount("length2");
        // get randomized sequence
        $result = call_user_func_array("randon_DNA", randomseq_scale_counts(array($a, $c, $g, $t), $length2));
        $seq = $result;
    } elseif ($procedure == "fromAA") {
        // get the frequencies for each amino acid (form fields are named A, C, D, ...)
        $frequencies = array();
        foreach (str_split($proteinAlphabet) as $residue) {
            $frequencies[] = randomseq_post_amount($residue);
        }
        list($A, $C, $D, $E, $F, $G, $H, $I, $K, $L, $M, $N, $P, $Q, $R, $S, $T, $V, $W, $Y) = $frequencies;
        // get length of output sequence (0 means: use the frequencies as counts)
        $length3 = randomseq_post_amount("length3");
        // get randomized sequence
        $result = call_user_func_array("randon_prot", randomseq_scale_counts($frequencies, $length3));
        $seq = $result;
    }

    // Remember the real length before line breaks are inserted
    $seqlen = strlen($seq);
    // 70 characters per line before output; empty strings are left untouched
    // so that no stray line break is shown for empty input
    if ($seq != "") {
        $seq = chunk_split($seq, 70);
    }
    if ($result != "") {
        $result = chunk_split($result, 70);
    }
} else {
    // First visit: nothing submitted yet, preselect the first method
    $procedure = "fromseq";
}
//############################################################################
//################# we have already manipulated the sequence #################
//############################# below is the form ############################
//############################################################################
?>
<?php
// ---------------------------------------------------------------------------
// HTML form and result output.
//
// Every value that originates from the request is HTML-escaped and written
// into a quoted attribute, and every variable is checked before use so the
// page renders without notices even when a variable was never set.
// ---------------------------------------------------------------------------

// Escapes a value for use in HTML text or inside a quoted attribute.
// Non-scalar values (e.g. arrays) are rendered as an empty string.
function randomseq_html($value){
    return htmlspecialchars(is_scalar($value) ? (string) $value : "", ENT_QUOTES);
}

// Builds the end of an amino acid <input> tag: the quoted value plus the
// closing ">". The submitted $value is used when it is non-empty, otherwise
// $default is used.
// NOTE(review): the " ‰" suffix is only emitted next to the default value,
// exactly as this page did before; it looks like it was meant to be a unit
// label for every field - confirm the intended character and placement.
function randomseq_aa_value($value, $default){
    if ($value) {
        return '"'.randomseq_html($value).'">';
    }
    return '"'.$default.'"> ‰';
}

// Local copies of the values prepared by the request handling code above
$viewProcedure = isset($procedure) ? $procedure : "";
$viewSeq = (isset($seq) && is_string($seq)) ? $seq : "";
$viewResult = (isset($result) && is_string($result)) ? $result : "";
// Sequence length without the line breaks added for display
$viewSeqLen = isset($seqlen) ? (int) $seqlen : strlen(preg_replace("/\s+/", "", $viewSeq));
?>
<html>
<head>
<title>Random sequences</title>
</head>
<body bgcolor=FFFFFF>
<center>
<form method='post' action="<?php print randomseq_html($_SERVER["PHP_SELF"]); ?>">
<H2>Random sequences</H2>
<table cellpadding=5 cellspacing=0 width=650 border=0>
<tr><td align=center bgcolor=FFDDDD>Select<br>method
</td><td align=center bgcolor=FFDDDD>
Parameters
</td></tr>
<tr><td valign=top bgcolor=DDFFFF>
<input type=radio name=procedure value=fromseq<?php if ($viewProcedure=="fromseq"){print " CHECKED";} ?>>
</td><td bgcolor=DDFFFF>
<B>Row sequence to be randomized <?php if($viewSeq){print "($viewSeqLen bp)";} ?>:</B>
<br><font size=-2>DNA or protein nature of the sequence will be automatically detected. Non coding characters are removed by default.</font>
<br><textarea name='seq' rows='4' cols='80'><?php print randomseq_html($viewSeq); ?></textarea>
<BR>Generate a random sequence of length <input type=text name=length1 size=5<?php if (!empty($length1)){print " value=\"".randomseq_html($length1)."\"";} ?>> with composition above
<br><font size=-2>If length is blank, the characters above will be shuffled.</font>
</td></tr>
<tr><td valign=top bgcolor=66FFFF>
<input type=radio name=procedure value=fromACGT<?php if ($viewProcedure=="fromACGT"){print " CHECKED";} ?>>
</td><td bgcolor=66FFFF>
Generate random DNA sequence of length <input type=text name=length2 size=5<?php if (!empty($length2)){print " value=\"".randomseq_html($length2)."\"";} ?>> and composition below:
<br>A: <input type=text name=DnaA size=5 value="<?php if (!empty($a)){print randomseq_html($a);}else{print "29.5";} ?>">
C: <input type=text name=DnaC size=5 value="<?php if (!empty($c)){print randomseq_html($c);}else{print "20.5";} ?>">
G: <input type=text name=DnaG size=5 value="<?php if (!empty($g)){print randomseq_html($g);}else{print "20.5";} ?>">
T: <input type=text name=DnaT size=5 value="<?php if (!empty($t)){print randomseq_html($t);}else{print "29.5";} ?>">
</td></tr>
<tr><td valign=top bgcolor=AAFFFF>
<input type=radio name=procedure value=fromAA<?php if ($viewProcedure=="fromAA"){print " CHECKED";} ?>>
</td><td bgcolor=AAFFFF>
Generate random protein sequence of length <input type=text name=length3 size=5<?php if (!empty($length3)){print " value=\"".randomseq_html($length3)."\"";} ?>> and composition below:
<br>A: <input type=text name=A size=5 value=<?php print randomseq_aa_value(isset($A) ? $A : "", "7.174"); ?>
C: <input type=text name=C size=5 value=<?php print randomseq_aa_value(isset($C) ? $C : "", "2.395"); ?>
D: <input type=text name=D size=5 value=<?php print randomseq_aa_value(isset($D) ? $D : "", "4.872"); ?>
E: <input type=text name=E size=5 value=<?php print randomseq_aa_value(isset($E) ? $E : "", "6.662"); ?>
F: <input type=text name=F size=5 value=<?php print randomseq_aa_value(isset($F) ? $F : "", "3.624"); ?>
<br>G: <input type=text name=G size=5 value=<?php print randomseq_aa_value(isset($G) ? $G : "", "7.532"); ?>
H: <input type=text name=H size=5 value=<?php print randomseq_aa_value(isset($H) ? $H : "", "2.366"); ?>
I: <input type=text name=I size=5 value=<?php print randomseq_aa_value(isset($I) ? $I : "", "4.374"); ?>
K: <input type=text name=K size=5 value=<?php print randomseq_aa_value(isset($K) ? $K : "", "5.635"); ?>
L: <input type=text name=L size=5 value=<?php print randomseq_aa_value(isset($L) ? $L : "", "9.412"); ?>
<br>M: <input type=text name=M size=5 value=<?php print randomseq_aa_value(isset($M) ? $M : "", "2.196"); ?>
N: <input type=text name=N size=5 value=<?php print randomseq_aa_value(isset($N) ? $N : "", "3.789"); ?>
P: <input type=text name=P size=5 value=<?php print randomseq_aa_value(isset($P) ? $P : "", "6.294"); ?>
Q: <input type=text name=Q size=5 value=<?php print randomseq_aa_value(isset($Q) ? $Q : "", "4.509"); ?>
R: <input type=text name=R size=5 value=<?php print randomseq_aa_value(isset($R) ? $R : "", "5.607"); ?>
<br>S: <input type=text name=S size=5 value=<?php print randomseq_aa_value(isset($S) ? $S : "", "7.527"); ?>
T: <input type=text name=T size=5 value=<?php print randomseq_aa_value(isset($T) ? $T : "", "5.685"); ?>
V: <input type=text name=V size=5 value=<?php print randomseq_aa_value(isset($V) ? $V : "", "6.026"); ?>
W: <input type=text name=W size=5 value=<?php print randomseq_aa_value(isset($W) ? $W : "", "1.480"); ?>
Y: <input type=text name=Y size=5 value=<?php print randomseq_aa_value(isset($Y) ? $Y : "", "2.840"); ?>
</td></tr>
<tr><td>
</td><td>
<input type='submit' value='Submit'> <a href="<?php print randomseq_html($_SERVER["PHP_SELF"]); ?>">Start</a>
</td></tr>
</table>
</form>
<table cellpadding=5 width=650 border=0>
<tr><td align=center>
<pre>
<?php
// Show the generated sequence only when there is one
if($viewResult!=""){
print "<textarea rows=10 cols=80>".randomseq_html($viewResult)."</textarea>";
}
?></pre>
</td></tr>
<tr><td>
<b>NOTES</b>:
<br><a href=http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=retrieve&db=pubmed&list_uids=7957164&dopt=abstract>NC-UIBMB</a>
codes are used as a reference.
<br>Default values are based in human genome.
<p>Source code is available at
<a href=http://www.biophp.org/minitools/random_seqs>BioPHP.org</a>
</td></tr>
</table>
</center>
</body>
</html>
<?php
//############################################################################
//################# Functions used in this script ############################
//############################################################################
// Generate a random DNA sequence
// $a, $c, $g and $t are the number of nucleotides A, C, G and T to include.
// Each count is truncated to a whole number; negative or non-numeric counts
// are treated as 0, so the function always returns a string (possibly empty).
// Returns a string made of exactly those nucleotides in random order
// (shuffled with str_shuffle).
// Usage example:
// $seq = randon_DNA(200,200,200,200);
function randon_DNA($a,$c,$g,$t){
    $pool = "";
    foreach (array("A" => $a, "C" => $c, "G" => $g, "T" => $t) as $nucleotide => $count) {
        // str_repeat() rejects negative repeat counts, so normalise first
        $copies = is_numeric($count) ? (int) (float) $count : 0;
        if ($copies > 0) {
            $pool .= str_repeat($nucleotide, $copies);
        }
    }
    return str_shuffle($pool);
}
// Generate a random protein sequence
// $A, $C, $D ... $Y are the number of residues to include for the amino acid
// with that one-letter code (A, C, D, E, F, G, H, I, K, L, M, N, P, Q, R, S,
// T, V, W and Y, in that order).
// Each count is truncated to a whole number; negative or non-numeric counts
// are treated as 0, so the function always returns a string (possibly empty).
// Returns a string made of exactly those residues in random order
// (shuffled with str_shuffle).
// Usage example:
// $seq = randon_prot(100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100)
function randon_prot($A,$C,$D,$E,$F,$G,$H,$I,$K,$L,$M,$N,$P,$Q,$R,$S,$T,$V,$W,$Y){
    $requested = array(
        "A" => $A, "C" => $C, "D" => $D, "E" => $E, "F" => $F,
        "G" => $G, "H" => $H, "I" => $I, "K" => $K, "L" => $L,
        "M" => $M, "N" => $N, "P" => $P, "Q" => $Q, "R" => $R,
        "S" => $S, "T" => $T, "V" => $V, "W" => $W, "Y" => $Y
    );
    $pool = "";
    foreach ($requested as $residue => $count) {
        // str_repeat() rejects negative repeat counts, so normalise first
        $copies = is_numeric($count) ? (int) (float) $count : 0;
        if ($copies > 0) {
            $pool .= str_repeat($residue, $copies);
        }
    }
    return str_shuffle($pool);
}
//############################################################################
//############################## End of functions ############################
//############################################################################
?>