BioTorrents.de’s version of Gazelle
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

seqhash.class.php 12KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360
  1. <?php
  2. #declare(strict_types = 1);
  3. # PHP Notice: Trying to access array offset on value of type bool in /var/www/html/dev.biotorrents.de/classes/seqhash.class.php on line 233
  4. /**
  5. * Seqhash
  6. *
  7. * Implements Keoni's Seqhash algorithm for DNA/RNA/protein sequences,
  8. * e.g., v1_DCD_4b0616d1b3fc632e42d78521deb38b44fba95cca9fde159e01cd567fa996ceb9
  9. *
  10. * > The first element is the version tag (v1 for version 1).
  11. * > If there is ever a Seqhash version 2, this tag will differentiate seqhashes.
  12. *
  13. * > The second element is the metadata tag, which has 3 letters.
  14. * > The first letter codes for the sequenceType (D for DNA, R for RNA, and P for Protein).
  15. * > The second letter codes for whether or not the sequence is circular (C for Circular, L for Linear).
  16. * > The final letter codes for whether or not the sequence is double stranded (D for Double stranded, S for Single stranded).
  17. *
  18. * > The final element is the blake3 hash of the sequence (once rotated and complemented).
  19. *
  20. * Requires the php-blake3 extension from:
  21. * https://github.com/cypherbits/php-blake3
  22. *
  23. * @see https://blog.libredna.org/post/seqhash/
  24. * @see https://github.com/TimothyStiles/poly/blob/prime/hash.go
  25. * @see https://github.com/TimothyStiles/poly/blob/prime/hash_test.go
  26. */
  27. class Seqhash
  28. {
  29. /**
  30. * boothLeastRotation
  31. *
  32. * Gets the least rotation of a circular string.
  33. * @see https://en.wikipedia.org/wiki/Lexicographically_minimal_string_rotation
  34. */
  35. public function boothLeastRotation(string $Sequence)
  36. {
  37. # First concatenate the sequence to itself to avoid modular arithmatic
  38. # todo: Use buffers just for speed? May get annoying with larger sequences
  39. $Sequence = $Sequence . $Sequence;
  40. $LeastRotationIndex = 0;
  41. # Initializing failure slice
  42. $FailureSlice = array_fill(0, strlen($Sequence), -1);
  43. /*
  44. for ($i = 0; $i <= strlen($Sequence); $i++) {
  45. $FailureSlice[$i] = -1;
  46. }
  47. */
  48. # Iterate through each character in the doubled-over sequence
  49. for ($CharacterIndex = 1; $CharacterIndex < strlen($Sequence); $CharacterIndex++) {
  50. # Get character
  51. $Character = $Sequence[$CharacterIndex];
  52. # Get failure
  53. $Failure = $FailureSlice[$CharacterIndex - $LeastRotationIndex - 1];
  54. # While failure !== -1 and character !== the character found at the least rotation + failure + 1
  55. while ($Failure !== -1 && $Character !== $Sequence[$LeastRotationIndex + $Failure + 1]) {
  56. # If character is lexically less than whatever is at $LeastRotationIndex, update $LeastRotationIndex
  57. if ($Character < $Sequence[$LeastRotationIndex + $Failure + 1]) {
  58. $LeastRotationIndex = $CharacterIndex - $Failure - 1;
  59. }
  60. # Update failure using previous failure as index?
  61. $Failure = $FailureSlice[$Failure];
  62. } # while
  63. # If character does not equal whatever character is at leastRotationIndex plus failure
  64. if ($Character !== $Sequence[$LeastRotationIndex + $Failure + 1]) {
  65. # If character is lexically less then what is rotated, $LeastRotatationIndex gets value of $CharacterIndex
  66. if ($Character < $Sequence[$LeastRotationIndex]) {
  67. $LeastRotationIndex = $CharacterIndex;
  68. }
  69. # Assign -1 to whatever is at the index of difference between character and rotation indices
  70. $FailureSlice[$CharacterIndex - $LeastRotationIndex] = -1;
  71. # If character does equal whatever character is at $LeastRotationIndex + $Failure
  72. } else {
  73. # Assign $Failure + 1 at the index of difference between character and rotation indices
  74. $FailureSlice[$CharacterIndex - $LeastRotationIndex] = $Failure + 1;
  75. }
  76. } # for
  77. return $LeastRotationIndex;
  78. }
  79. /**
  80. * rotateSequence
  81. *
  82. * Rotates circular sequences to a deterministic point.
  83. */
  84. public function rotateSequence(string $Sequence)
  85. {
  86. $RotationIndex = $this->boothLeastRotation($Sequence);
  87. # Writing the same sequence twice
  88. # PHP has no strings.Builder.WriteString
  89. $ConcatenatedSequence = $Sequence . $Sequence;
  90. # https://stackoverflow.com/a/2423867
  91. $Length = $RotationIndex % strlen($Sequence);
  92. return substr($Sequence, $Length) . substr($Sequence, 0, $Length);
  93. }
  94. /**
  95. * reverseComplement
  96. *
  97. * Takes the reverse complement of a sequence.
  98. * Doesn't validate the sequence alphabet first.
  99. */
  100. public function reverseComplement(string $Sequence)
  101. {
  102. # Normalize the sequence
  103. $Sequence = strtoupper($Sequence);
  104. /**
  105. * Provides 1:1 mapping between bases and their complements.
  106. * Kind of ghetto, but lowercase replaces help stop extra flips.
  107. * @see https://github.com/TimothyStiles/poly/blob/prime/sequence.go
  108. */
  109. $RuneMap = [
  110. 'A' => 't',
  111. 'B' => 'v',
  112. 'C' => 'g',
  113. 'D' => 'h',
  114. 'G' => 'c',
  115. 'H' => 'd',
  116. 'K' => 'm',
  117. 'M' => 'k',
  118. 'N' => 'n',
  119. 'R' => 'y',
  120. 'S' => 's',
  121. 'T' => 'a',
  122. 'U' => 'a',
  123. 'V' => 'b',
  124. 'W' => 'w',
  125. 'Y' => 'r',
  126. ];
  127. return $ComplementString = strtoupper(
  128. str_replace(
  129. array_keys($RuneMap),
  130. array_values($RuneMap),
  131. $Sequence
  132. )
  133. );
  134. }
  135. /**
  136. * hash
  137. *
  138. * Create a Seqhash from a string.
  139. */
  140. public function hash(
  141. string $Sequence,
  142. string $SequenceType,
  143. bool $Circular = false,
  144. bool $DoubleStranded = true
  145. ) {
  146. # Check for Blake3 support
  147. if (!extension_loaded('blake3')) {
  148. throw new Exception('Please install and enable the php-blake3 extension.');
  149. }
  150. # By definition, Seqhashes are of uppercase sequences
  151. $Sequence = strtoupper($Sequence);
  152. # If RNA, convert to a DNA sequence
  153. # The hash itself between a DNA and RNA sequence will not be different,
  154. # but their Seqhash will have a different metadata string (R vs. D)
  155. if ($SequenceType === 'RNA') {
  156. $Sequence = str_replace('T', 'U', $Sequence);
  157. }
  158. # Run checks on the input
  159. if (!in_array($SequenceType, ['DNA', 'RNA', 'PROTEIN'])) {
  160. throw new Exception("SequenceType must be one of [DNA, RNA, PROTEIN]. Got $SequenceType.");
  161. }
  162. # Check the alphabet used
  163. $SequenceRegex = '/[ATUGCYRSWKMBDHVNZ]/';
  164. $ProteinRegex = '/[ACDEFGHIKLMNPQRSTVWYUO*BXZ]/';
  165. # todo: Refactor this to detect $SequenceType from alphabet
  166. if ($SequenceType === 'DNA' || $SequenceType === 'RNA') {
  167. foreach (str_split($Sequence) as $Letter) {
  168. if (!preg_match($SequenceRegex, $Letter)) {
  169. throw new Exception("Only letters ATUGCYRSWKMBDHVNZ are allowed for DNA/RNA. Got $Letter.");
  170. }
  171. }
  172. }
  173. /**
  174. * Selenocysteine (Sec; U) and pyrrolysine (Pyl; O) are added
  175. * in accordance with https://www.uniprot.org/help/sequences
  176. *
  177. * The release notes https://web.expasy.org/docs/relnotes/relstat.html
  178. * also state there are Asx (B), Glx (Z), and Xaa (X) amino acids,
  179. * so these are added in as well.
  180. */
  181. else {
  182. foreach (str_split($Sequence) as $Letter) {
  183. if (!preg_match($ProteinRegex, $Letter)) {
  184. throw new Exception("Only letters ACDEFGHIKLMNPQRSTVWYUO*BXZ are allowed for proteins. Got $Letter.");
  185. }
  186. }
  187. }
  188. # There is no check for circular proteins since proteins can be circular
  189. if ($SequenceType === 'PROTEIN' && $DoubleStranded) {
  190. throw new Exception("Proteins can't be double stranded.");
  191. }
  192. # Gets deterministic sequence based off of metadata + sequence
  193. #switch ($Circular && $DoubleStranded) {
  194. switch ([$Circular, $DoubleStranded]) {
  195. #case (true && true):
  196. case [true, true]:
  197. $PotentialSequences = [
  198. $this->rotateSequence($Sequence),
  199. $this->rotateSequence($this->reverseComplement($Sequence)),
  200. ];
  201. $DeterministicSequence = sort($PotentialSequences)[0];
  202. break;
  203. #case (true && false):
  204. case [true, false]:
  205. $DeterministicSequence = $this->rotateSequence($Sequence);
  206. break;
  207. #case (false && true):
  208. case [false, true]:
  209. $PotentialSequences = [
  210. $Sequence,
  211. $this->reverseComplement($Sequence),
  212. ];
  213. $DeterministicSequence = sort($PotentialSequences)[0];
  214. break;
  215. #case (false && false):
  216. case [false, false]:
  217. $DeterministicSequence = $Sequence;
  218. break;
  219. default:
  220. break;
  221. }
  222. /**
  223. * Build 3 letter metadata
  224. */
  225. # Get first letter: D for DNA, R for RNA, and P for Protein
  226. switch ($SequenceType) {
  227. case 'DNA':
  228. $SequenceTypeLetter = 'D';
  229. break;
  230. case 'RNA':
  231. $SequenceTypeLetter = 'R';
  232. break;
  233. case 'PROTEIN':
  234. $SequenceTypeLetter = 'P';
  235. break;
  236. default:
  237. break;
  238. }
  239. # Get 2nd letter: C for circular, L for Linear
  240. if ($Circular) {
  241. $CircularLetter = 'C';
  242. } else {
  243. $CircularLetter = 'L';
  244. }
  245. # Get 3rd letter: D for Double stranded, S for Single stranded
  246. if ($DoubleStranded) {
  247. $DoubleStrandedLetter = 'D';
  248. } else {
  249. $DoubleStrandedLetter = 'S';
  250. }
  251. # php-blake3 returns hex by default,
  252. # binary if $rawOutput = true
  253. return
  254. 'v1'
  255. . '_'
  256. . $SequenceTypeLetter
  257. . $CircularLetter
  258. . $DoubleStrandedLetter
  259. . '_'
  260. . blake3($DeterministicSequence);
  261. }
  262. /**
  263. * validate
  264. *
  265. * Validates a Seqhash's metadata.
  266. * Doesn't check the Blake3 hash itself,
  267. * because php-blake3 has no such feature.
  268. */
  269. public function validate(string $Seqhash)
  270. {
  271. $Parts = explode('_', $Seqhash);
  272. # Check version info
  273. if ($Parts[0] !== 'v1') {
  274. throw new Exception("Invalid version info. Got $Parts[0].");
  275. }
  276. # Check 3 letter metadata
  277. $Meta = str_split($Parts[1]);
  278. if (!in_array($Meta[0], ['D', 'R', 'P'])
  279. || !in_array($Meta[1], ['C', 'L'])
  280. || !in_array($Meta[2], ['D', 'S'])) {
  281. throw new Exception("Invalid metadata. Got $Parts[1].");
  282. }
  283. # Check Blake3 hex and hash length
  284. if (!ctype_xdigit($Parts[2]) || strlen($Parts[2] !== 64)) {
  285. throw new Exception("Invalid Blake3 hash. Expected a 64-character hex string.");
  286. }
  287. return true;
  288. }
  289. /**
  290. * gcContent
  291. *
  292. * Bonus feature!
  293. * Calculate GC content of a DNA sequence.
  294. * Shamelessly ripped from kennypavan/BioPHP.
  295. *
  296. * @see https://github.com/kennypavan/BioPHP/blob/master/BioPHP.php
  297. */
  298. public function gcContent(string $Sequence, int $Precision = 2)
  299. {
  300. $Sequence = strtoupper($Sequence);
  301. $G = substr_count($Sequence, 'G');
  302. $C = substr_count($Sequence, 'C');
  303. return number_format((($G + $C) / strlen($Sequence)) * 100, $Precision);
  304. }
  305. }