# HG changeset patch # User ashvark # Date 1405684559 14400 # Node ID a294fbfcb1db2008d7c4dbefd941810b2f8f88f7 # Parent a9636dc1e99aee939cad5b7c5f46116197f3b1c9 Uploaded BWA diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/COPYING --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/COPYING Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/> + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. 
+ + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. 
+ + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. 
If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. 
For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. 
Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. 
This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. 
+ + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. 
+ + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. 
+ + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. 
If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. 
If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. 
+ + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. 
For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. 
+ + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. 
The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + <program> Copyright (C) <year> <name of author> + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +<http://www.gnu.org/licenses/>. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +<http://www.gnu.org/philosophy/why-not-lgpl.html>. 
diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/ChangeLog --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/ChangeLog Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,3864 @@ +------------------------------------------------------------------------ +r1605 | lh3 | 2010-12-29 20:20:20 -0500 (Wed, 29 Dec 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/main.c + + * bwa-0.5.9rc1-2 (r1605) + * fixed a typo/bug in bwasw + +------------------------------------------------------------------------ +r1587 | lh3 | 2010-12-21 18:48:30 -0500 (Tue, 21 Dec 2010) | 2 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + +a typo in the manual + +------------------------------------------------------------------------ +r1586 | lh3 | 2010-12-21 18:47:48 -0500 (Tue, 21 Dec 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/utils.c + M /branches/prog/bwa/utils.h + + * bwa-0.5.9rc1-1 (r1586) + * a few patches by John + +------------------------------------------------------------------------ +r1562 | lh3 | 2010-12-10 01:02:06 -0500 (Fri, 10 Dec 2010) | 2 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + +documentation on specifying @RG + +------------------------------------------------------------------------ +r1561 | lh3 | 2010-12-10 00:45:40 -0500 (Fri, 10 Dec 2010) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/main.c + +Release bwa-0.5.9rc1 (r1561) + +------------------------------------------------------------------------ +r1560 | lh3 | 2010-12-10 00:29:08 -0500 (Fri, 10 Dec 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/bwaseqio.c + M /branches/prog/bwa/main.c + + * fixed a small memory leak caused by the BAM 
reader + * fixed a memory violation, also in the BAM reader + +------------------------------------------------------------------------ +r1559 | lh3 | 2010-12-10 00:10:48 -0500 (Fri, 10 Dec 2010) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/Makefile + +change Makefile gcc options + +------------------------------------------------------------------------ +r1558 | lh3 | 2010-12-10 00:09:22 -0500 (Fri, 10 Dec 2010) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + * bwa-0.5.8-6 (r1557) + * added a little more comments to BWA-SW + * randomly choosing a mapping if there are more than one + +------------------------------------------------------------------------ +r1557 | lh3 | 2010-12-09 21:58:00 -0500 (Thu, 09 Dec 2010) | 2 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwtsw2_aux.c + +sometimes unmapped reads may not be printed... + +------------------------------------------------------------------------ +r1556 | lh3 | 2010-12-09 21:50:26 -0500 (Thu, 09 Dec 2010) | 2 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwtsw2_aux.c + +print unmapped reads + +------------------------------------------------------------------------ +r1555 | lh3 | 2010-12-09 21:17:20 -0500 (Thu, 09 Dec 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.5.8-5 (r1555) + * BAM input documentation + +------------------------------------------------------------------------ +r1544 | lh3 | 2010-11-23 11:01:41 -0500 (Tue, 23 Nov 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + + * bwa-0.5.8-4 (r1544) + * supporting adding RG tags and RG lines + +------------------------------------------------------------------------ +r1543 | lh3 | 
2010-11-23 00:16:40 -0500 (Tue, 23 Nov 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.5.8-3 (r1543) + * fixed a memory leak + +------------------------------------------------------------------------ +r1542 | lh3 | 2010-11-22 23:50:56 -0500 (Mon, 22 Nov 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + + * bwa-0.5.8-2 (r1542) + * fixed a long existing bug in random placement of reads + +------------------------------------------------------------------------ +r1541 | lh3 | 2010-11-22 23:27:29 -0500 (Mon, 22 Nov 2010) | 2 lines +Changed paths: + M /branches/prog/bwa/Makefile + A /branches/prog/bwa/bamlite.c + A /branches/prog/bwa/bamlite.h + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwaseqio.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + +preliminary BAM input support + +------------------------------------------------------------------------ +r1537 | lh3 | 2010-10-16 23:46:20 -0400 (Sat, 16 Oct 2010) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/bwa.1 + +change version number and ChangeLog + +------------------------------------------------------------------------ +r1536 | lh3 | 2010-10-16 23:35:10 -0400 (Sat, 16 Oct 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/stdaln.c + + * fixed a bug in the scoring matrix + * release bwa-0.5.8c (r1536) + +------------------------------------------------------------------------ +r1451 | lh3 | 2010-06-15 09:43:52 -0400 (Tue, 15 Jun 2010) | 2 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + +version change + +------------------------------------------------------------------------ +r1450 | lh3 | 2010-06-15 09:42:21 -0400 (Tue, 15 Jun 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/main.c + M /branches/prog/bwa/stdaln.c + + 
* bwa-0.5.8b (r1450) + * fixed a bug in scoring matrix + +------------------------------------------------------------------------ +r1445 | lh3 | 2010-06-11 08:58:33 -0400 (Fri, 11 Jun 2010) | 2 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwape.c + +fixed a serious bug + +------------------------------------------------------------------------ +r1442 | lh3 | 2010-06-08 10:22:14 -0400 (Tue, 08 Jun 2010) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/main.c + +Release bwa-0.5.8 (r1442) + +------------------------------------------------------------------------ +r1440 | lh3 | 2010-05-19 13:43:50 -0400 (Wed, 19 May 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + * bwa-r1440 + * sorry, forget to remove a debugging line + +------------------------------------------------------------------------ +r1439 | lh3 | 2010-05-19 13:43:08 -0400 (Wed, 19 May 2010) | 4 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + * bwa-r1439 + * fixed a bug in bwasw caused by a recent modification + * throwing insane insert size when estimating isize + +------------------------------------------------------------------------ +r1425 | lh3 | 2010-04-29 15:15:23 -0400 (Thu, 29 Apr 2010) | 10 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.5.7-7 (r1425) + * fixed a minor bug in bwasw command-line parsing + * When band-width is not large enough, bwasw may find two highly + overlapping but not completely overlapping alignments. The old + version will filter out one of them, which leads to false + negatives. The current outputs both. This solution is obviously not + ideal. The ideal one would be to increase the band-width and redo the + alignment. 
+ + +------------------------------------------------------------------------ +r1399 | lh3 | 2010-04-16 09:20:49 -0400 (Fri, 16 Apr 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + + * bwa-0.5.7-6 (r1399) + * fixed a typo/bug (by Vaughn Iverson) + +------------------------------------------------------------------------ +r1329 | lh3 | 2010-03-19 23:32:46 -0400 (Fri, 19 Mar 2010) | 2 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/main.c + +small correction + +------------------------------------------------------------------------ +r1328 | lh3 | 2010-03-19 23:28:44 -0400 (Fri, 19 Mar 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/main.c + + * bwa-0.5.7-4 (r1328) + * automatically adjust ap_prior based on alignment + +------------------------------------------------------------------------ +r1327 | lh3 | 2010-03-19 23:02:40 -0400 (Fri, 19 Mar 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/stdaln.c + M /branches/prog/bwa/stdaln.h + + * bwa-0.5.7-3 (r1327) + * evaluate hits obtained from SW alignment in a more proper way. + +------------------------------------------------------------------------ +r1320 | lh3 | 2010-03-17 15:13:22 -0400 (Wed, 17 Mar 2010) | 2 lines +Changed paths: + M /branches/prog/bwa/bwape.c + +fixed a potential out-of-boundary error. Need more testing. 
+ +------------------------------------------------------------------------ +r1319 | lh3 | 2010-03-14 22:44:46 -0400 (Sun, 14 Mar 2010) | 2 lines +Changed paths: + M /branches/prog/bwa/bwape.c + +insert size is `weird' if the 3rd quatile larger than 100,000bp + +------------------------------------------------------------------------ +r1318 | lh3 | 2010-03-14 22:37:35 -0400 (Sun, 14 Mar 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + + * bwa-0.5.7-2 (r1318) + * in sampe, allow to disable insert size estimate + +------------------------------------------------------------------------ +r1317 | lh3 | 2010-03-14 22:14:14 -0400 (Sun, 14 Mar 2010) | 5 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/solid2fastq.pl + + * bwa-0.5.7-1 (r1317) + * fixed a potential bug in solid2fastq.pl + * fixed a bug in calculating mapping quality (by Rodrigo Goya) + * fixed a very rare bug (if ever occur) about pairing + +------------------------------------------------------------------------ +r1310 | lh3 | 2010-03-01 10:35:45 -0500 (Mon, 01 Mar 2010) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/main.c + +Release bwa-0.5.7 + +------------------------------------------------------------------------ +r1309 | lh3 | 2010-02-26 21:42:22 -0500 (Fri, 26 Feb 2010) | 4 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.5.6-2 (r1309) + * fixed an unfixed bug (by Carol Scott) + * fixed some tiny formatting + +------------------------------------------------------------------------ +r1305 | lh3 | 2010-02-25 13:47:58 -0500 (Thu, 25 Feb 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtsw2_main.c + 
M /branches/prog/bwa/main.c + + * bwa-0.5.6-1 (r1304) + * optionally write output to a file (by Tim Fennel) + +------------------------------------------------------------------------ +r1303 | lh3 | 2010-02-10 23:43:48 -0500 (Wed, 10 Feb 2010) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + +Release bwa-0.5.6 + +------------------------------------------------------------------------ +r1302 | lh3 | 2010-02-10 11:11:49 -0500 (Wed, 10 Feb 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + + * bwa-0.5.5-10 (r1302) + * improve max insert size estimate (method suggested by Gerton Lunter) + +------------------------------------------------------------------------ +r1301 | lh3 | 2010-02-09 16:15:28 -0500 (Tue, 09 Feb 2010) | 5 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + + * bwa-0.5.5-9 (r1301) + * improve mapping quality calculation for abnomalous pairs + * fixed a bug in multiple hits + * SOLiD multiple hits should work now + +------------------------------------------------------------------------ +r1300 | lh3 | 2010-02-09 12:50:02 -0500 (Tue, 09 Feb 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/main.c + + * bwa-0.5.5-8 (r1300) + * output kurtosis + +------------------------------------------------------------------------ +r1299 | lh3 | 2010-02-09 12:33:34 -0500 (Tue, 09 Feb 2010) | 5 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/main.c + + * bwa-0.5.5-7 (r1299) + * calculate skewness in sampe + * increase min_len in SW to 20 + * perform more SW to fix discordant pairs + +------------------------------------------------------------------------ +r1298 | lh3 | 
2010-02-08 12:40:31 -0500 (Mon, 08 Feb 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/cs2nt.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/stdaln.h + + * bwa-0.5.5-6 (r1297) + * prepare to replace all 16-bit CIGAR (patches by Rodrigo Goya) + +------------------------------------------------------------------------ +r1297 | lh3 | 2010-02-05 22:26:11 -0500 (Fri, 05 Feb 2010) | 2 lines +Changed paths: + M /branches/prog/bwa/solid2fastq.pl + +the old fix seems not working! + +------------------------------------------------------------------------ +r1296 | lh3 | 2010-02-05 21:51:03 -0500 (Fri, 05 Feb 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/main.c + + * bwa-0.5.5-5 (r1296) + * fixed a minor issue that the lower bound of insert size is not correctly set. + +------------------------------------------------------------------------ +r1295 | lh3 | 2010-02-05 21:01:10 -0500 (Fri, 05 Feb 2010) | 5 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwaseqio.c + M /branches/prog/bwa/main.c + + * bwa-0.5.5-4 (r1295) + * fixed a memory leak + * change the behaviour of -n (samse and sampe) + * change the default of -n + +------------------------------------------------------------------------ +r1294 | lh3 | 2010-02-05 17:24:06 -0500 (Fri, 05 Feb 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwaseqio.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + + * bwa-0.5.5-3 (r1294) + * improved multi-hit report + +------------------------------------------------------------------------ +r1293 | lh3 | 2010-02-05 12:57:38 -0500 (Fri, 05 Feb 2010) | 5 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M 
/branches/prog/bwa/cs2nt.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/solid2fastq.pl + + * bwa-0.5.5-2 (r1293) + * bugfix: truncated quality string + * bugfix: quality -1 in solid->fastq conversion + * bugfix: color reads on the reverse strand is not complemented + +------------------------------------------------------------------------ +r1279 | lh3 | 2009-11-23 22:42:34 -0500 (Mon, 23 Nov 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/bntseq.h + M /branches/prog/bwa/bwase.c + A /branches/prog/bwa/bwase.h + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/main.c + + * bwa-0.5.5-1 (r1279) + * incorporate changes from Matt Hanna for Java bindings. + +------------------------------------------------------------------------ +r1275 | lh3 | 2009-11-10 22:13:10 -0500 (Tue, 10 Nov 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + +update ChangeLog + +------------------------------------------------------------------------ +r1273 | lh3 | 2009-11-10 22:08:16 -0500 (Tue, 10 Nov 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/main.c + A /branches/prog/bwa/qualfa2fq.pl + +Release bwa-0.5.5 (r1273) + +------------------------------------------------------------------------ +r1272 | lh3 | 2009-11-10 22:02:50 -0500 (Tue, 10 Nov 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/main.c + + * bwa-0.5.4-3 (r1272) + * fixed another typo which may lead to incorrect single-end mapping quality + +------------------------------------------------------------------------ +r1271 | lh3 | 2009-11-10 21:59:47 -0500 (Tue, 10 Nov 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.5.4-2 (r1271) + * fixed a serious typo/bug which does not hurt if we allow one gap open + and work with <200bp reads, but causes 
segfault for long reads. + +------------------------------------------------------------------------ +r1270 | lh3 | 2009-11-09 23:12:42 -0500 (Mon, 09 Nov 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/cs2nt.c + M /branches/prog/bwa/main.c + + * bwa-0.5.4-1 (r1270) + * fixed a bug in color alignment + +------------------------------------------------------------------------ +r1245 | lh3 | 2009-10-09 07:42:52 -0400 (Fri, 09 Oct 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwaseqio.c + M /branches/prog/bwa/main.c + +Release bwa-0.5.4 + +------------------------------------------------------------------------ +r1244 | lh3 | 2009-10-09 05:53:52 -0400 (Fri, 09 Oct 2009) | 5 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwaseqio.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + M /branches/prog/bwa/stdaln.c + + * bwa-0.5.3-4 (r1244) + * output the clipped length in XC:i: tag + * skip mate alignment when stdaln is buggy + * fixed a bug in NM:i: tag + +------------------------------------------------------------------------ +r1243 | lh3 | 2009-10-07 08:15:04 -0400 (Wed, 07 Oct 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/main.c + + * bwa-0.5.3-3 (r1243) + * sampe: fixed a bug when a read sequence is identical its reverse complement. 
+ +------------------------------------------------------------------------ +r1242 | lh3 | 2009-10-07 07:49:13 -0400 (Wed, 07 Oct 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + + * bwa-0.5.3-2 (r1242) + * sampe: optionall preload the full index into memory + * aln: change the default seed length to 32bp + +------------------------------------------------------------------------ +r1238 | lh3 | 2009-09-26 18:38:15 -0400 (Sat, 26 Sep 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/khash.h + +Improve portability of khash.h + +------------------------------------------------------------------------ +r1228 | lh3 | 2009-09-15 09:20:22 -0400 (Tue, 15 Sep 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/main.c + +fixed a typo + +------------------------------------------------------------------------ +r1227 | lh3 | 2009-09-15 09:19:35 -0400 (Tue, 15 Sep 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.5.3-1 (r1226) + * in dBWT-SW, optionall use hard clipping instead of soft clipping + +------------------------------------------------------------------------ +r1225 | lh3 | 2009-09-15 08:32:30 -0400 (Tue, 15 Sep 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + +Release bwa-0.5.3 (r1225) + +------------------------------------------------------------------------ +r1223 | lh3 | 2009-09-13 07:30:41 -0400 (Sun, 13 Sep 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/main.c + +Release bwa-0.5.2 + +------------------------------------------------------------------------ +r1222 | lh3 | 2009-09-11 09:11:39 -0400 (Fri, 11 Sep 2009) | 3 
lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.5.1-5 (r1222) + * fixed a typo. No real change + +------------------------------------------------------------------------ +r1221 | lh3 | 2009-09-11 09:09:44 -0400 (Fri, 11 Sep 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwaseqio.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + + * bwa-0.5.1-4 (r1221) + * trim reads before alignment + +------------------------------------------------------------------------ +r1216 | lh3 | 2009-09-08 17:50:15 -0400 (Tue, 08 Sep 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + + * bwa-0.5.1-3 (r1216) + * fixed a bug about NM tags for gapped alignment + * print SAM header + +------------------------------------------------------------------------ +r1215 | lh3 | 2009-09-08 17:14:42 -0400 (Tue, 08 Sep 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.5.1-2 (r1215) + * fixed a bug when read lengths vary (by John Marshall) + +------------------------------------------------------------------------ +r1213 | lh3 | 2009-09-06 18:58:15 -0400 (Sun, 06 Sep 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/main.c + + * bwa-0.5.1-1 (r1213) + * change default -T to 30 + +------------------------------------------------------------------------ +r1209 | lh3 | 2009-09-02 06:06:02 -0400 (Wed, 02 Sep 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/main.c + +Release bwa-0.5.1 + +------------------------------------------------------------------------ +r1208 | lh3 | 2009-09-02 05:56:33 -0400 (Wed, 02 Sep 2009) | 2 lines +Changed paths: + M 
/branches/prog/bwa/ChangeLog + + * ChangeLog + +------------------------------------------------------------------------ +r1206 | lh3 | 2009-08-30 18:27:30 -0400 (Sun, 30 Aug 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/main.c + + * bwa-0.5.0-6 (r1206) + * fixed two bugs caused by previous modification + +------------------------------------------------------------------------ +r1205 | lh3 | 2009-08-30 17:28:36 -0400 (Sun, 30 Aug 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/main.c + + * bwa-0.5.0-4 (r1205) + * reduce false coordinates and CIGAR when a query bridges two reference + sequences, although some very rare cases may fail bwa. + +------------------------------------------------------------------------ +r1204 | lh3 | 2009-08-30 06:06:16 -0400 (Sun, 30 Aug 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + * bwa-0.5.0-3 (r1204) + * choose one repetitive hit to extend + +------------------------------------------------------------------------ +r1203 | lh3 | 2009-08-29 18:11:51 -0400 (Sat, 29 Aug 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/main.c + + * bwa-0.5.0-2 (r1203) + * dBWT-SW: change a parameter in calculating mapping quality + * fixed a bug in samse + +------------------------------------------------------------------------ +r1202 | lh3 | 2009-08-28 19:48:41 -0400 (Fri, 28 Aug 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/main.c + + * bwa-0.5.0-1 (r1202) + * change default band width to 50 + * improve mapping quality a bit + +------------------------------------------------------------------------ +r1200 | lh3 | 2009-08-20 06:21:24 -0400 (Thu, 20 Aug 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/NEWS + M 
/branches/prog/bwa/main.c + +Release bwa-0.5.0 (r1200) + +------------------------------------------------------------------------ +r1199 | lh3 | 2009-08-20 04:49:15 -0400 (Thu, 20 Aug 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/bwa.1 + +Updated ChangeLog and the manual + +------------------------------------------------------------------------ +r1198 | lh3 | 2009-08-19 11:09:15 -0400 (Wed, 19 Aug 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-36 (r1198) + * simplify duphits removal. The accuracy is changed a tiny bit, sometimes better, sometimes worse. + +------------------------------------------------------------------------ +r1197 | lh3 | 2009-08-19 08:15:05 -0400 (Wed, 19 Aug 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwtsw2_aux.c + A /branches/prog/bwa/bwtsw2_chain.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-35 (r1197) + * further heuristic acceleration for long queries + +------------------------------------------------------------------------ +r1196 | lh3 | 2009-08-18 06:54:03 -0400 (Tue, 18 Aug 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-34 (r1196) + * updated the manual page + * output base quality if the input is fastq + +------------------------------------------------------------------------ +r1195 | lh3 | 2009-08-18 06:23:00 -0400 (Tue, 18 Aug 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/simple_dp.c + + * bwa-0.4.9-33 (r1191) + * fixed a bug in sampe/samse when gaps occur to the 5'-end in SW alignment + * in dbwtsw adjust -T and -c according to -a + +------------------------------------------------------------------------ +r1192 | lh3 | 2009-08-13 05:37:28 
-0400 (Thu, 13 Aug 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + +update manual + +------------------------------------------------------------------------ +r1191 | lh3 | 2009-08-12 19:40:51 -0400 (Wed, 12 Aug 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwtsw2_main.c + +update documentation + +------------------------------------------------------------------------ +r1190 | lh3 | 2009-08-12 08:56:10 -0400 (Wed, 12 Aug 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-32 (r1190) + * only help messages are changed + +------------------------------------------------------------------------ +r1189 | lh3 | 2009-08-11 09:28:55 -0400 (Tue, 11 Aug 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-31 (r1189) + * in bwape/bwase, print CIGAR "*" if the read is unmapped + * improved the calculation of mapping quality + +------------------------------------------------------------------------ +r1181 | lh3 | 2009-08-03 12:09:41 -0400 (Mon, 03 Aug 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + +fflush() + +------------------------------------------------------------------------ +r1180 | lh3 | 2009-08-03 12:08:46 -0400 (Mon, 03 Aug 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-30 (r1180) + * fixed a memory problem + * multi-threading sometimes does not work... 
+ +------------------------------------------------------------------------ +r1179 | lh3 | 2009-08-03 11:04:39 -0400 (Mon, 03 Aug 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-29 (r1179) + * preliminary mutli-threading support in dbwtsw + +------------------------------------------------------------------------ +r1178 | lh3 | 2009-08-03 09:14:54 -0400 (Mon, 03 Aug 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-28 (r1178) + * fixed a bug in printing repetitive hits + +------------------------------------------------------------------------ +r1177 | lh3 | 2009-08-03 05:03:42 -0400 (Mon, 03 Aug 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-27 (r1177) + * bwtsw2: fixed a hidden memory leak + +------------------------------------------------------------------------ +r1176 | lh3 | 2009-07-31 10:58:24 -0400 (Fri, 31 Jul 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-26 + * change the way mapping quality is calculated + +------------------------------------------------------------------------ +r1175 | lh3 | 2009-07-31 09:15:54 -0400 (Fri, 31 Jul 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-25 + * code clean up + * automatically adjust ->t and ->is_rev based on input + +------------------------------------------------------------------------ +r1174 | lh3 | 2009-07-30 08:50:25 -0400 (Thu, 30 Jul 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M 
/branches/prog/bwa/main.c + + * bwa-0.4.9-24 + * fixed a bug in printing the hits + +------------------------------------------------------------------------ +r1173 | lh3 | 2009-07-29 18:32:43 -0400 (Wed, 29 Jul 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-23 + * allow to skip reverse alignment + * increase opt->t to 37 + +------------------------------------------------------------------------ +r1172 | lh3 | 2009-07-29 17:22:39 -0400 (Wed, 29 Jul 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-22 + * report if the hit is found in both directions + +------------------------------------------------------------------------ +r1171 | lh3 | 2009-07-29 17:12:02 -0400 (Wed, 29 Jul 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-21 + * dbwtsw: map to both forward and reverse BWT to reduce false alignment + +------------------------------------------------------------------------ +r1170 | lh3 | 2009-07-29 15:25:14 -0400 (Wed, 29 Jul 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + +save hits before cut_tail() + +------------------------------------------------------------------------ +r1169 | lh3 | 2009-07-29 08:06:01 -0400 (Wed, 29 Jul 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/stdaln.c + M /branches/prog/bwa/stdaln.h + + * bwa-0.4.9-19 + * use a global memory pool to reduce the CPU time spent on malloc/free(). 
+ +------------------------------------------------------------------------ +r1168 | lh3 | 2009-07-29 06:13:29 -0400 (Wed, 29 Jul 2009) | 5 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-18 + * reduce unnecessary extension to the 5'-end + * allow to use different interval size for the 2 rounds + * change default parameters + +------------------------------------------------------------------------ +r1167 | lh3 | 2009-07-28 19:06:17 -0400 (Tue, 28 Jul 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-17 + * dbwtsw: fixed THE memory leak. + +------------------------------------------------------------------------ +r1166 | lh3 | 2009-07-28 16:31:41 -0400 (Tue, 28 Jul 2009) | 5 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/stdaln.c + + * bwa-0.4.9-16 + * fixed a memory leak + * a small memory leak still occurs to bwtsw2_core(). I will work on that later. + * changed the default parameters + +------------------------------------------------------------------------ +r1165 | lh3 | 2009-07-28 10:15:40 -0400 (Tue, 28 Jul 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/stdaln.c + + * bwa-0.4.9-15 + * generate CIGAR right before output. This saves unnecessary computation. + * this version may be buggy as I have not tested it. 
+ +------------------------------------------------------------------------ +r1164 | lh3 | 2009-07-28 09:04:14 -0400 (Tue, 28 Jul 2009) | 11 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/stdaln.c + M /branches/prog/bwa/stdaln.h + + * bwa-0.4.9-14 + + * deplete unique hits in dbwtsw and postprocess them with standard sw + + * in principle, this strategy should be faster and more accurate, but I + have not tested this point. I may switch back to the old method if + this does not work. + + * the code looks quite nasty now. it needs clean up... + + +------------------------------------------------------------------------ +r1163 | lh3 | 2009-07-27 17:41:10 -0400 (Mon, 27 Jul 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + +change a default parameter + +------------------------------------------------------------------------ +r1162 | lh3 | 2009-07-27 17:04:35 -0400 (Mon, 27 Jul 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-13 + * dbwtsw: switch between small and large Z-best + +------------------------------------------------------------------------ +r1161 | lh3 | 2009-07-27 12:17:41 -0400 (Mon, 27 Jul 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-12 + * changed the default -z to 100 + * heuristically speed up alignments for polyA reads + +------------------------------------------------------------------------ +r1160 | lh3 | 2009-07-27 07:50:57 -0400 (Mon, 27 Jul 2009) | 6 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + *
bwa-0.4.9-11 + + * dbwtsw potentially generates less false alignments, although in + practice, the modification brings no improvement. + + +------------------------------------------------------------------------ +r1159 | lh3 | 2009-07-27 04:37:02 -0400 (Mon, 27 Jul 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-10 + * disabled debugging code + * add "BAM_FMU" if both ends are unmapped + +------------------------------------------------------------------------ +r1158 | lh3 | 2009-07-24 09:36:52 -0400 (Fri, 24 Jul 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/main.c + +nothing, really + +------------------------------------------------------------------------ +r1157 | lh3 | 2009-07-24 09:05:44 -0400 (Fri, 24 Jul 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-9 + * bwtsw2: generate SAM output + +------------------------------------------------------------------------ +r1156 | lh3 | 2009-07-24 05:42:47 -0400 (Fri, 24 Jul 2009) | 6 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-8 + + * fixed a weird deadloop which only happens to icc -O3. Thanks John + Marshall for the fix. + + +------------------------------------------------------------------------ +r1155 | lh3 | 2009-07-24 05:28:40 -0400 (Fri, 24 Jul 2009) | 8 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-7 + + * fixed a typo in bwtsw2 alignment. Now score from the standard SW + seems to agree with score from bwtsw2, except that in reporting + alignments, bwtsw2 may report non-optimal segments. This is expected, + though. I will improve in future. 
+ + +------------------------------------------------------------------------ +r1154 | lh3 | 2009-07-23 17:40:20 -0400 (Thu, 23 Jul 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/stdaln.c + M /branches/prog/bwa/stdaln.h + + * aln_left_core() seems to work properly + * aln_local_core() has a bug... AN EVER EXISTING BUG!!!!!!!!!!! + +------------------------------------------------------------------------ +r1153 | lh3 | 2009-07-23 17:06:09 -0400 (Thu, 23 Jul 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/stdaln.c + +removed debugging code... + +------------------------------------------------------------------------ +r1152 | lh3 | 2009-07-23 17:01:00 -0400 (Thu, 23 Jul 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/stdaln.c + + * radical changes failed... + * fixed a bug + +------------------------------------------------------------------------ +r1151 | lh3 | 2009-07-23 14:46:35 -0400 (Thu, 23 Jul 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/stdaln.c + +temporary changes. Will apply some radical changes to this file... + +------------------------------------------------------------------------ +r1150 | lh3 | 2009-07-23 10:09:56 -0400 (Thu, 23 Jul 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/stdaln.c + +fixed a long-existing bug in Smith-Waterman alignment + +------------------------------------------------------------------------ +r1149 | lh3 | 2009-07-23 08:50:52 -0400 (Thu, 23 Jul 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/simple_dp.c + M /branches/prog/bwa/stdaln.c + M /branches/prog/bwa/stdaln.h + + * bwa-0.4.9-6 + * unexplained inconsistency still occurs, but the results largely look reasonable. 
+ +------------------------------------------------------------------------ +r1148 | lh3 | 2009-07-23 08:07:29 -0400 (Thu, 23 Jul 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/stdaln.c + +half DP + +------------------------------------------------------------------------ +r1147 | lh3 | 2009-07-22 08:03:06 -0400 (Wed, 22 Jul 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + +a bit code clean up + +------------------------------------------------------------------------ +r1145 | lh3 | 2009-07-21 15:52:05 -0400 (Tue, 21 Jul 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-5 + * fixed a bug in determining sub-optimal hits + * removed some debugging codes + +------------------------------------------------------------------------ +r1144 | lh3 | 2009-07-21 10:17:29 -0400 (Tue, 21 Jul 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-4 + * better cmd interface + * faster speed + +------------------------------------------------------------------------ +r1143 | lh3 | 2009-07-20 16:38:18 -0400 (Mon, 20 Jul 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + +bwtsw2 (dBWT-SW) is working apparently... 
+ + +------------------------------------------------------------------------ +r1139 | lh3 | 2009-07-15 05:52:18 -0400 (Wed, 15 Jul 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-2 + * bwtsw2: change cut_tail() such that it is faster but more likely to + miss true hits + +------------------------------------------------------------------------ +r1138 | lh3 | 2009-07-15 05:18:42 -0400 (Wed, 15 Jul 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/Makefile + A /branches/prog/bwa/bwt_lite.c + A /branches/prog/bwa/bwt_lite.h + A /branches/prog/bwa/bwtsw2.h + A /branches/prog/bwa/bwtsw2_aux.c + A /branches/prog/bwa/bwtsw2_core.c + A /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + + * bwa-0.4.9-1 + * added back bwtsw2 + +------------------------------------------------------------------------ +r1075 | lh3 | 2009-05-19 05:14:50 -0400 (Tue, 19 May 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + +Release bwa-0.4.9 + +------------------------------------------------------------------------ +r1073 | lh3 | 2009-05-18 17:13:19 -0400 (Mon, 18 May 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/main.c + +Release bwa-0.4.8 + +------------------------------------------------------------------------ +r1069 | lh3 | 2009-05-14 09:54:54 -0400 (Thu, 14 May 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.4.7-2 + * change the default of "aln -R" to 30 + +------------------------------------------------------------------------ +r1068 | lh3 | 2009-05-14 09:27:55 -0400 (Thu, 14 May 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtgap.c + M 
/branches/prog/bwa/main.c + + * bwa-0.4.7-1 + * search for suboptimal hits if the top hit is not so repetitive + +------------------------------------------------------------------------ +r1066 | lh3 | 2009-05-12 15:31:31 -0400 (Tue, 12 May 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + +Release bwa-0.4.7 + +------------------------------------------------------------------------ +r1065 | lh3 | 2009-05-12 15:20:40 -0400 (Tue, 12 May 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + + * bwa-0.4.6-9 + * fixed compiling errors on some Linux machines + +------------------------------------------------------------------------ +r1064 | lh3 | 2009-05-12 07:30:46 -0400 (Tue, 12 May 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.4.6-8 + * avoid compilation error on some systems. 
+ +------------------------------------------------------------------------ +r1035 | lh3 | 2009-05-09 05:41:33 -0400 (Sat, 09 May 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + + * bwa-0.4.6-7 + * fixed an integer overflow caused by previous modifications + * made insert size estimation more robust + +------------------------------------------------------------------------ +r1008 | lh3 | 2009-04-29 05:41:58 -0400 (Wed, 29 Apr 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + + * bwa-0.4.6-5 + * fixed an integer overflow problem which may cause seg fault in very rare cases + * made XN tags more accurate + +------------------------------------------------------------------------ +r1005 | lh3 | 2009-04-27 07:37:23 -0400 (Mon, 27 Apr 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/simple_dp.c + M /branches/prog/bwa/stdaln.c + M /branches/prog/bwa/stdaln.h + + * bwa-0.4.6-4 + * heuristic rules to detect suboptimal alignment + * stdsw: support double-strand and protein alignment + +------------------------------------------------------------------------ +r1003 | lh3 | 2009-04-26 12:48:19 -0400 (Sun, 26 Apr 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/main.c + M /branches/prog/bwa/simple_dp.c + M /branches/prog/bwa/stdaln.c + M /branches/prog/bwa/stdaln.h + + * bwa-0.4.6-2 + * improve the functionality of stdsw + * allow to add a threshold on SW alignment. Hope this does not incur new bugs...
+ +------------------------------------------------------------------------ +r1002 | lh3 | 2009-04-22 03:56:15 -0400 (Wed, 22 Apr 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + + * bwa-0.4.6-1 + * output SM and AM tag + +------------------------------------------------------------------------ +r914 | lh3 | 2009-03-09 17:53:50 -0400 (Mon, 09 Mar 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/main.c + +Release bwa-0.4.6 + +------------------------------------------------------------------------ +r913 | lh3 | 2009-03-09 17:23:24 -0400 (Mon, 09 Mar 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwape.c + A /branches/prog/bwa/solid2fastq.pl + + * added notes to bwa + * added a script to convert SOLiD reads + * updated documentations + +------------------------------------------------------------------------ +r912 | lh3 | 2009-03-09 16:57:05 -0400 (Mon, 09 Mar 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/kstring.c + M /branches/prog/bwa/main.c + +fixed a bug in kstring + +------------------------------------------------------------------------ +r881 | lh3 | 2009-03-02 15:36:06 -0500 (Mon, 02 Mar 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtmisc.c + M /branches/prog/bwa/main.c + + * bwa-0.4.5-7 + * fixed a bug in pac2cspac + +------------------------------------------------------------------------ +r880 | lh3 | 2009-03-01 16:34:08 -0500 (Sun, 01 Mar 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/Makefile + +disable debugging + +------------------------------------------------------------------------ +r879 | lh3 | 2009-03-01 16:28:04 -0500 (Sun, 01 Mar 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/cs2nt.c + M 
/branches/prog/bwa/main.c + + * bwa-0.4.5-6 + * fixed problems with coordinates for color gapped alignment + +------------------------------------------------------------------------ +r878 | lh3 | 2009-03-01 13:43:09 -0500 (Sun, 01 Mar 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/cs2nt.c + M /branches/prog/bwa/main.c + + * bwa-0.4.5-5 + * added support for gapped color alignment + +------------------------------------------------------------------------ +r877 | lh3 | 2009-03-01 10:27:52 -0500 (Sun, 01 Mar 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/cs2nt.c + M /branches/prog/bwa/main.c + + * convert cs read to nt read (for ungapped alignment only) + +------------------------------------------------------------------------ +r860 | lh3 | 2009-02-27 08:58:39 -0500 (Fri, 27 Feb 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwase.c + A /branches/prog/bwa/cs2nt.c + +prepare to implement cs->nt conversion (have not yet...) + +------------------------------------------------------------------------ +r859 | lh3 | 2009-02-27 07:00:03 -0500 (Fri, 27 Feb 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/bntseq.h + M /branches/prog/bwa/bwtindex.c + M /branches/prog/bwa/bwtmisc.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + + * bwa-0.4.5-3 + * generate color index from nucleotide fasta reference + +------------------------------------------------------------------------ +r857 | lh3 | 2009-02-26 10:22:58 -0500 (Thu, 26 Feb 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/main.c + + * bwa-0.4.5-2 + * improved mapping quality a bit if one end falls in a tandem repeat + but the mate is unique. 
+ +------------------------------------------------------------------------ +r856 | lh3 | 2009-02-26 10:02:29 -0500 (Thu, 26 Feb 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + + * bwa-0.4.5-1 + * make bwa work for SOLiD reads + +------------------------------------------------------------------------ +r828 | lh3 | 2009-02-18 17:36:41 -0500 (Wed, 18 Feb 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/main.c + +Release bwa-0.4.5 + +------------------------------------------------------------------------ +r827 | lh3 | 2009-02-18 16:48:48 -0500 (Wed, 18 Feb 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/main.c + M /branches/prog/bwa/stdaln.c + M /branches/prog/bwa/stdaln.h + + * bwa-0.4.4-6 + * fixed a bug in SW alignment when no residue matches + +------------------------------------------------------------------------ +r824 | lh3 | 2009-02-17 05:33:07 -0500 (Tue, 17 Feb 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/main.c + + * bwa-0.4.4-5 + * fixed that boundary bug + +------------------------------------------------------------------------ +r823 | lh3 | 2009-02-17 04:54:18 -0500 (Tue, 17 Feb 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/bwape.c + +just change some logging information + +------------------------------------------------------------------------ +r822 | lh3 | 2009-02-17 04:20:39 -0500 (Tue, 17 Feb 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + +update manual + +------------------------------------------------------------------------ +r821 | lh3 | 2009-02-17 04:11:14 -0500 (Tue, 17 Feb 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + + * bwa-0.4.4-4 + * fixed a bug on boundary check in pair_sw +
+------------------------------------------------------------------------ +r820 | lh3 | 2009-02-16 17:43:37 -0500 (Mon, 16 Feb 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.4.4-3 + * allow to change mismatch penalty + +------------------------------------------------------------------------ +r819 | lh3 | 2009-02-16 17:40:28 -0500 (Mon, 16 Feb 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.4.4-2 + * remove timer + * allow to change default gapo and gape penalty at the command line + +------------------------------------------------------------------------ +r818 | lh3 | 2009-02-16 09:30:51 -0500 (Mon, 16 Feb 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + +update benchmark + +------------------------------------------------------------------------ +r817 | lh3 | 2009-02-16 08:44:40 -0500 (Mon, 16 Feb 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/kvec.h + M /branches/prog/bwa/main.c + + * bwa-0.4.4-1 + * automatically detect insert size + * use insert size in pairing. This may potentially improve accuracy (untested!) 
+ +------------------------------------------------------------------------ +r814 | lh3 | 2009-02-15 11:10:23 -0500 (Sun, 15 Feb 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/main.c + +Release bwa-0.4.4 + +------------------------------------------------------------------------ +r813 | lh3 | 2009-02-15 10:22:50 -0500 (Sun, 15 Feb 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + + * bwa-0.4.3-5 + * impose boundary check in refine_gapped + +------------------------------------------------------------------------ +r811 | lh3 | 2009-02-14 09:46:13 -0500 (Sat, 14 Feb 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + + * bwa-0.4.3-4 + * change MD tag to match the latest SAM specification + +------------------------------------------------------------------------ +r810 | lh3 | 2009-02-13 04:46:04 -0500 (Fri, 13 Feb 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + +update ChangeLog + +------------------------------------------------------------------------ +r799 | lh3 | 2009-02-05 12:01:17 -0500 (Thu, 05 Feb 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + +change MD tag to meet the latest SAM specification + +------------------------------------------------------------------------ +r796 | lh3 | 2009-02-05 08:35:13 -0500 (Thu, 05 Feb 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + + * bwa-0.4.3-2 + * fixed a bug on counting 'N' + +------------------------------------------------------------------------ +r795 | lh3 | 2009-02-05 07:41:27 -0500 (Thu, 05 Feb 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/main.c + + * bwa-0.4.3-1 + * fixed potential boundary problems + * update 
benchmark result + +------------------------------------------------------------------------ +r791 | lh3 | 2009-01-25 05:20:47 -0500 (Sun, 25 Jan 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + +update some numbers + +------------------------------------------------------------------------ +r790 | lh3 | 2009-01-24 15:13:03 -0500 (Sat, 24 Jan 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + +update benchmark + +------------------------------------------------------------------------ +r789 | lh3 | 2009-01-22 10:18:44 -0500 (Thu, 22 Jan 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtindex.c + +a warning message for index + +------------------------------------------------------------------------ +r788 | lh3 | 2009-01-22 09:54:06 -0500 (Thu, 22 Jan 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/main.c + +forget to change release number + +------------------------------------------------------------------------ +r786 | lh3 | 2009-01-22 06:27:39 -0500 (Thu, 22 Jan 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/NEWS + +Release bwa-0.4.3 + +------------------------------------------------------------------------ +r785 | lh3 | 2009-01-22 06:27:16 -0500 (Thu, 22 Jan 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + +Release bwa-0.4.3 + +------------------------------------------------------------------------ +r784 | lh3 | 2009-01-22 06:19:59 -0500 (Thu, 22 Jan 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + + * bwa-0.4.2-10 + * update documentation + * fixed a bug on generating MD tags for SW alignment + +------------------------------------------------------------------------ +r782 | lh3 | 2009-01-19 12:08:38 -0500 (Mon, 19 Jan 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + + * bwa-0.4.2-9 + * fixed a bug in samse -n... 
+ +------------------------------------------------------------------------ +r781 | lh3 | 2009-01-19 11:26:37 -0500 (Mon, 19 Jan 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.4.2-8 + * given -N, the previous version would stop if the top hit is a repeat. Now changed. + +------------------------------------------------------------------------ +r780 | lh3 | 2009-01-19 11:20:18 -0500 (Mon, 19 Jan 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/main.c + + * bwa-0.4.2-7 + * use a bit-wise flag to replace some member variables in the option struct + * allow to switch off the iterative strategy + +------------------------------------------------------------------------ +r779 | lh3 | 2009-01-19 10:45:57 -0500 (Mon, 19 Jan 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + + * bwa-0.4.2-6 + * allow to dump multiple hits from samse, in another format, though + +------------------------------------------------------------------------ +r778 | lh3 | 2009-01-19 06:24:29 -0500 (Mon, 19 Jan 2009) | 5 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwaseqio.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/kseq.h + A /branches/prog/bwa/kstring.c + A /branches/prog/bwa/kstring.h + M /branches/prog/bwa/main.c + M /branches/prog/bwa/simple_dp.c + + * bwa-0.4.2-5 + * update kseq.h to the latest version + * generate MD tag + * print mate coordinate if only one end is unmapped + +------------------------------------------------------------------------ +r775 | lh3 | 2009-01-18 05:40:35 -0500 (Sun, 18 Jan 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + + 
* bwa-0.4.2-4 + * fixed a bug for SAM format + +------------------------------------------------------------------------ +r774 | lh3 | 2009-01-17 13:48:52 -0500 (Sat, 17 Jan 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.4.2-3 + * change default fnr to 0.04 + * print max_diff for valid fnr + +------------------------------------------------------------------------ +r773 | lh3 | 2009-01-17 05:54:37 -0500 (Sat, 17 Jan 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + + * bwa-0.4.2-2 + * automatically choose max_diff + +------------------------------------------------------------------------ +r772 | lh3 | 2009-01-16 18:16:14 -0500 (Fri, 16 Jan 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwaseqio.c + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/main.c + + * bwa-0.4.2-1 + * take N as a mismatch + +------------------------------------------------------------------------ +r768 | lh3 | 2009-01-09 11:57:23 -0500 (Fri, 09 Jan 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/main.c + +Release bwa-0.4.2 + +------------------------------------------------------------------------ +r759 | lh3 | 2009-01-07 09:55:43 -0500 (Wed, 07 Jan 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + +Release bwa-0.4.1 + +------------------------------------------------------------------------ +r758 | lh3 | 2009-01-07 05:36:06 -0500 (Wed, 07 Jan 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M 
/branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + + * bwa-0.4.0-2 + * make mate_sw fully working + +------------------------------------------------------------------------ +r757 | lh3 | 2009-01-06 18:04:29 -0500 (Tue, 06 Jan 2009) | 5 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwaseqio.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + + * bwa-0.4.0-1 + * do SW alignment for unmapped mate. It is working. + * I still need to do some extra work for SW alignment, but it is too late + and I am getting tired... I will do tomorrow. + +------------------------------------------------------------------------ +r755 | lh3 | 2009-01-06 10:23:29 -0500 (Tue, 06 Jan 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/main.c + +Release bwa-0.4.0 + +------------------------------------------------------------------------ +r754 | lh3 | 2009-01-06 07:45:02 -0500 (Tue, 06 Jan 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/bwtgap.h + M /branches/prog/bwa/main.c + + * bwa-0.3.0-12 + * better lock + +------------------------------------------------------------------------ +r753 | lh3 | 2009-01-06 06:17:21 -0500 (Tue, 06 Jan 2009) | 5 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwaseqio.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/main.c + + * bwa-0.3.0-11 + * fixed a small memory leak in bwa_seq_close() + * fixed "uninitialized memory" from bwt_aln1_t + * multithreading for "aln" command + +------------------------------------------------------------------------ +r752 | lh3 | 2009-01-05 17:34:13 -0500 (Mon, 05 Jan 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/Makefile + D 
/branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwt_gen/bwt_gen.c + A /branches/prog/bwa/bwtmisc.c (from /branches/prog/bwa/pac2bwt.c:748) + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + D /branches/prog/bwa/pac2bwt.c + + * bwa-0.3.0-10 + * a little bit code clean up + +------------------------------------------------------------------------ +r751 | lh3 | 2009-01-05 17:19:04 -0500 (Mon, 05 Jan 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/main.c + + * bwa-0.3.0-9 + * use 64-bit integer to speed up Occ calculation, although just a little bit + +------------------------------------------------------------------------ +r750 | lh3 | 2009-01-05 16:44:26 -0500 (Mon, 05 Jan 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/main.c + + * bwa-0.3.0-8 + * a little bit code cleanup + +------------------------------------------------------------------------ +r749 | lh3 | 2009-01-05 16:37:28 -0500 (Mon, 05 Jan 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/main.c + + * bwa-0.3.0-7 + * accelerate Occ calculation + +------------------------------------------------------------------------ +r748 | lh3 | 2009-01-05 16:12:28 -0500 (Mon, 05 Jan 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt.h + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtindex.c + M /branches/prog/bwa/bwtio.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + M /branches/prog/bwa/pac2bwt.c + + * bwa-0.3.0-6 + * put occ table along with bwt to save another cache miss + * this version is already faster than the previous and I can still improve it...
+ +------------------------------------------------------------------------ +r747 | lh3 | 2009-01-05 10:16:18 -0500 (Mon, 05 Jan 2009) | 5 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt.h + M /branches/prog/bwa/bwtio.c + M /branches/prog/bwa/main.c + + * bwa-0.3.0-5 + * remove occ_major to save a cache miss; however, OCC_INTERVAL has to be + increased to keep the same memory. As a result, the speed is a little + slower in fact. + +------------------------------------------------------------------------ +r746 | lh3 | 2009-01-05 09:50:53 -0500 (Mon, 05 Jan 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/main.c + + * bwa-0.3.0-4 + * added back optimization codes (it is a pain...) + +------------------------------------------------------------------------ +r745 | lh3 | 2009-01-05 08:23:00 -0500 (Mon, 05 Jan 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.3.0-3 + * faster bit operations + +------------------------------------------------------------------------ +r744 | lh3 | 2009-01-05 05:58:46 -0500 (Mon, 05 Jan 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/main.c + + * bwa-0.3.0-2 + * removed optimization codes again... 
+ * use a new method to count the bits + +------------------------------------------------------------------------ +r743 | lh3 | 2009-01-04 17:18:38 -0500 (Sun, 04 Jan 2009) | 5 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/main.c + + * bwa-0.3.0-1 + * added back the optimization codes + * added a new option to aln: max_entries, although this is disabled by default + * updated benchmark + +------------------------------------------------------------------------ +r742 | lh3 | 2009-01-04 07:56:12 -0500 (Sun, 04 Jan 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + +add URL + +------------------------------------------------------------------------ +r740 | lh3 | 2009-01-04 07:39:43 -0500 (Sun, 04 Jan 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/main.c + +Release bwa-0.3.0 + +------------------------------------------------------------------------ +r739 | lh3 | 2009-01-04 06:55:06 -0500 (Sun, 04 Jan 2009) | 2 lines +Changed paths: + A /branches/prog/bwa/COPYING + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/bntseq.h + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt.h + M /branches/prog/bwa/bwtindex.c + M /branches/prog/bwa/utils.c + M /branches/prog/bwa/utils.h + +added licensing information + +------------------------------------------------------------------------ +r738 | lh3 | 2009-01-04 06:18:25 -0500 (Sun, 04 Jan 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-31 + * better mapping quality + * update benchmark + +------------------------------------------------------------------------ +r737 | lh3 | 2009-01-03 16:00:58 -0500 (Sat, 03 Jan 
2009) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/bwa.1 + +update documentation + +------------------------------------------------------------------------ +r736 | lh3 | 2009-01-02 10:26:38 -0500 (Fri, 02 Jan 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + +update documentation + +------------------------------------------------------------------------ +r735 | lh3 | 2009-01-02 07:10:20 -0500 (Fri, 02 Jan 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-30 + * reduce memory a little bit + * update documentation + +------------------------------------------------------------------------ +r734 | lh3 | 2009-01-01 13:45:45 -0500 (Thu, 01 Jan 2009) | 8 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-29 + * sampe: removed -O option; changed default -o to 100000 + * sampe: fixed a bug in calculating paired mapping quality + * aln: added an option to search for suboptimal hits even if the best is a repeat. + This option will make sampe MUCH SLOWER. 
+ * sampe: set isize as zero if mapped to two different chr + * update manual (unfinished) + +------------------------------------------------------------------------ +r733 | lh3 | 2009-01-01 11:01:20 -0500 (Thu, 01 Jan 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-28 + * fixed a bug in calculating paired mapping quality + +------------------------------------------------------------------------ +r732 | lh3 | 2009-01-01 09:27:46 -0500 (Thu, 01 Jan 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + A /branches/prog/bwa/khash.h (from /branches/prog/sclib/khash/khash.h:675) + M /branches/prog/bwa/main.c + + * bwa-0.2.0-27 + * accelerate sampe by storing visited large intervals + +------------------------------------------------------------------------ +r731 | lh3 | 2009-01-01 06:51:21 -0500 (Thu, 01 Jan 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-26 + * remove the optimization codes + +------------------------------------------------------------------------ +r730 | lh3 | 2009-01-01 06:48:59 -0500 (Thu, 01 Jan 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-25 + * accelerate OCC calculation by ~7%. However, it seems not worth doing + this by complicating the codes. I will change back later.
+ +------------------------------------------------------------------------ +r729 | lh3 | 2008-12-31 16:43:56 -0500 (Wed, 31 Dec 2008) | 6 lines +Changed paths: + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-24 + * change command "sai2sam_pe" to "sampe" + * print usage for sampe command + * in sampe: change default max_occ to 1000 + * fixed a few compiling warnings in bntseq.c + +------------------------------------------------------------------------ +r728 | lh3 | 2008-12-27 07:14:59 -0500 (Sat, 27 Dec 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-22 + * mating information can be printed to SAM + +------------------------------------------------------------------------ +r727 | lh3 | 2008-12-26 18:10:59 -0500 (Fri, 26 Dec 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwaseqio.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + + * bwa-0.2.0-21 + * implement pairing (still UNFINISHED) + * output all reads even if full of N + +------------------------------------------------------------------------ +r726 | lh3 | 2008-12-26 13:31:27 -0500 (Fri, 26 Dec 2008) | 5 lines +Changed paths: + M /branches/prog/bwa/Makefile + A /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + + * bwa-0.2.0-20 + * remove "-t" from aln cmd + * code clean up: move some functions in bwt2fmv.c to other source files + * added sai2sam_pe cmd: *UNFINISHED* + +------------------------------------------------------------------------ +r725 | lh3 | 2008-12-26 07:04:11 -0500 (Fri, 26 Dec 2008) | 3 lines +Changed 
paths: + M /branches/prog/bwa/Makefile + A /branches/prog/bwa/bwase.c + A /branches/prog/bwa/bwaseqio.c + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/kseq.h + A /branches/prog/bwa/ksort.h (from /branches/prog/sclib/ksort/ksort.h:712) + A /branches/prog/bwa/kvec.h (from /branches/prog/sclib/kvec/kvec.h:537) + M /branches/prog/bwa/main.c + + * bwa-0.2.0-19 + * considerable code cleanup; no actual changes + +------------------------------------------------------------------------ +r724 | lh3 | 2008-12-25 11:32:11 -0500 (Thu, 25 Dec 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + + * bwa-0.2.0-18 + * generate SAM output + +------------------------------------------------------------------------ +r723 | lh3 | 2008-12-25 10:48:31 -0500 (Thu, 25 Dec 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + + * bwa-0.2.0-17 + * remove bwtsw2 related codes + * separate searching for SA interval from generating alignments + +------------------------------------------------------------------------ +r722 | lh3 | 2008-12-25 08:57:13 -0500 (Thu, 25 Dec 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwt2fmv.c + D /branches/prog/bwa/bwt_lite.c + D /branches/prog/bwa/bwt_lite.h + M /branches/prog/bwa/bwtgap.c + D /branches/prog/bwa/bwtsw2.h + D /branches/prog/bwa/bwtsw2_aux.c + D /branches/prog/bwa/bwtsw2_core.c + D /branches/prog/bwa/bwtsw2_main.c + D /branches/prog/bwa/khash.h + D /branches/prog/bwa/ksort.h + D /branches/prog/bwa/kvec.h + M /branches/prog/bwa/main.c + + * added interface to "aln -t" + * remove bwtsw2 related codes + +------------------------------------------------------------------------ +r666 | lh3 | 2008-11-18 18:34:29 -0500 (Tue, 18 Nov 
2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + + * bwa-0.2.0-16 + * allow to set max mismatches based on read length, but I do not know + whether this really works + +------------------------------------------------------------------------ +r665 | lh3 | 2008-11-18 08:34:03 -0500 (Tue, 18 Nov 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-15 + * fixed a bug in sequence parser. + +------------------------------------------------------------------------ +r612 | lh3 | 2008-10-28 06:50:53 -0400 (Tue, 28 Oct 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/bwtindex.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/utils.c + + * bwa-0.2.0-14 + * fixed a bug caused by the change of the FASTA/Q parser + +------------------------------------------------------------------------ +r611 | lh3 | 2008-10-28 06:24:56 -0400 (Tue, 28 Oct 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/bntseq.h + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtsw2_core.c + A /branches/prog/bwa/kseq.h + D /branches/prog/bwa/seq.c + D /branches/prog/bwa/seq.h + M /branches/prog/bwa/simple_dp.c + M /branches/prog/bwa/utils.c + M /branches/prog/bwa/utils.h + +replace seq.* with kseq.h + +------------------------------------------------------------------------ +r610 | lh3 | 2008-10-27 13:00:04 -0400 (Mon, 27 Oct 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-13 + * make bwtsw2 output sub-optimal hits. 
not completed + +------------------------------------------------------------------------ +r609 | lh3 | 2008-10-24 16:52:00 -0400 (Fri, 24 Oct 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/kvec.h + +little... + +------------------------------------------------------------------------ +r532 | lh3 | 2008-09-19 05:28:45 -0400 (Fri, 19 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/khash.h + +improve interface of khash + +------------------------------------------------------------------------ +r531 | lh3 | 2008-09-18 06:52:59 -0400 (Thu, 18 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + +improve minor things, which make bwtsw2 slower, but should miss less true hits + +------------------------------------------------------------------------ +r530 | lh3 | 2008-09-17 18:19:26 -0400 (Wed, 17 Sep 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + + * fixed a bug in calculating ->D + * enforce band-width checking + +------------------------------------------------------------------------ +r529 | lh3 | 2008-09-17 18:06:49 -0400 (Wed, 17 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + +delete a line of code that is never visited + +------------------------------------------------------------------------ +r528 | lh3 | 2008-09-17 17:58:51 -0400 (Wed, 17 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + +a bit code clean up + +------------------------------------------------------------------------ +r527 | lh3 | 2008-09-17 10:55:45 -0400 (Wed, 17 Sep 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-12 + * max-depth can be set, although it does not help the speed at all + +------------------------------------------------------------------------ +r526 | lh3 | 2008-09-16 17:59:36 -0400 (Tue, 16 Sep 2008) | 2 lines +Changed 
paths: + M /branches/prog/bwa/bwtsw2_core.c + +cut_tail after remove duplicate + +------------------------------------------------------------------------ +r525 | lh3 | 2008-09-16 17:56:11 -0400 (Tue, 16 Sep 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/khash.h + M /branches/prog/bwa/main.c + + * bwa-0.2.0-11 + * improved cut_tail() + +------------------------------------------------------------------------ +r524 | lh3 | 2008-09-15 16:53:22 -0400 (Mon, 15 Sep 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-10 + * fixed a bug in cut_tail() + +------------------------------------------------------------------------ +r518 | lh3 | 2008-09-15 04:35:59 -0400 (Mon, 15 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + +a bit code clean up + +------------------------------------------------------------------------ +r517 | lh3 | 2008-09-14 18:18:11 -0400 (Sun, 14 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + +improve speed (<1%) + +------------------------------------------------------------------------ +r516 | lh3 | 2008-09-14 18:08:55 -0400 (Sun, 14 Sep 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + + * fixed two potential bugs, although I have not seen their effects + * improve speed a bit (<2%) + +------------------------------------------------------------------------ +r515 | lh3 | 2008-09-14 17:26:49 -0400 (Sun, 14 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + +nothing, really + +------------------------------------------------------------------------ +r514 | lh3 | 2008-09-14 17:10:13 -0400 (Sun, 14 Sep 2008) | 2 lines +Changed paths: + M 
/branches/prog/bwa/bwtsw2_core.c + +disable X-drop, which has to be reimplemented in the current algorithm + +------------------------------------------------------------------------ +r513 | lh3 | 2008-09-14 16:49:42 -0400 (Sun, 14 Sep 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwt_lite.c + M /branches/prog/bwa/bwt_lite.h + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + + * temporarily disable cut_tail() + * calculate SA in bwt_lite.c + * fixed a bug in reversing the sequence + +------------------------------------------------------------------------ +r512 | lh3 | 2008-09-13 17:35:40 -0400 (Sat, 13 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + A /branches/prog/bwa/ksort.h + +n-best method + +------------------------------------------------------------------------ +r507 | lh3 | 2008-09-13 09:06:54 -0400 (Sat, 13 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwtsw2_core.c + +give correct result again + +------------------------------------------------------------------------ +r506 | lh3 | 2008-09-13 08:12:07 -0400 (Sat, 13 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + +I think I know the reason. It needs more work... 
+ +------------------------------------------------------------------------ +r505 | lh3 | 2008-09-13 06:20:43 -0400 (Sat, 13 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwtsw2_core.c + +fixed another bug, but still have + +------------------------------------------------------------------------ +r504 | lh3 | 2008-09-12 18:13:37 -0400 (Fri, 12 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + +fixed another bug + +------------------------------------------------------------------------ +r503 | lh3 | 2008-09-12 17:15:56 -0400 (Fri, 12 Sep 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/khash.h + + * do not segfault, but the result is WRONG! + * prepare to remove bsw2_connectivity_check() + +------------------------------------------------------------------------ +r502 | lh3 | 2008-09-12 15:52:41 -0400 (Fri, 12 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/kvec.h + +more revisions + +------------------------------------------------------------------------ +r501 | lh3 | 2008-09-11 18:06:15 -0400 (Thu, 11 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + +further simplify codes with kvec.h + +------------------------------------------------------------------------ +r500 | lh3 | 2008-09-11 17:42:15 -0400 (Thu, 11 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + +part of revisions...
have not finished + +------------------------------------------------------------------------ +r499 | lh3 | 2008-09-11 17:24:15 -0400 (Thu, 11 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/khash.h + A /branches/prog/bwa/kvec.h + +prepare for abrupt change + +------------------------------------------------------------------------ +r496 | lh3 | 2008-09-11 10:34:38 -0400 (Thu, 11 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + +fixed a bug; now "bwtsw2 -d" is useless + +------------------------------------------------------------------------ +r495 | lh3 | 2008-09-11 09:22:03 -0400 (Thu, 11 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/simple_dp.c + M /branches/prog/bwa/stdaln.c + M /branches/prog/bwa/stdaln.h + +improve speed a little bit + +------------------------------------------------------------------------ +r494 | lh3 | 2008-09-11 08:28:08 -0400 (Thu, 11 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + +remove debug codes + +------------------------------------------------------------------------ +r493 | lh3 | 2008-09-11 07:49:53 -0400 (Thu, 11 Sep 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + + * improve the speed a little bit (<5%) + * prepare to remove BSW_DEBUG + +------------------------------------------------------------------------ +r492 | lh3 | 2008-09-11 06:15:56 -0400 (Thu, 11 Sep 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-9 + * support reverse strand + * fixed a bug that causes missing hits + +------------------------------------------------------------------------ +r491 | lh3 | 2008-09-11 05:46:16 -0400 (Thu, 11 Sep 2008) | 3 lines 
+Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-8 + * better progress report + +------------------------------------------------------------------------ +r490 | lh3 | 2008-09-10 17:04:49 -0400 (Wed, 10 Sep 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-7 + * avoid some missing hits + * add maximum depth + +------------------------------------------------------------------------ +r489 | lh3 | 2008-09-10 11:51:13 -0400 (Wed, 10 Sep 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-6 + * bwtsw2 works although on the forward strand only for now + * better progress information + +------------------------------------------------------------------------ +r488 | lh3 | 2008-09-10 10:21:53 -0400 (Wed, 10 Sep 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + + * implement memory pool + * avoid some rehashing + +------------------------------------------------------------------------ +r487 | lh3 | 2008-09-10 09:23:38 -0400 (Wed, 10 Sep 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_main.c + + * fixed a memory leak + * prepare to implement mempool + +------------------------------------------------------------------------ +r486 | lh3 | 2008-09-10 09:10:09 -0400 (Wed, 10 Sep 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/khash.h + + * add X-dropoff + * remove duplicated results + * switch to simple stack + +------------------------------------------------------------------------ +r485 | lh3 | 2008-09-10 06:31:20 -0400 (Wed, 10 Sep 2008) | 3 lines 
+Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + + * check whether t-node has been visited + * prepare to remove two-level stack + +------------------------------------------------------------------------ +r484 | lh3 | 2008-09-10 05:00:57 -0400 (Wed, 10 Sep 2008) | 2 lines +Changed paths: + A /branches/prog/bwa/khash.h + +khash library + +------------------------------------------------------------------------ +r483 | lh3 | 2008-09-10 04:22:53 -0400 (Wed, 10 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + +add inline + +------------------------------------------------------------------------ +r482 | lh3 | 2008-09-09 16:34:57 -0400 (Tue, 09 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + +improve speed + +------------------------------------------------------------------------ +r481 | lh3 | 2008-09-09 13:13:00 -0400 (Tue, 09 Sep 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + +Use a 128bit hash table to keep all (tk,tl,qk,ql). This is slow. Just +keep a copy in case I may need this in future. 
+ + +------------------------------------------------------------------------ +r480 | lh3 | 2008-09-09 12:53:32 -0400 (Tue, 09 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_core.c + + * no principal modification + +------------------------------------------------------------------------ +r479 | lh3 | 2008-09-09 11:01:45 -0400 (Tue, 09 Sep 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwtsw2_core.c + + * fixed a bug which may cause duplicated matching + * accelerate the speed a bit, although using hash in avoiding duplications + slows the speed down in the end + +------------------------------------------------------------------------ +r474 | lh3 | 2008-09-03 17:22:57 -0400 (Wed, 03 Sep 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-5 + * indel seems to work on toy example + * add band + +------------------------------------------------------------------------ +r469 | lh3 | 2008-09-01 09:18:45 -0400 (Mon, 01 Sep 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwt_lite.c + M /branches/prog/bwa/bwt_lite.h + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/bwtsw2.h + A /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/is.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + M /branches/prog/bwa/simple_dp.c + + * bwa-0.2.0-4 + * updated bwtsw2, which seems to work properly on toy examples + +------------------------------------------------------------------------ +r447 | lh3 | 2008-08-27 10:05:09 -0400 (Wed, 27 Aug 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M 
/branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-3 + * tune for longer gaps, but it does not really work with kilo-bp gaps... + +------------------------------------------------------------------------ +r446 | lh3 | 2008-08-26 13:30:41 -0400 (Tue, 26 Aug 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-2 + * changed the way to extend long deletions. Now use max_del_occ. + +------------------------------------------------------------------------ +r445 | lh3 | 2008-08-26 13:05:58 -0400 (Tue, 26 Aug 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwt_lite.c + M /branches/prog/bwa/bwt_lite.h + +updated from bwtsw2_lite + +------------------------------------------------------------------------ +r436 | lh3 | 2008-08-23 12:28:44 -0400 (Sat, 23 Aug 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwt.h + A /branches/prog/bwa/bwt_lite.c + A /branches/prog/bwa/bwt_lite.h + A /branches/prog/bwa/bwtsw2.h + A /branches/prog/bwa/bwtsw2_core.c + A /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-1 + * add bwt_lite: a light-weighted version of bwt (NOT TESTED!) + * add core codes for bwtsw2: NOT TESTED!!! 
+ +------------------------------------------------------------------------ +r427 | lh3 | 2008-08-15 05:38:12 -0400 (Fri, 15 Aug 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + +Release bwa-0.2.0 + +------------------------------------------------------------------------ +r426 | lh3 | 2008-08-14 11:26:19 -0400 (Thu, 14 Aug 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + + * bwa-0.1.6-7 + * change default seed length to 31 + * add incomplete support to color sequences (not tested yet!) + +------------------------------------------------------------------------ +r425 | lh3 | 2008-08-14 06:23:11 -0400 (Thu, 14 Aug 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.1.6-6 + * change default seed length to 33bp + +------------------------------------------------------------------------ +r424 | lh3 | 2008-08-14 05:55:33 -0400 (Thu, 14 Aug 2008) | 6 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/main.c + + * bwa-0.1.6-5 + * fixed a bug that may miss true alignments. This bug exists in most + early versions. + * fixed a bug that yields wrong coordinates for reads mapped on the forward + strands with gaps. + +------------------------------------------------------------------------ +r423 | lh3 | 2008-08-14 04:07:28 -0400 (Thu, 14 Aug 2008) | 2 lines +Changed paths: + D /branches/prog/bwa/Makefile.div + +useless + +------------------------------------------------------------------------ +r422 | lh3 | 2008-08-13 19:21:14 -0400 (Wed, 13 Aug 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.1.6-4 + * fixed one bug + * there is another one...
+ +------------------------------------------------------------------------ +r421 | lh3 | 2008-08-13 18:23:33 -0400 (Wed, 13 Aug 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/bwtgap.h + M /branches/prog/bwa/bwtindex.c + M /branches/prog/bwa/main.c + + * bwa-0.1.6-3 + * almost there, but not quite right + +------------------------------------------------------------------------ +r419 | lh3 | 2008-08-13 17:27:02 -0400 (Wed, 13 Aug 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/bwtgap.h + M /branches/prog/bwa/main.c + + * improve the seeding method + * prepare to load two BWTs into memory. A BIG change! + +------------------------------------------------------------------------ +r418 | lh3 | 2008-08-13 10:56:54 -0400 (Wed, 13 Aug 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/bwtgap.h + M /branches/prog/bwa/main.c + + * added seeding + * unfinished yet + +------------------------------------------------------------------------ +r413 | lh3 | 2008-08-08 11:48:35 -0400 (Fri, 08 Aug 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/main.c + +Release bwa-0.1.6 + +------------------------------------------------------------------------ +r410 | lh3 | 2008-08-06 15:48:22 -0400 (Wed, 06 Aug 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/simple_dp.c + +sw: output alignment score + +------------------------------------------------------------------------ +r407 | lh3 | 2008-08-04 10:01:20 -0400 (Mon, 04 Aug 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + A /branches/prog/bwa/simple_dp.c + M 
/branches/prog/bwa/stdaln.c + M /branches/prog/bwa/stdaln.h + + * bwa-0.1.5-3 + * added a simple interface to SW/NW alignment + * stdaln-0.9.8 (see header for more details) + +------------------------------------------------------------------------ +r406 | lh3 | 2008-08-01 19:21:59 -0400 (Fri, 01 Aug 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + A /branches/prog/bwa/stdaln.c + A /branches/prog/bwa/stdaln.h + + * bwa-0.1.5-2 + * give accurate gap positions + +------------------------------------------------------------------------ +r405 | lh3 | 2008-08-01 19:06:19 -0400 (Fri, 01 Aug 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + +unfinished, but I am tired... + +------------------------------------------------------------------------ +r401 | lh3 | 2008-07-30 05:59:24 -0400 (Wed, 30 Jul 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/main.c + + * bwa-0.1.5-1 + * fixed a potential bug which may produce an alignment in N regions, + although extremely rare. 
+ +------------------------------------------------------------------------ +r399 | lh3 | 2008-07-27 11:41:52 -0400 (Sun, 27 Jul 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/main.c + +Release bwa-0.1.5 + +------------------------------------------------------------------------ +r398 | lh3 | 2008-07-25 12:14:47 -0400 (Fri, 25 Jul 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + +update documentation + +------------------------------------------------------------------------ +r397 | lh3 | 2008-07-25 09:58:56 -0400 (Fri, 25 Jul 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * + +------------------------------------------------------------------------ +r396 | lh3 | 2008-07-25 06:42:01 -0400 (Fri, 25 Jul 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.1.4-4 + * add timer for debugging + +------------------------------------------------------------------------ +r395 | lh3 | 2008-07-24 05:46:21 -0400 (Thu, 24 Jul 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/main.c + + * bwa-0.1.4-3 + * fixed a bug in the previous code + * this version gives identical result to bwa-0.1.4, just 10% faster + +------------------------------------------------------------------------ +r394 | lh3 | 2008-07-24 05:18:53 -0400 (Thu, 24 Jul 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/bwtgap.h + M /branches/prog/bwa/main.c + + * bwa-0.1.4-2 + * further improve the speed + * The result is slightly different from bwa-0.1.4 now. I need to check... 
+ +------------------------------------------------------------------------ +r393 | lh3 | 2008-07-23 12:04:16 -0400 (Wed, 23 Jul 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwt.c + +comments only + +------------------------------------------------------------------------ +r392 | lh3 | 2008-07-23 10:34:03 -0400 (Wed, 23 Jul 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/main.c + +further improve the speed in Occ functions + +------------------------------------------------------------------------ +r386 | lh3 | 2008-07-22 10:03:54 -0400 (Tue, 22 Jul 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/main.c + +Release bwa-0.1.4 + +------------------------------------------------------------------------ +r385 | lh3 | 2008-07-22 09:44:50 -0400 (Tue, 22 Jul 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/bwa.1 + +update documentation and ChangeLog + +------------------------------------------------------------------------ +r384 | lh3 | 2008-07-22 08:50:03 -0400 (Tue, 22 Jul 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/main.c + + * bwa-0.1.3-2 + * fixed the bug in the last modification + * now the alignment should be more clearly defined + +------------------------------------------------------------------------ +r383 | lh3 | 2008-07-21 18:32:21 -0400 (Mon, 21 Jul 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/main.c + + * bwa-0.1.3-1 + * this is a buggy version! + * i will fix the bug tomorrow. It is late... 
+ +------------------------------------------------------------------------ +r381 | lh3 | 2008-07-21 06:45:32 -0400 (Mon, 21 Jul 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/main.c + +Release bwa-0.1.3 + +------------------------------------------------------------------------ +r380 | lh3 | 2008-07-21 06:07:43 -0400 (Mon, 21 Jul 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/main.c + + * bwa-0.1.2-3 + * improve the speed for gcc on Intel Mac OS X, but not really on icc on Linux + * aln: more command-line options + +------------------------------------------------------------------------ +r373 | lh3 | 2008-07-17 09:09:46 -0400 (Thu, 17 Jul 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt.h + M /branches/prog/bwa/bwtio.c + M /branches/prog/bwa/main.c + + * bwa-0.1.2-2 + * further improve the speed + * this version gives exactly the same result as bwa-0.1.2 + +------------------------------------------------------------------------ +r372 | lh3 | 2008-07-17 07:51:08 -0400 (Thu, 17 Jul 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/main.c + + * bwa-0.1.2-1 + * speed up by about 5% + +------------------------------------------------------------------------ +r370 | lh3 | 2008-07-17 05:12:00 -0400 (Thu, 17 Jul 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/main.c + +Release bwa-0.1.2 + +------------------------------------------------------------------------ +r368 | lh3 | 2008-07-16 08:51:25 -0400 (Wed, 16 Jul 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/Makefile + D /branches/prog/bwa/bwt1away.c + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M 
/branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/bwtgap.h + D /branches/prog/bwa/bwttop2.c + M /branches/prog/bwa/main.c + + * bwa-0.1.1-9 + * some code cleanup + * remove 1away and top2 + +------------------------------------------------------------------------ +r367 | lh3 | 2008-07-16 08:24:34 -0400 (Wed, 16 Jul 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/is.c + +Yuta Mori's implementation of IS algorithm. + +------------------------------------------------------------------------ +r365 | lh3 | 2008-07-16 06:58:04 -0400 (Wed, 16 Jul 2008) | 6 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/bwtgap.h + M /branches/prog/bwa/main.c + + * bwa-0.1.1-8 + * improve gapped alignment + * this version will miss more gapped alignments, but the speed is much faster + * prepare to remove top2 and 1away algorithms + * prepare to add SAIS algorithm for bwt construction + +------------------------------------------------------------------------ +r358 | lh3 | 2008-06-09 06:03:04 -0400 (Mon, 09 Jun 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/main.c + + * bwa-0.1.1-7 + * change END_SKIP from 3 to 5, but still gaps may be wrongly added + * change default '-g' from 5 to 3 + +------------------------------------------------------------------------ +r357 | lh3 | 2008-06-09 05:18:36 -0400 (Mon, 09 Jun 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/main.c + + * bwa-0.1.1-6 + * fix a bug in nested stack + +------------------------------------------------------------------------ +r356 | lh3 | 2008-06-08 18:43:13 -0400 (Sun, 08 Jun 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtgap.c + A /branches/prog/bwa/bwtgap.h + M 
/branches/prog/bwa/main.c + + * bwa-0.1.1-5 + * replace heap with nested stacks + * there are still obvious bugs... + +------------------------------------------------------------------------ +r355 | lh3 | 2008-06-08 17:13:44 -0400 (Sun, 08 Jun 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + + * bwa-0.1.1-4 + * add interface to affine gap alignment + * there are obvious bugs and I will fix them later + +------------------------------------------------------------------------ +r354 | lh3 | 2008-06-08 15:39:05 -0400 (Sun, 08 Jun 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/main.c + + * bwa-0.1.1-3 + * affine gap seems to work, at least partially + +------------------------------------------------------------------------ +r353 | lh3 | 2008-06-08 09:27:18 -0400 (Sun, 08 Jun 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + A /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/bwttop2.c + M /branches/prog/bwa/main.c + + * bwa-0.1.1-2 + * initial gapped alignment. 
not work at the moment + +------------------------------------------------------------------------ +r352 | lh3 | 2008-06-06 04:37:34 -0400 (Fri, 06 Jun 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwttop2.c + M /branches/prog/bwa/main.c + + * bwa-0.1.1-1 + * ungap: remove a useless variable in top2_entry_t + +------------------------------------------------------------------------ +r348 | lh3 | 2008-06-03 09:04:12 -0400 (Tue, 03 Jun 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + A /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/main.c + +Release bwa-0.1.1 + +------------------------------------------------------------------------ +r347 | lh3 | 2008-06-03 05:45:08 -0400 (Tue, 03 Jun 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + +update documentation + +------------------------------------------------------------------------ +r346 | lh3 | 2008-06-02 18:59:50 -0400 (Mon, 02 Jun 2008) | 5 lines +Changed paths: + A /branches/prog/bwa/ChangeLog + A /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.1.0-11 + * improve approximating mapping qualities + * add documentation + * add ChangeLog + +------------------------------------------------------------------------ +r345 | lh3 | 2008-06-02 16:04:39 -0400 (Mon, 02 Jun 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwttop2.c + M /branches/prog/bwa/main.c + + * bwa-0.1.0-10 + * output a random position for repetitive reads + +------------------------------------------------------------------------ +r344 | lh3 | 2008-06-02 15:03:54 -0400 (Mon, 02 Jun 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/pac2bwt.c + + * bwa-0.1.0-9 + * fix memory leaks + * fix a potential bug in converting to the real coordinate + 
+------------------------------------------------------------------------ +r343 | lh3 | 2008-06-02 13:44:51 -0400 (Mon, 02 Jun 2008) | 5 lines +Changed paths: + M /branches/prog/bwa/Makefile.div + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt.h + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwttop2.c + M /branches/prog/bwa/main.c + + * bwa-0.1.0-8 + * fix a bug about strand + * update Makefile.div + * change top2b as the default method + +------------------------------------------------------------------------ +r342 | lh3 | 2008-06-02 11:23:26 -0400 (Mon, 02 Jun 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt1away.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.1.0-7 + * use bwt_2occ() and bwt_2occ4() in other functions + +------------------------------------------------------------------------ +r341 | lh3 | 2008-06-02 09:31:39 -0400 (Mon, 02 Jun 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwttop2.c + M /branches/prog/bwa/main.c + + * bwa-0.1.0-6 + * fix a bug for missing hits + +------------------------------------------------------------------------ +r340 | lh3 | 2008-06-02 09:10:18 -0400 (Mon, 02 Jun 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwttop2.c + M /branches/prog/bwa/main.c + + * bwa-0.1.0-5 + * accelerate comparisons in heap, a bit + +------------------------------------------------------------------------ +r339 | lh3 | 2008-06-02 08:41:31 -0400 (Mon, 02 Jun 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt.h + M /branches/prog/bwa/bwttop2.c + M /branches/prog/bwa/main.c + + * bwa-0.1.0-4 + * avoid marginal repeated calculation in occ + +------------------------------------------------------------------------ +r338 | lh3 | 2008-06-02 06:46:51 -0400 (Mon, 02 Jun 2008) | 5 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwt.c + M 
/branches/prog/bwa/bwttop2.c + M /branches/prog/bwa/main.c + + * bwa-0.1.0-3 + * fix a bug caused by previous change + * fix a bug in heap + * order the heap by more criteria + +------------------------------------------------------------------------ +r337 | lh3 | 2008-06-01 19:11:15 -0400 (Sun, 01 Jun 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwttop2.c + M /branches/prog/bwa/main.c + + * bwa-0.1.0-2 + * also sort sa range in heapsort, in attempt to improve cache performance. + Unfortunately, it does not work well at all. + +------------------------------------------------------------------------ +r336 | lh3 | 2008-06-01 17:45:23 -0400 (Sun, 01 Jun 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/Makefile.div + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/main.c + + * 0.1.0-1 + * fix a bug in calculating the real coordinate + +------------------------------------------------------------------------ +r335 | lh3 | 2008-06-01 16:03:09 -0400 (Sun, 01 Jun 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/Makefile + +nothing, really + +------------------------------------------------------------------------ +r334 | lh3 | 2008-06-01 15:59:13 -0400 (Sun, 01 Jun 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/Makefile + A /branches/prog/bwa/Makefile.div + M /branches/prog/bwa/bwtindex.c + M /branches/prog/bwa/pac2bwt.c + +use IS algorithm by default + +------------------------------------------------------------------------ +r333 | lh3 | 2008-06-01 15:05:15 -0400 (Sun, 01 Jun 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwtindex.c + M /branches/prog/bwa/is.c + M /branches/prog/bwa/pac2bwt.c + + * a bit code clean up in is.c + * add IS algorithm for constructing BWT, albeit slower + +------------------------------------------------------------------------ +r332 | lh3 | 2008-06-01 13:23:08 -0400 (Sun, 01 Jun 2008) | 2 lines +Changed paths: + A 
/branches/prog/bwa/is.c + +IS linear-time algorithm for constructing SA/BWT + +------------------------------------------------------------------------ +r331 | lh3 | 2008-06-01 10:35:26 -0400 (Sun, 01 Jun 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bntseq.c + A /branches/prog/bwa/bwtindex.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + + * fix a bug in generating .pac + * index in one go + +------------------------------------------------------------------------ +r330 | lh3 | 2008-06-01 09:17:05 -0400 (Sun, 01 Jun 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/bntseq.h + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwttop2.c + +real coordinates can be output + +------------------------------------------------------------------------ +r329 | lh3 | 2008-05-31 19:21:02 -0400 (Sat, 31 May 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwttop2.c + +add top2e which is similar to 1away + +------------------------------------------------------------------------ +r328 | lh3 | 2008-05-31 18:46:12 -0400 (Sat, 31 May 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwttop2.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + + * unified cmd-line interface for ungapped alignment + * add two alternatives to top2 algorithm + +------------------------------------------------------------------------ +r327 | lh3 | 2008-05-31 18:14:46 -0400 (Sat, 31 May 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + +add cmd-line interface to alntop2 + +------------------------------------------------------------------------ +r326 | lh3 | 2008-05-31 17:59:31 -0400 (Sat, 
31 May 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwt1away.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + A /branches/prog/bwa/bwttop2.c + +top2 algorithm seems to work. I need to change interface, though + +------------------------------------------------------------------------ +r325 | lh3 | 2008-05-31 15:11:49 -0400 (Sat, 31 May 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwt1away.c + +change the variable in the structure + +------------------------------------------------------------------------ +r324 | lh3 | 2008-05-31 14:52:13 -0400 (Sat, 31 May 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwt1away.c + +set a slightly better bound on the maximum allowed mismatches + +------------------------------------------------------------------------ +r323 | lh3 | 2008-05-30 18:40:21 -0400 (Fri, 30 May 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + + * output time statistics + +------------------------------------------------------------------------ +r322 | lh3 | 2008-05-30 17:58:25 -0400 (Fri, 30 May 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt.h + A /branches/prog/bwa/bwt1away.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + + * presumably better way to make use of prefix. But for the moment I do + not know whether it is correct or not. 
+ * a bit code clean up: separate alignment part + +------------------------------------------------------------------------ +r321 | lh3 | 2008-05-30 13:57:43 -0400 (Fri, 30 May 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt.h + M /branches/prog/bwa/bwt_gen/Makefile + M /branches/prog/bwa/bwt_gen/bwt_gen.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + M /branches/prog/bwa/pac2bwt.c + + * a bit code clean up + * put bwt_gen in bwa + +------------------------------------------------------------------------ +r320 | lh3 | 2008-05-30 11:40:11 -0400 (Fri, 30 May 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtio.c + + * improve cmd-line interface + * fix a bug in loading .sa + * change default sa interval to 32 + +------------------------------------------------------------------------ +r319 | lh3 | 2008-05-30 10:31:37 -0400 (Fri, 30 May 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + + * fix memory leak (I know that. 
Just a bit lazy) + * change to another method to do 1-away alignment + +------------------------------------------------------------------------ +r318 | lh3 | 2008-05-30 09:21:49 -0400 (Fri, 30 May 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt.h + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + +best unique match is partially finished + +------------------------------------------------------------------------ +r317 | lh3 | 2008-05-30 06:33:28 -0400 (Fri, 30 May 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + +remove "ungapped" command and related codes + +------------------------------------------------------------------------ +r316 | lh3 | 2008-05-30 06:05:20 -0400 (Fri, 30 May 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + +change variable name thick to width + +------------------------------------------------------------------------ +r315 | lh3 | 2008-05-29 19:06:13 -0400 (Thu, 29 May 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt.h + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtio.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + M /branches/prog/bwa/pac2bwt.c + +revised algorithm for ungapped alignment. the old one can still be used. 
+ +------------------------------------------------------------------------ +r314 | lh3 | 2008-05-29 16:36:11 -0400 (Thu, 29 May 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwt_gen/bwt_gen.c + M /branches/prog/bwa/bwtio.c + M /branches/prog/bwa/pac2bwt.c + + * make commands more independent, but ungapped does not work at the moment + +------------------------------------------------------------------------ +r313 | lh3 | 2008-05-29 15:56:14 -0400 (Thu, 29 May 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwt_gen/bwt_gen.c + +little... + +------------------------------------------------------------------------ +r312 | lh3 | 2008-05-29 15:54:01 -0400 (Thu, 29 May 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwt_gen/bwt_gen.c + M /branches/prog/bwa/bwt_gen/bwt_gen.h + + * add CopyRight information from the original codes + * do not dump .fmv files + +------------------------------------------------------------------------ +r311 | lh3 | 2008-05-29 15:44:36 -0400 (Thu, 29 May 2008) | 2 lines +Changed paths: + A /branches/prog/bwa/bwt_gen + A /branches/prog/bwa/bwt_gen/Makefile + A /branches/prog/bwa/bwt_gen/QSufSort.c + A /branches/prog/bwa/bwt_gen/QSufSort.h + A /branches/prog/bwa/bwt_gen/bwt_gen.c + A /branches/prog/bwa/bwt_gen/bwt_gen.h + +codes from BWT-SW, for building BWT from packed file + +------------------------------------------------------------------------ +r310 | lh3 | 2008-05-28 17:03:35 -0400 (Wed, 28 May 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt.h + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtio.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + + * change OCC_INTERVAL to 0x40, which makes bwa twice as fast. 
+ * write Occ file as ".occ" as it is using a different interval from + .fmv, the BWT-SW correspondence of .occ + +------------------------------------------------------------------------ +r309 | lh3 | 2008-05-28 11:39:37 -0400 (Wed, 28 May 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt2fmv.c + +fix a bug + +------------------------------------------------------------------------ +r308 | lh3 | 2008-05-28 09:56:16 -0400 (Wed, 28 May 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt2fmv.c + +add heuristics to improve the speed, but I have not tested whether the +results are correct or not. + + +------------------------------------------------------------------------ +r307 | lh3 | 2008-05-28 06:31:34 -0400 (Wed, 28 May 2008) | 5 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + + * make ungapped alignment basically works... + * but it is very slow in comparison to others... + * also I need to improve the interface... + * a lot of things to keep me busy today... + +------------------------------------------------------------------------ +r306 | lh3 | 2008-05-27 18:41:27 -0400 (Tue, 27 May 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt.h + M /branches/prog/bwa/bwtaln.c + + * remove recursion + * fixed a bug in bwt_occ() + +------------------------------------------------------------------------ +r305 | lh3 | 2008-05-27 16:59:44 -0400 (Tue, 27 May 2008) | 5 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt.h + M /branches/prog/bwa/bwtaln.c + + * bwa now tells whether a sequence can be mapped with maximum allowed + mismatches. ONLY ungapped. + * this is a recursive version. I will remove recursion later. 
+ + +------------------------------------------------------------------------ +r304 | lh3 | 2008-05-27 09:12:17 -0400 (Tue, 27 May 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt.h + M /branches/prog/bwa/bwt2fmv.c + A /branches/prog/bwa/bwtaln.c + A /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtio.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + M /branches/prog/bwa/utils.c + + * load .sa and .fmv files + * exact alignment now works + +------------------------------------------------------------------------ +r303 | lh3 | 2008-05-27 06:33:38 -0400 (Tue, 27 May 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwtio.c + M /branches/prog/bwa/utils.c + M /branches/prog/bwa/utils.h + +add xassert and fix a bug + +------------------------------------------------------------------------ +r302 | lh3 | 2008-05-27 06:23:20 -0400 (Tue, 27 May 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwtio.c + A /branches/prog/bwa/utils.c + A /branches/prog/bwa/utils.h + +improve error message and error handling + +------------------------------------------------------------------------ +r301 | lh3 | 2008-05-27 05:37:51 -0400 (Tue, 27 May 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt.h + M /branches/prog/bwa/bwt2fmv.c + A /branches/prog/bwa/bwtio.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + + * move I/O codes to bwtio.c + * SA can be dumped and interestingly, it is identical to BWTSW + * now, .fmv is still different from BWTSW + +------------------------------------------------------------------------ +r299 | lh3 | 2008-05-26 18:07:44 -0400 (Mon, 26 May 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/Makefile + M 
/branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt.h + M /branches/prog/bwa/bwt2fmv.c + +generate/retrieve SA and Occ + +------------------------------------------------------------------------ +r298 | lh3 | 2008-05-26 13:16:49 -0400 (Mon, 26 May 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bntseq.h + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt.h + M /branches/prog/bwa/bwt2fmv.c + + * retrieve occ value at any position + * move bwt_cal_occ() to bwt.c + +------------------------------------------------------------------------ +r297 | lh3 | 2008-05-25 17:43:58 -0400 (Sun, 25 May 2008) | 6 lines +Changed paths: + M /branches/prog/bwa/Makefile + A /branches/prog/bwa/bwt.c + A /branches/prog/bwa/bwt.h + A /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + M /branches/prog/bwa/pac2bwt.c + + * add bwt2fmv. It works to some extent. However, I do not understand + the purpose of some weird codes in BWT-SW. As a consequence, bwt2fmv + could generate a file almost identical, but not exactly identical, to + the .fmv file from BWT-SW. + + +------------------------------------------------------------------------ +r296 | lh3 | 2008-05-24 18:35:02 -0400 (Sat, 24 May 2008) | 5 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/bntseq.h + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + A /branches/prog/bwa/pac2bwt.c + +Burrows-Wheeler Transform now works. At least on one example, the +current code generates the same BWT as BWT-SW. Kind of magical, I would +say. 
:) + + +------------------------------------------------------------------------ +r295 | lh3 | 2008-05-24 11:25:31 -0400 (Sat, 24 May 2008) | 3 lines +Changed paths: + A /branches/prog/bwa/Makefile + M /branches/prog/bwa/bntseq.c + A /branches/prog/bwa/main.c + A /branches/prog/bwa/main.h + + * add Makefile and main.* + * improve interface to fa2bns, a bit + +------------------------------------------------------------------------ +r293 | lh3 | 2008-05-24 10:57:03 -0400 (Sat, 24 May 2008) | 3 lines +Changed paths: + A /branches/prog/bwa + A /branches/prog/bwa/bntseq.c + A /branches/prog/bwa/bntseq.h + A /branches/prog/bwa/seq.c + A /branches/prog/bwa/seq.h + + * Burrow-Wheeler Alignment + * initial codes + +------------------------------------------------------------------------ diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/Makefile --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/Makefile Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,49 @@ +CC= gcc +CXX= g++ +CFLAGS= -g -Wall -O2 +CXXFLAGS= $(CFLAGS) +AR= ar +DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64 +LOBJS= bwa.o bamlite.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o bntseq.o stdaln.o \ + bwaseqio.o bwase.o kstring.o +AOBJS= QSufSort.o bwt_gen.o \ + is.o bwtmisc.o bwtindex.o ksw.o simple_dp.o \ + bwape.o cs2nt.o \ + bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ + bwtsw2_chain.o fastmap.o bwtsw2_pair.o +PROG= bwa +INCLUDES= +LIBS= -lm -lz -lpthread +SUBDIRS= . + +.SUFFIXES:.c .o .cc + +.c.o: + $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@ +.cc.o: + $(CXX) -c $(CXXFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@ + +all:$(PROG) + +bwa:libbwa.a $(AOBJS) main.o + $(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ -L. 
-lbwa $(LIBS) + +libbwa.a:$(LOBJS) + $(AR) -csru $@ $(LOBJS) + +bwa.o:bwa.h + +QSufSort.o:QSufSort.h + +bwt.o:bwt.h +bwtio.o:bwt.h +bwtaln.o:bwt.h bwtaln.h kseq.h +bntseq.o:bntseq.h +bwtgap.o:bwtgap.h bwtaln.h bwt.h + +bwtsw2_core.o:bwtsw2.h bwt.h bwt_lite.h stdaln.h +bwtsw2_aux.o:bwtsw2.h bwt.h bwt_lite.h stdaln.h +bwtsw2_main.o:bwtsw2.h + +clean: + rm -f gmon.out *.o a.out $(PROG) *~ *.a diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/NEWS --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/NEWS Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,658 @@ +Release 0.6.2 (19 June, 2012) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This is largely a bug-fix release. Notable changes in BWA-short and BWA-SW: + + * Bugfix: BWA-SW may give bad alignments due to incorrect band width. + + * Bugfix: A segmentation fault due to an out-of-boundary error. The fix is a + temporary solution. The real cause has not been identified. + + * Attempt to read index from prefix.64.bwt, such that the 32-bit and 64-bit + index can coexist. + + * Added options '-I' and '-S' to control BWA-SW pairing. + +(0.6.2: 19 June 2012, r126) + + + +Release 0.6.1 (28 November, 2011) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes to BWA-short: + + * Bugfix: duplicated alternative hits in the XA tag. + + * Bugfix: when trimming enabled, bwa-aln trims 1bp less. + + * Disabled the color-space alignment. 0.6.x is not working with SOLiD reads at + present. + +Notable changes to BWA-SW: + + * Bugfix: segfault due to excessive ambiguous bases. + + * Bugfix: incorrect mate position in the SE mode. + + * Bugfix: rare segfault in the PE mode + + * When macro _NO_SSE2 is in use, fall back to the standard Smith-Waterman + instead of SSE2-SW. + + * Optionally mark split hits with lower alignment scores as secondary. + +Changes to fastmap: + + * Bugfix: infinite loop caused by ambiguous bases. + + * Optionally output the query sequence. 
+ +(0.6.1: 28 November 2011, r104) + + + +Release 0.5.10 and 0.6.0 (12 November, 2011) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The 0.6.0 release comes with two major changes. Firstly, the index data +structure has been changed to support genomes longer than 4GB. The forward and +reverse backward genome is now integrated in one index. This change speeds up +BWA-short by about 20% and BWA-SW by 90% with the mapping accuracy largely +unchanged. A tradeoff is BWA requires more memory, but this is the price almost +all mappers that index the genome have to pay. + +Secondly, BWA-SW in 0.6.0 now works with paired-end data. It is more accurate +for highly unique reads and more robust to long indels and structural +variations. However, BWA-short still has edges for reads with many suboptimal +hits. It is not yet known which algorithm is the best for variant calling. + +0.5.10 is a bugfix release only and is likely to be the last release in the 0.5 +branch unless I find critical bugs in future. + +Other notable changes: + + * Added the `fastmap' command that finds super-maximal exact matches. It does + not give the final alignment, but runs much faster. It can be a building + block for other alignment algorithms. [0.6.0 only] + + * Output the timing information before BWA exits. This also tells users that + the task has been finished instead of being killed or aborted. [0.6.0 only] + + * Sped up multi-threading when using many (>20) CPU cores. + + * Check I/O error. + + * Increased the maximum barcode length to 63bp. + + * Automatically choose the indexing algorithm. + + * Bugfix: very rare segfault due to an uninitialized variable. The bug also + affects the placement of suboptimal alignments. The effect is very minor. + +This release involves quite a lot of tricky changes. Although it has been +tested on a few data sets, subtle bugs may be still hidden. It is *NOT* +recommended to use this release in a production pipeline. 
In future, however, +BWA-SW may be better when reads continue to go longer. I would encourage users +to try the 0.6 release. I would also like to hear the users' experience. Thank +you. + +(0.6.0: 12 November 2011, r85) + + + +Beta Release 0.5.9 (24 January, 2011) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes: + + * Feature: barcode support via the `-B' option. + + * Feature: Illumina 1.3+ read format support via the `-I' option. + + * Bugfix: RG tags are not attached to unmapped reads. + + * Bugfix: very rare bwasw mismappings + + * Recommend options for PacBio reads in bwasw help message. + + +Also, since January 13, the BWA master repository has been moved to github: + + https://github.com/lh3/bwa + +The revision number has been reset. All recent changes will be first +committed to this repository. + +(0.5.9: 24 January 2011, r16) + + + +Beta Release Candidate 0.5.9rc1 (10 December, 2010) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes in bwasw: + + * Output unmapped reads. + + * For a repetitive read, choose a random hit instead of a fixed + one. This is not well tested. + +Notable changes in bwa-short: + + * Fixed a bug in the SW scoring system, which may lead to unexpected + gaps towards the end of a read. + + * Fixed a bug which invalidates the randomness of repetitive reads. + + * Fixed a rare memory leak. + + * Allowed to specify the read group at the command line. + + * Take name-grouped BAM files as input. + +Changes to this release are usually safe in that they do not interfere +with the key functionality. However, the release has only been tested on +small samples instead of on large-scale real data. If anything weird +happens, please report the bugs to the bio-bwa-help mailing list. + +(0.5.9rc1: 10 December 2010, r1561) + + + +Beta Release 0.5.8 (8 June, 2010) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes in bwasw: + + * Fixed an issue of missing alignments. 
This should happen rarely and + only when the contig/read alignment is multi-part. Very rarely, bwasw + may still miss a segment in a multi-part alignment. This is difficult + to fix, although possible. + +Notable changes in bwa-short: + + * Discard the SW alignment when the best single-end alignment is much + better. Such a SW alignment may be caused by structural variations and + forcing it to be aligned leads to false alignment. This fix has not + been tested thoroughly. It would be great to receive more user + feedback on this issue. + + * Fixed a typo/bug in sampe which leads to unnecessarily large memory + usage in some cases. + + * Further reduced the chance of reporting `weird pairing'. + +(0.5.8: 8 June 2010, r1442) + + + +Beta Release 0.5.7 (1 March, 2010) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This release only has an effect on paired-end data with fat insert-size +distribution. Users are still recommended to update as the new release +improves the robustness to poor data. + + * The fix for `weird pairing' was not working in version 0.5.6, pointed + out by Carol Scott. It should work now. + + * Optionally output to a normal file rather than to stdout (by Tim + Fennel). + +(0.5.7: 1 March 2010, r1310) + + + +Beta Release 0.5.6 (10 February, 2010) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes in bwa-short: + + * Report multiple hits in the SAM format at a new tag XA encoded as: + (chr,pos,CIGAR,NM;)*. By default, if a paired or single-end read has + 4 or fewer hits, they will all be reported; if a read in an anomalous + pair has 11 or fewer hits, all of them will be reported. + + * Perform Smith-Waterman alignment also for anomalous read pairs when + both ends have quality higher than 17. This reduces false positives + for some SV discovery algorithms. + + * Do not report "weird pairing" when the insert size distribution is + too fat or has a mean close to zero. + + * If a read is bridging two adjacent chromosomes, flag it as unmapped. 
+ + * Fixed a small but long existing memory leak in paired-end mapping. + + * Multiple bug fixes in SOLiD mapping: a) quality "-1" can be correctly + parsed by solid2fastq.pl; b) truncated quality string is resolved; c) + SOLiD read mapped to the reverse strand is complemented. + + * Bwa now calculates skewness and kurtosis of the insert size + distribution. + + * Deploy a Bayesian method to estimate the maximum distance for a read + pair considered to be paired properly. The method is proposed by + Gerton Lunter, but bwa only implements a simplified version. + + * Export more functions for Java bindings, by Matt Hanna (See: + http://www.broadinstitute.org/gsa/wiki/index.php/Sting_BWA/C_bindings) + + * Abstract bwa CIGAR for further extension, by Rodrigo Goya. + +(0.5.6: 10 February 2010, r1303) + + + +Beta Release 0.5.5 (10 November, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This is a bug fix release: + + * Fixed a serious bug/typo in aln which does not occur given short + reads, but will lead to segfault for >500bp reads. Of course, the aln + command is not recommended for reads longer than 200bp, but this is a + bug anyway. + + * Fixed a minor bug/typo which leads to incorrect single-end mapping + quality when one end is moved to meet the mate-pair requirement. + + * Fixed a bug in samse for mapping in the color space. This bug is + caused by quality filtration added since 0.5.1. + +(0.5.5: 10 November 2009, r1273) + + + +Beta Release 0.5.4 (9 October, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Since this version, the default seed length used in the "aln" command is +changed to 32. + +Notable changes in bwa-short: + + * Added a new tag "XC:i" which gives the length of clipped reads. + + * In sampe, skip alignments in case of a bug in the Smith-Waterman + alignment module. + + * In sampe, fixed a bug in pairing when the read sequence is identical + to its reverse complement. 
+ + * In sampe, optionally preload the entire FM-index into memory to + reduce disk operations. + +Notable changes in dBWT-SW/BWA-SW: + + * Changed name dBWT-SW to BWA-SW. + + * Optionally use "hard clipping" in the SAM output. + +(0.5.4: 9 October 2009, r1245) + + + +Beta Release 0.5.3 (15 September, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Fixed a critical bug in bwa-short: reads mapped to the reverse strand +are not complemented. + +(0.5.3: 15 September 2009, r1225) + + + +Beta Release 0.5.2 (13 September, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes in bwa-short: + + * Optionally trim reads before alignment. See the manual page on `aln + -q' for detailed description. + + * Fixed a bug in calculating the NM tag for a gapped alignment. + + * Fixed a bug given a mixture of reads with some longer than the seed + length and some shorter. + + * Print SAM header. + +Notable changes in dBWT-SW: + + * Changed the default value of -T to 30. As a result, the accuracy is a + little higher for short reads at the cost of speed. + +(0.5.2: 13 September 2009, r1223) + + + +Beta Release 0.5.1 (2 September, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes in the short read alignment component: + + * Fixed a bug in samse: do not write mate coordinates. + +Notable changes in dBWT-SW: + + * Randomly choose one alignment if the read is a repetitive. + + * Fixed a flaw when a read is mapped across two adjacent reference + sequences. However, wrong alignment reports may still occur rarely in + this case. + + * Changed the default band width to 50. The speed is slower due to this + change. + + * Improved the mapping quality a little given long query sequences. + +(0.5.1: 2 September 2009, r1209) + + + +Beta Release 0.5.0 (20 August, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This release implements a novel algorithm, dBWT-SW, specifically +designed for long reads. 
It is 10-50 times faster than SSAHA2, depending +on the characteristics of the input data, and achieves comparable +alignment accuracy while allowing chimera detection. In comparison to +BLAT, dBWT-SW is several times faster and much more accurate especially +when the error rate is high. Please read the manual page for more +information. + +The dBWT-SW algorithm is kind of developed for future sequencing +technologies which produce much longer reads with a little higher error +rate. It is still at its early development stage. Some features are +missing and it may be buggy although I have evaluated on several +simulated and real data sets. But following the "release early" +paradigm, I would like the users to try it first. + +Other notable changes in BWA are: + + * Fixed a rare bug in the Smith-Waterman alignment module. + + * Fixed a rare bug about the wrong alignment coordinate when a read is + poorly aligned. + + * Fixed a bug in generating the "mate-unmap" SAM tag when both ends in + a pair are unmapped. + +(0.5.0: 20 August 2009, r1200) + + + +Beta Release 0.4.9 (19 May, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Interestingly, the integer overflow bug claimed to be fixed in 0.4.7 has +not in fact. Now I have fixed the bug. Sorry for this and thank Quan +Long for pointing out the bug (again). + +(0.4.9: 19 May 2009, r1075) + + + +Beta Release 0.4.8 (18 May, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +One change to "aln -R". Now by default, if there are no more than `-R' +equally best hits, bwa will search for suboptimal hits. This change +affects the ability in finding SNPs in segmental duplications. + +I have not tested this option thoroughly, but this simple change is less +likely to cause new bugs. Hope I am right. + +(0.4.8: 18 May 2009, r1073) + + + +Beta Release 0.4.7 (12 May, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes: + + * Output SM (single-end mapping quality) and AM (smaller mapping + quality among the two ends) tag from sam output. 
+ + * Improved the functionality of stdsw. + + * Made the XN tag more accurate. + + * Fixed a very rare segfault caused by integer overflow. + + * Improved the insert size estimation. + + * Fixed compiling errors for some Linux systems. + +(0.4.7: 12 May 2009, r1066) + + + +Beta Release 0.4.6 (9 March, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This release improves the SOLiD support. First, a script for converting +SOLiD raw data is provided. This script is adapted from solid2fastq.pl +in the MAQ package. Second, a nucleotide reference file can be directly +used with `bwa index'. Third, SOLiD paired-end support is +completed. Fourth, color-space reads will be converted to nucleotides +when SAM output is generated. Color errors are corrected in this +process. Please note that like MAQ, BWA cannot make use of the primer +base and the first color. + +In addition, the calculation of mapping quality is also improved a +little bit, although end-users may barely observe the difference. + +(0.4.6: 9 March 2009, r915) + + + +Beta Release 0.4.5 (18 February, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Not much happened, but I think it would be good to let the users use the +latest version. + +Notable changes (Thanks to Bob Handsaker for catching the two bugs): + + * Improved boundary check. Previous version may still give incorrect + alignment coordinates in rare cases. + + * Fixed a bug in SW alignment when no residue matches. This only + affects the `sampe' command. + + * Robustly estimate insert size without setting the maximum on the + command line. Since this release `sampe -a' only has an effect if + there are not enough good pairs to infer the insert size + distribution. + + * Reduced false PE alignments a little bit by using the inferred insert + size distribution. This fix may be more important for long insert + size libraries. 
+ +(0.4.5: 18 February 2009, r829) + + + +Beta Release 0.4.4 (15 February, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This is mainly a bug fix release. Notable changes are: + + * Imposed boundary check for extracting subsequence from the + genome. Previously this caused memory problems in rare cases. + + * Fixed a bug in failing to find whether an alignment overlaps with + N on the genome. + + * Changed MD tag to meet the latest SAM specification. + +(0.4.4: 15 February 2009, r815) + + + +Beta Release 0.4.3 (22 January, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes: + + * Treat an ambiguous base N as a mismatch. Previous versions will not + map reads containing any N. + + * Automatically choose the maximum allowed number of differences. This + is important when reads of different lengths are mixed together. + + * Print mate coordinate if only one end is unmapped. + + * Generate MD tag. This tag encodes the mismatching positions and the + reference bases at these positions. Deletions from the reference will + also be printed. + + * Optionally dump multiple hits from samse, in another concise format + rather than SAM. + + * Optionally disable iterative search. This is VERY SLOOOOW, though. + + * Fixed a bug in generating SAM. + +(0.4.3: 22 January 2009, r787) + + + +Beta Release 0.4.2 (9 January, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Aaron Quinlan found a bug in the indexer: the bwa indexer segfaults if +there are no comment texts in the FASTA header. This is a critical +bug. Nothing else was changed. + +(0.4.2: 9 January 2009, r769) + + + +Beta Release 0.4.1 (7 January, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +I am sorry for the quick updates these days. I like to set a milestone +for BWA and this release seems to be. For paired end reads, BWA also +does Smith-Waterman alignment for an unmapped read whose mate can be +mapped confidently. With this strategy BWA achieves similar accuracy to +maq. 
Benchmark is also updated accordingly. + +(0.4.1: 7 January 2009, r760) + + + +Beta Release 0.4.0 (6 January, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In comparison to the release two days ago, this release is mainly tuned +for performance with some tricks I learnt from Bowtie. However, as the +indexing format has also been changed, I have to increase the version +number to 0.4.0 to emphasize that *DATABASE MUST BE RE-INDEXED* with +`bwa index'. + + * Improved the speed by about 20%. + + * Added multi-threading to `bwa aln'. + +(0.4.0: 6 January 2009, r756) + + + +Beta Release 0.3.0 (4 January, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + * Added paired-end support by separating SA calculation and alignment + output. + + * Added SAM output. + + * Added evaluation to the documentation. + +(0.3.0: 4 January 2009, r741) + + + +Beta Release 0.2.0 (15 August, 2008) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + * Take the subsequence at the 5'-end as seed. Seeding strategy greatly + improves the speed for long reads, at the cost of missing a few true + hits that contain many differences in the seed. Seeding also increases + the memory by 800MB. + + * Fixed a bug which may miss some gapped alignments. Fixing the bug + also slows the speed a little. + +(0.2.0: 15 August 2008, r428) + + + +Beta Release 0.1.6 (08 August, 2008) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + * Give accurate CIGAR string. + + * Add a simple interface to SW/NW alignment + +(0.1.6: 08 August 2008, r414) + + + +Beta Release 0.1.5 (27 July, 2008) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + * Improve the speed. This version is expected to give the same results. + +(0.1.5: 27 July 2008, r400) + + + +Beta Release 0.1.4 (22 July, 2008) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + * Fixed a bug which may cause missing gapped alignments. + + * More clearly define what alignments can be found by BWA (See + manual). Now BWA runs a little slower because it will visit more + potential gapped alignments. 
+ + * A bit of code cleanup. + +(0.1.4: 22 July 2008, r387) + + + +Beta Release 0.1.3 (21 July, 2008) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Improve the speed with some tricks on retrieving occurrences. The results +should be exactly the same as that of 0.1.2. + +(0.1.3: 21 July 2008, r382) + + + +Beta Release 0.1.2 (17 July, 2008) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Support gapped alignment. Code for ungapped alignment has been removed. + +(0.1.2: 17 July 2008, r371) + + + +Beta Release 0.1.1 (03 June, 2008) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This is the first release of BWA, Burrows-Wheeler Alignment tool. Please +read man page for more information about this software. + +(0.1.1: 03 June 2008, r349) + + + diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/QSufSort.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/QSufSort.c Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,405 @@ +/* QSufSort.c + + Original source from qsufsort.c + + Copyright 1999, N. Jesper Larsson, all rights reserved. + + This file contains an implementation of the algorithm presented in "Faster + Suffix Sorting" by N. Jesper Larsson (jesper@cs.lth.se) and Kunihiko + Sadakane (sada@is.s.u-tokyo.ac.jp). + + This software may be used freely for any purpose. However, when distributed, + the original source must be clearly stated, and, when the source code is + distributed, the copyright notice must be retained and any alterations in + the code must be clearly marked. No warranty is given regarding the quality + of this software. 
+ + Modified by Wong Chi-Kwong, 2004 + + Changes summary: - Used long variable and function names + - Removed global variables + - Replace pointer references with array references + - Used insertion sort in place of selection sort and increased insertion sort threshold + - Reconstructing suffix array from inverse becomes an option + - Add handling where end-of-text symbol is not necessary < all characters + - Removed codes for supporting alphabet size > number of characters + + No warrenty is given regarding the quality of the modifications. + +*/ + + +#include +#include +#include +#include "QSufSort.h" + +#define min(value1, value2) ( ((value1) < (value2)) ? (value1) : (value2) ) +#define med3(a, b, c) ( ac ? b : a>c ? c : a)) +#define swap(a, b, t); t = a; a = b; b = t; + +// Static functions +static void QSufSortSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos, + const qsint_t highestPos, const qsint_t numSortedChar); +static qsint_t QSufSortChoosePivot(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos, + const qsint_t highestPos, const qsint_t numSortedChar); +static void QSufSortInsertSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos, + const qsint_t highestPos, const qsint_t numSortedChar); +static void QSufSortBucketSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t alphabetSize); +static qsint_t QSufSortTransform(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol, + const qsint_t smallestInputSymbol, const qsint_t maxNewAlphabetSize, qsint_t *numSymbolAggregated); + +/* Makes suffix array p of x. x becomes inverse of p. p and x are both of size + n+1. Contents of x[0...n-1] are integers in the range l...k-1. 
Original + contents of x[n] is disregarded, the n-th symbol being regarded as + end-of-string smaller than all other symbols.*/ +void QSufSortSuffixSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol, + const qsint_t smallestInputSymbol, const int skipTransform) +{ + qsint_t i, j; + qsint_t s, negatedSortedGroupLength; + qsint_t numSymbolAggregated; + qsint_t maxNumInputSymbol; + qsint_t numSortedPos = 1; + qsint_t newAlphabetSize; + + maxNumInputSymbol = largestInputSymbol - smallestInputSymbol + 1; + + if (!skipTransform) { + /* bucketing possible*/ + newAlphabetSize = QSufSortTransform(V, I, numChar, largestInputSymbol, smallestInputSymbol, + numChar, &numSymbolAggregated); + QSufSortBucketSort(V, I, numChar, newAlphabetSize); + I[0] = -1; + V[numChar] = 0; + numSortedPos = numSymbolAggregated; + } + + while ((qsint_t)(I[0]) >= -(qsint_t)numChar) { + i = 0; + negatedSortedGroupLength = 0; + do { + s = I[i]; + if (s < 0) { + i -= s; /* skip over sorted group.*/ + negatedSortedGroupLength += s; + } else { + if (negatedSortedGroupLength) { + I[i+negatedSortedGroupLength] = negatedSortedGroupLength; /* combine preceding sorted groups */ + negatedSortedGroupLength = 0; + } + j = V[s] + 1; + QSufSortSortSplit(V, I, i, j - 1, numSortedPos); + i = j; + } + } while (i <= numChar); + if (negatedSortedGroupLength) { + /* array ends with a sorted group.*/ + I[i+negatedSortedGroupLength] = negatedSortedGroupLength; /* combine sorted groups at end of I.*/ + } + numSortedPos *= 2; /* double sorted-depth.*/ + } +} + +void QSufSortGenerateSaFromInverse(const qsint_t* V, qsint_t* __restrict I, const qsint_t numChar) +{ + qsint_t i; + for (i=0; i<=numChar; i++) + I[V[i]] = i + 1; +} + +/* Sorting routine called for each unsorted group. Sorts the array of integers + (suffix numbers) of length n starting at p. 
The algorithm is a ternary-split + quicksort taken from Bentley & McIlroy, "Engineering a Sort Function", + Software -- Practice and Experience 23(11), 1249-1265 (November 1993). This + function is based on Program 7.*/ +static void QSufSortSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos, + const qsint_t highestPos, const qsint_t numSortedChar) { + + qsint_t a, b, c, d; + qsint_t l, m; + qsint_t f, v, s, t; + qsint_t tmp; + qsint_t numItem; + + numItem = highestPos - lowestPos + 1; + + if (numItem <= INSERT_SORT_NUM_ITEM) { + QSufSortInsertSortSplit(V, I, lowestPos, highestPos, numSortedChar); + return; + } + + v = QSufSortChoosePivot(V, I, lowestPos, highestPos, numSortedChar); + + a = b = lowestPos; + c = d = highestPos; + + while (1) { + while (c >= b && (f = KEY(V, I, b, numSortedChar)) <= v) { + if (f == v) { + swap(I[a], I[b], tmp); + a++; + } + b++; + } + while (c >= b && (f = KEY(V, I, c, numSortedChar)) >= v) { + if (f == v) { + swap(I[c], I[d], tmp); + d--; + } + c--; + } + if (b > c) + break; + swap(I[b], I[c], tmp); + b++; + c--; + } + + s = a - lowestPos; + t = b - a; + s = min(s, t); + for (l = lowestPos, m = b - s; m < b; l++, m++) { + swap(I[l], I[m], tmp); + } + + s = d - c; + t = highestPos - d; + s = min(s, t); + for (l = b, m = highestPos - s + 1; m <= highestPos; l++, m++) { + swap(I[l], I[m], tmp); + } + + s = b - a; + t = d - c; + if (s > 0) + QSufSortSortSplit(V, I, lowestPos, lowestPos + s - 1, numSortedChar); + + // Update group number for equal portion + a = lowestPos + s; + b = highestPos - t; + if (a == b) { + // Sorted group + V[I[a]] = a; + I[a] = -1; + } else { + // Unsorted group + for (c=a; c<=b; c++) + V[I[c]] = b; + } + + if (t > 0) + QSufSortSortSplit(V, I, highestPos - t + 1, highestPos, numSortedChar); + +} + +/* Algorithm by Bentley & McIlroy.*/ +static qsint_t QSufSortChoosePivot(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos, + const qsint_t highestPos, const qsint_t 
numSortedChar) { + + qsint_t m; + qsint_t keyl, keym, keyn; + qsint_t key1, key2, key3; + qsint_t s; + qsint_t numItem; + + numItem = highestPos - lowestPos + 1; + + m = lowestPos + numItem / 2; + + s = numItem / 8; + key1 = KEY(V, I, lowestPos, numSortedChar); + key2 = KEY(V, I, lowestPos+s, numSortedChar); + key3 = KEY(V, I, lowestPos+2*s, numSortedChar); + keyl = med3(key1, key2, key3); + key1 = KEY(V, I, m-s, numSortedChar); + key2 = KEY(V, I, m, numSortedChar); + key3 = KEY(V, I, m+s, numSortedChar); + keym = med3(key1, key2, key3); + key1 = KEY(V, I, highestPos-2*s, numSortedChar); + key2 = KEY(V, I, highestPos-s, numSortedChar); + key3 = KEY(V, I, highestPos, numSortedChar); + keyn = med3(key1, key2, key3); + + return med3(keyl, keym, keyn); + + +} + +/* Quadratic sorting method to use for small subarrays. */ +static void QSufSortInsertSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos, + const qsint_t highestPos, const qsint_t numSortedChar) +{ + qsint_t i, j; + qsint_t tmpKey, tmpPos; + qsint_t numItem; + qsint_t key[INSERT_SORT_NUM_ITEM], pos[INSERT_SORT_NUM_ITEM]; + qsint_t negativeSortedLength; + qsint_t groupNum; + + numItem = highestPos - lowestPos + 1; + + for (i=0; i0 && key[j-1] > tmpKey; j--) { + key[j] = key[j-1]; + pos[j] = pos[j-1]; + } + key[j] = tmpKey; + pos[j] = tmpPos; + } + + negativeSortedLength = -1; + + i = numItem - 1; + groupNum = highestPos; + while (i > 0) { + I[i+lowestPos] = pos[i]; + V[I[i+lowestPos]] = groupNum; + if (key[i-1] == key[i]) { + negativeSortedLength = 0; + } else { + if (negativeSortedLength < 0) + I[i+lowestPos] = negativeSortedLength; + groupNum = i + lowestPos - 1; + negativeSortedLength--; + } + i--; + } + + I[lowestPos] = pos[0]; + V[I[lowestPos]] = groupNum; + if (negativeSortedLength < 0) + I[lowestPos] = negativeSortedLength; +} + +/* Bucketsort for first iteration. + + Input: x[0...n-1] holds integers in the range 1...k-1, all of which appear + at least once. x[n] is 0. 
(This is the corresponding output of transform.) k + must be at most n+1. p is array of size n+1 whose contents are disregarded. + + Output: x is V and p is I after the initial sorting stage of the refined + suffix sorting algorithm.*/ + +static void QSufSortBucketSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t alphabetSize) +{ + qsint_t i, c; + qsint_t d; + qsint_t groupNum; + qsint_t currentIndex; + + // mark linked list empty + for (i=0; i0; i--) { + c = I[i-1]; + d = (qsint_t)(V[c]); + groupNum = currentIndex; + V[c] = groupNum; + if (d >= 0) { + I[currentIndex] = c; + while (d >= 0) { + c = d; + d = V[c]; + V[c] = groupNum; + currentIndex--; + I[currentIndex] = c; + } + } else { + // sorted group + I[currentIndex] = -1; + } + currentIndex--; + } +} + +/* Transforms the alphabet of x by attempting to aggregate several symbols into + one, while preserving the suffix order of x. The alphabet may also be + compacted, so that x on output comprises all integers of the new alphabet + with no skipped numbers. + + Input: x is an array of size n+1 whose first n elements are positive + integers in the range l...k-1. p is array of size n+1, used for temporary + storage. q controls aggregation and compaction by defining the maximum intue + for any symbol during transformation: q must be at least k-l; if q<=n, + compaction is guaranteed; if k-l>n, compaction is never done; if q is + INT_MAX, the maximum number of symbols are aggregated into one. + + Output: Returns an integer j in the range 1...q representing the size of the + new alphabet. If j<=n+1, the alphabet is compacted. The global variable r is + set to the number of old symbols grouped into one. 
Only x[n] is 0.*/ +static qsint_t QSufSortTransform(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol, + const qsint_t smallestInputSymbol, const qsint_t maxNewAlphabetSize, qsint_t *numSymbolAggregated) +{ + qsint_t c, i, j; + qsint_t a; // numSymbolAggregated + qsint_t mask; + qsint_t minSymbolInChunk = 0, maxSymbolInChunk = 0; + qsint_t newAlphabetSize; + qsint_t maxNumInputSymbol, maxNumBit, maxSymbol; + + maxNumInputSymbol = largestInputSymbol - smallestInputSymbol + 1; + + for (maxNumBit = 0, i = maxNumInputSymbol; i; i >>= 1) ++maxNumBit; + maxSymbol = QSINT_MAX >> maxNumBit; + + c = maxNumInputSymbol; + for (a = 0; a < numChar && maxSymbolInChunk <= maxSymbol && c <= maxNewAlphabetSize; a++) { + minSymbolInChunk = (minSymbolInChunk << maxNumBit) | (V[a] - smallestInputSymbol + 1); + maxSymbolInChunk = c; + c = (maxSymbolInChunk << maxNumBit) | maxNumInputSymbol; + } + + mask = (1 << (a-1) * maxNumBit) - 1; /* mask masks off top old symbol from chunk.*/ + V[numChar] = smallestInputSymbol - 1; /* emulate zero terminator.*/ + + /* bucketing possible, compact alphabet.*/ + for (i=0; i<=maxSymbolInChunk; i++) + I[i] = 0; /* zero transformation table.*/ + c = minSymbolInChunk; + for (i=a; i<=numChar; i++) { + I[c] = 1; /* mark used chunk symbol.*/ + c = ((c & mask) << maxNumBit) | (V[i] - smallestInputSymbol + 1); /* shift in next old symbol in chunk.*/ + } + for (i=1; i number of characters + + No warrenty is given regarding the quality of the modifications. 
+ +*/ + +#ifndef __QSUFSORT_H__ +#define __QSUFSORT_H__ + +#include + +#define KEY(V, I, p, h) ( V[ I[p] + h ] ) +#define INSERT_SORT_NUM_ITEM 16 + +typedef int64_t qsint_t; +#define QSINT_MAX INT64_MAX + +void QSufSortSuffixSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol, + const qsint_t smallestInputSymbol, const int skipTransform); +void QSufSortGenerateSaFromInverse(const qsint_t *V, qsint_t* __restrict I, const qsint_t numChar); + + +#endif diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/README --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/README Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,36 @@ +Released packages can be downloaded from SourceForge.net: + + http://sourceforge.net/projects/bio-bwa/files/ + +Introduction and FAQ are available at: + + http://bio-bwa.sourceforge.net + +Manual page at: + + http://bio-bwa.sourceforge.net/bwa.shtml + +Mailing list: + + bio-bwa-help@lists.sourceforge.net + +To sign up: + + http://sourceforge.net/mail/?group_id=276243 + +Publications (Open Access): + + http://www.ncbi.nlm.nih.gov/pubmed/20080505 + http://www.ncbi.nlm.nih.gov/pubmed/19451168 + +Incomplete list of citations (via HubMed.org): + + http://www.hubmed.org/references.cgi?uids=20080505 + http://www.hubmed.org/references.cgi?uids=19451168 + +Related projects: + + http://pbwa.sourceforge.net/ + http://www.many-core.group.cam.ac.uk/projects/lam.shtml + http://biodoop-seal.sourceforge.net/ + http://gitorious.org/bwa-cuda diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/bamlite.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/bamlite.c Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,155 @@ +#include +#include +#include +#include +#include "bamlite.h" + +/********************* + * from bam_endian.c * + *********************/ + +static inline int bam_is_big_endian() +{ + long one= 1; + return !(*((char *)(&one))); +} +static inline uint16_t bam_swap_endian_2(uint16_t v) +{ + return 
(uint16_t)(((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8)); +} +static inline void *bam_swap_endian_2p(void *x) +{ + *(uint16_t*)x = bam_swap_endian_2(*(uint16_t*)x); + return x; +} +static inline uint32_t bam_swap_endian_4(uint32_t v) +{ + v = ((v & 0x0000FFFFU) << 16) | (v >> 16); + return ((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8); +} +static inline void *bam_swap_endian_4p(void *x) +{ + *(uint32_t*)x = bam_swap_endian_4(*(uint32_t*)x); + return x; +} +static inline uint64_t bam_swap_endian_8(uint64_t v) +{ + v = ((v & 0x00000000FFFFFFFFLLU) << 32) | (v >> 32); + v = ((v & 0x0000FFFF0000FFFFLLU) << 16) | ((v & 0xFFFF0000FFFF0000LLU) >> 16); + return ((v & 0x00FF00FF00FF00FFLLU) << 8) | ((v & 0xFF00FF00FF00FF00LLU) >> 8); +} +static inline void *bam_swap_endian_8p(void *x) +{ + *(uint64_t*)x = bam_swap_endian_8(*(uint64_t*)x); + return x; +} + +/************** + * from bam.c * + **************/ + +int bam_is_be; + +bam_header_t *bam_header_init() +{ + bam_is_be = bam_is_big_endian(); + return (bam_header_t*)calloc(1, sizeof(bam_header_t)); +} + +void bam_header_destroy(bam_header_t *header) +{ + int32_t i; + if (header == 0) return; + if (header->target_name) { + for (i = 0; i < header->n_targets; ++i) + free(header->target_name[i]); + free(header->target_name); + free(header->target_len); + } + free(header->text); + free(header); +} + +bam_header_t *bam_header_read(bamFile fp) +{ + bam_header_t *header; + char buf[4]; + int magic_len; + int32_t i = 1, name_len; + // read "BAM1" + magic_len = bam_read(fp, buf, 4); + if (magic_len != 4 || strncmp(buf, "BAM\001", 4) != 0) { + fprintf(stderr, "[bam_header_read] invalid BAM binary header (this is not a BAM file).\n"); + return 0; + } + header = bam_header_init(); + // read plain text and the number of reference sequences + bam_read(fp, &header->l_text, 4); + if (bam_is_be) bam_swap_endian_4p(&header->l_text); + header->text = (char*)calloc(header->l_text + 1, 1); + bam_read(fp, header->text, 
header->l_text); + bam_read(fp, &header->n_targets, 4); + if (bam_is_be) bam_swap_endian_4p(&header->n_targets); + // read reference sequence names and lengths + header->target_name = (char**)calloc(header->n_targets, sizeof(char*)); + header->target_len = (uint32_t*)calloc(header->n_targets, 4); + for (i = 0; i != header->n_targets; ++i) { + bam_read(fp, &name_len, 4); + if (bam_is_be) bam_swap_endian_4p(&name_len); + header->target_name[i] = (char*)calloc(name_len, 1); + bam_read(fp, header->target_name[i], name_len); + bam_read(fp, &header->target_len[i], 4); + if (bam_is_be) bam_swap_endian_4p(&header->target_len[i]); + } + return header; +} + +static void swap_endian_data(const bam1_core_t *c, int data_len, uint8_t *data) +{ + uint8_t *s; + uint32_t i, *cigar = (uint32_t*)(data + c->l_qname); + s = data + c->n_cigar*4 + c->l_qname + c->l_qseq + (c->l_qseq + 1)/2; + for (i = 0; i < c->n_cigar; ++i) bam_swap_endian_4p(&cigar[i]); + while (s < data + data_len) { + uint8_t type; + s += 2; // skip key + type = toupper(*s); ++s; // skip type + if (type == 'C' || type == 'A') ++s; + else if (type == 'S') { bam_swap_endian_2p(s); s += 2; } + else if (type == 'I' || type == 'F') { bam_swap_endian_4p(s); s += 4; } + else if (type == 'D') { bam_swap_endian_8p(s); s += 8; } + else if (type == 'Z' || type == 'H') { while (*s) ++s; ++s; } + } +} + +int bam_read1(bamFile fp, bam1_t *b) +{ + bam1_core_t *c = &b->core; + int32_t block_len, ret, i; + uint32_t x[8]; + + if ((ret = bam_read(fp, &block_len, 4)) != 4) { + if (ret == 0) return -1; // normal end-of-file + else return -2; // truncated + } + if (bam_read(fp, x, sizeof(bam1_core_t)) != sizeof(bam1_core_t)) return -3; + if (bam_is_be) { + bam_swap_endian_4p(&block_len); + for (i = 0; i < 8; ++i) bam_swap_endian_4p(x + i); + } + c->tid = x[0]; c->pos = x[1]; + c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff; + c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff; + c->l_qseq = x[4]; + c->mtid = x[5]; c->mpos = 
x[6]; c->isize = x[7]; + b->data_len = block_len - sizeof(bam1_core_t); + if (b->m_data < b->data_len) { + b->m_data = b->data_len; + kroundup32(b->m_data); + b->data = (uint8_t*)realloc(b->data, b->m_data); + } + if (bam_read(fp, b->data, b->data_len) != b->data_len) return -4; + b->l_aux = b->data_len - c->n_cigar * 4 - c->l_qname - c->l_qseq - (c->l_qseq+1)/2; + if (bam_is_be) swap_endian_data(c, b->data_len, b->data); + return 4 + block_len; +} diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/bamlite.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/bamlite.h Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,94 @@ +#ifndef BAMLITE_H_ +#define BAMLITE_H_ + +#include +#include + +typedef gzFile bamFile; +#define bam_open(fn, mode) gzopen(fn, mode) +#define bam_dopen(fd, mode) gzdopen(fd, mode) +#define bam_close(fp) gzclose(fp) +#define bam_read(fp, buf, size) gzread(fp, buf, size) + +typedef struct { + int32_t n_targets; + char **target_name; + uint32_t *target_len; + size_t l_text, n_text; + char *text; +} bam_header_t; + +#define BAM_FPAIRED 1 +#define BAM_FPROPER_PAIR 2 +#define BAM_FUNMAP 4 +#define BAM_FMUNMAP 8 +#define BAM_FREVERSE 16 +#define BAM_FMREVERSE 32 +#define BAM_FREAD1 64 +#define BAM_FREAD2 128 +#define BAM_FSECONDARY 256 +#define BAM_FQCFAIL 512 +#define BAM_FDUP 1024 + +#define BAM_CIGAR_SHIFT 4 +#define BAM_CIGAR_MASK ((1 << BAM_CIGAR_SHIFT) - 1) + +#define BAM_CMATCH 0 +#define BAM_CINS 1 +#define BAM_CDEL 2 +#define BAM_CREF_SKIP 3 +#define BAM_CSOFT_CLIP 4 +#define BAM_CHARD_CLIP 5 +#define BAM_CPAD 6 + +typedef struct { + int32_t tid; + int32_t pos; + uint32_t bin:16, qual:8, l_qname:8; + uint32_t flag:16, n_cigar:16; + int32_t l_qseq; + int32_t mtid; + int32_t mpos; + int32_t isize; +} bam1_core_t; + +typedef struct { + bam1_core_t core; + int l_aux, data_len, m_data; + uint8_t *data; +} bam1_t; + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + 
+#define bam1_strand(b) (((b)->core.flag&BAM_FREVERSE) != 0) +#define bam1_mstrand(b) (((b)->core.flag&BAM_FMREVERSE) != 0) +#define bam1_cigar(b) ((uint32_t*)((b)->data + (b)->core.l_qname)) +#define bam1_qname(b) ((char*)((b)->data)) +#define bam1_seq(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname) +#define bam1_qual(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (((b)->core.l_qseq + 1)>>1)) +#define bam1_seqi(s, i) ((s)[(i)/2] >> 4*(1-(i)%2) & 0xf) +#define bam1_aux(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (b)->core.l_qseq + ((b)->core.l_qseq + 1)/2) + +#define bam_init1() ((bam1_t*)calloc(1, sizeof(bam1_t))) +#define bam_destroy1(b) do { \ + if (b) { free((b)->data); free(b); } \ + } while (0) + +extern int bam_is_be; + +#ifdef __cplusplus +extern "C" { +#endif + + bam_header_t *bam_header_init(void); + void bam_header_destroy(bam_header_t *header); + bam_header_t *bam_header_read(bamFile fp); + int bam_read1(bamFile fp, bam1_t *b); + +#ifdef __cplusplus +} +#endif + +#endif diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/bntseq.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/bntseq.c Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,323 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li */ + +#include +#include +#include +#include +#include +#include "bntseq.h" +#include "main.h" +#include "utils.h" + +#include "kseq.h" +KSEQ_INIT(gzFile, gzread) + +unsigned char nst_nt4_table[256] = { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 +}; + +void bns_dump(const bntseq_t *bns, const char *prefix) +{ + char str[1024]; + FILE *fp; + int i; + { // dump .ann + strcpy(str, prefix); strcat(str, ".ann"); + fp = xopen(str, "w"); + fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->seed); + for (i = 0; i != bns->n_seqs; ++i) { + bntann1_t *p = bns->anns + i; + fprintf(fp, "%d %s", p->gi, p->name); + if (p->anno[0]) fprintf(fp, " %s\n", p->anno); + else fprintf(fp, "\n"); + fprintf(fp, "%lld %d %d\n", 
(long long)p->offset, p->len, p->n_ambs); + } + fclose(fp); + } + { // dump .amb + strcpy(str, prefix); strcat(str, ".amb"); + fp = xopen(str, "w"); + fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->n_holes); + for (i = 0; i != bns->n_holes; ++i) { + bntamb1_t *p = bns->ambs + i; + fprintf(fp, "%lld %d %c\n", (long long)p->offset, p->len, p->amb); + } + fclose(fp); + } +} + +bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename) +{ + char str[1024]; + FILE *fp; + bntseq_t *bns; + long long xx; + int i; + bns = (bntseq_t*)calloc(1, sizeof(bntseq_t)); + { // read .ann + fp = xopen(ann_filename, "r"); + fscanf(fp, "%lld%d%u", &xx, &bns->n_seqs, &bns->seed); + bns->l_pac = xx; + bns->anns = (bntann1_t*)calloc(bns->n_seqs, sizeof(bntann1_t)); + for (i = 0; i < bns->n_seqs; ++i) { + bntann1_t *p = bns->anns + i; + char *q = str; + int c; + // read gi and sequence name + fscanf(fp, "%u%s", &p->gi, str); + p->name = strdup(str); + // read fasta comments + while ((c = fgetc(fp)) != '\n' && c != EOF) *q++ = c; + *q = 0; + if (q - str > 1) p->anno = strdup(str + 1); // skip leading space + else p->anno = strdup(""); + // read the rest + fscanf(fp, "%lld%d%d", &xx, &p->len, &p->n_ambs); + p->offset = xx; + } + fclose(fp); + } + { // read .amb + int64_t l_pac; + int32_t n_seqs; + fp = xopen(amb_filename, "r"); + fscanf(fp, "%lld%d%d", &xx, &n_seqs, &bns->n_holes); + l_pac = xx; + xassert(l_pac == bns->l_pac && n_seqs == bns->n_seqs, "inconsistent .ann and .amb files."); + bns->ambs = (bntamb1_t*)calloc(bns->n_holes, sizeof(bntamb1_t)); + for (i = 0; i < bns->n_holes; ++i) { + bntamb1_t *p = bns->ambs + i; + fscanf(fp, "%lld%d%s", &xx, &p->len, str); + p->offset = xx; + p->amb = str[0]; + } + fclose(fp); + } + { // open .pac + bns->fp_pac = xopen(pac_filename, "rb"); + } + return bns; +} + +bntseq_t *bns_restore(const char *prefix) +{ + char ann_filename[1024], amb_filename[1024], pac_filename[1024]; + 
strcat(strcpy(ann_filename, prefix), ".ann"); + strcat(strcpy(amb_filename, prefix), ".amb"); + strcat(strcpy(pac_filename, prefix), ".pac"); + return bns_restore_core(ann_filename, amb_filename, pac_filename); +} + +void bns_destroy(bntseq_t *bns) +{ + if (bns == 0) return; + else { + int i; + if (bns->fp_pac) fclose(bns->fp_pac); + free(bns->ambs); + for (i = 0; i < bns->n_seqs; ++i) { + free(bns->anns[i].name); + free(bns->anns[i].anno); + } + free(bns->anns); + free(bns); + } +} + +#define _set_pac(pac, l, c) ((pac)[(l)>>2] |= (c)<<((~(l)&3)<<1)) +#define _get_pac(pac, l) ((pac)[(l)>>2]>>((~(l)&3)<<1)&3) + +static uint8_t *add1(const kseq_t *seq, bntseq_t *bns, uint8_t *pac, int64_t *m_pac, int *m_seqs, int *m_holes, bntamb1_t **q) +{ + bntann1_t *p; + int i, lasts; + if (bns->n_seqs == *m_seqs) { + *m_seqs <<= 1; + bns->anns = (bntann1_t*)realloc(bns->anns, *m_seqs * sizeof(bntann1_t)); + } + p = bns->anns + bns->n_seqs; + p->name = strdup((char*)seq->name.s); + p->anno = seq->comment.s? strdup((char*)seq->comment.s) : strdup("(null)"); + p->gi = 0; p->len = seq->seq.l; + p->offset = (bns->n_seqs == 0)? 
0 : (p-1)->offset + (p-1)->len; + p->n_ambs = 0; + for (i = lasts = 0; i < seq->seq.l; ++i) { + int c = nst_nt4_table[(int)seq->seq.s[i]]; + if (c >= 4) { // N + if (lasts == seq->seq.s[i]) { // contiguous N + ++(*q)->len; + } else { + if (bns->n_holes == *m_holes) { + (*m_holes) <<= 1; + bns->ambs = (bntamb1_t*)realloc(bns->ambs, (*m_holes) * sizeof(bntamb1_t)); + } + *q = bns->ambs + bns->n_holes; + (*q)->len = 1; + (*q)->offset = p->offset + i; + (*q)->amb = seq->seq.s[i]; + ++p->n_ambs; + ++bns->n_holes; + } + } + lasts = seq->seq.s[i]; + { // fill buffer + if (c >= 4) c = lrand48()&3; + if (bns->l_pac == *m_pac) { // double the pac size + *m_pac <<= 1; + pac = realloc(pac, *m_pac/4); + memset(pac + bns->l_pac/4, 0, (*m_pac - bns->l_pac)/4); + } + _set_pac(pac, bns->l_pac, c); + ++bns->l_pac; + } + } + ++bns->n_seqs; + return pac; +} + +int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only) +{ + extern void seq_reverse(int len, ubyte_t *seq, int is_comp); // in bwaseqio.c + kseq_t *seq; + char name[1024]; + bntseq_t *bns; + uint8_t *pac = 0; + int32_t m_seqs, m_holes; + int64_t ret = -1, m_pac, l; + bntamb1_t *q; + FILE *fp; + + // initialization + seq = kseq_init(fp_fa); + bns = (bntseq_t*)calloc(1, sizeof(bntseq_t)); + bns->seed = 11; // fixed seed for random generator + srand48(bns->seed); + m_seqs = m_holes = 8; m_pac = 0x10000; + bns->anns = (bntann1_t*)calloc(m_seqs, sizeof(bntann1_t)); + bns->ambs = (bntamb1_t*)calloc(m_holes, sizeof(bntamb1_t)); + pac = calloc(m_pac/4, 1); + q = bns->ambs; + strcpy(name, prefix); strcat(name, ".pac"); + fp = xopen(name, "wb"); + // read sequences + while (kseq_read(seq) >= 0) pac = add1(seq, bns, pac, &m_pac, &m_seqs, &m_holes, &q); + if (!for_only) { // add the reverse complemented sequence + m_pac = (bns->l_pac * 2 + 3) / 4 * 4; + pac = realloc(pac, m_pac/4); + memset(pac + (bns->l_pac+3)/4, 0, (m_pac - (bns->l_pac+3)/4*4) / 4); + for (l = bns->l_pac - 1; l >= 0; --l, ++bns->l_pac) + _set_pac(pac, 
bns->l_pac, 3-_get_pac(pac, l)); + } + ret = bns->l_pac; + { // finalize .pac file + ubyte_t ct; + fwrite(pac, 1, (bns->l_pac>>2) + ((bns->l_pac&3) == 0? 0 : 1), fp); + // the following codes make the pac file size always (l_pac/4+1+1) + if (bns->l_pac % 4 == 0) { + ct = 0; + fwrite(&ct, 1, 1, fp); + } + ct = bns->l_pac % 4; + fwrite(&ct, 1, 1, fp); + // close .pac file + fclose(fp); + } + bns_dump(bns, prefix); + bns_destroy(bns); + kseq_destroy(seq); + free(pac); + return ret; +} + +int bwa_fa2pac(int argc, char *argv[]) +{ + int c, for_only = 0; + gzFile fp; + while ((c = getopt(argc, argv, "f")) >= 0) { + switch (c) { + case 'f': for_only = 1; break; + } + } + if (argc == optind) { + fprintf(stderr, "Usage: bwa fa2pac [-f] []\n"); + return 1; + } + fp = xzopen(argv[optind], "r"); + bns_fasta2bntseq(fp, (optind+1 < argc)? argv[optind+1] : argv[optind], for_only); + gzclose(fp); + return 0; +} + +int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id) +{ + int left, mid, right, nn; + if (ref_id) { + left = 0; mid = 0; right = bns->n_seqs; + while (left < right) { + mid = (left + right) >> 1; + if (pos_f >= bns->anns[mid].offset) { + if (mid == bns->n_seqs - 1) break; + if (pos_f < bns->anns[mid+1].offset) break; // bracketed + left = mid + 1; + } else right = mid; + } + *ref_id = mid; + } + left = 0; right = bns->n_holes; nn = 0; + while (left < right) { + mid = (left + right) >> 1; + if (pos_f >= bns->ambs[mid].offset + bns->ambs[mid].len) left = mid + 1; + else if (pos_f + len <= bns->ambs[mid].offset) right = mid; + else { // overlap + if (pos_f >= bns->ambs[mid].offset) { + nn += bns->ambs[mid].offset + bns->ambs[mid].len < pos_f + len? + bns->ambs[mid].offset + bns->ambs[mid].len - pos_f : len; + } else { + nn += bns->ambs[mid].offset + bns->ambs[mid].len < pos_f + len? 
+ bns->ambs[mid].len : len - (bns->ambs[mid].offset - pos_f); + } + break; + } + } + return nn; +} diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/bntseq.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/bntseq.h Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,85 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
+*/ + +/* Contact: Heng Li */ + +#ifndef BWT_BNTSEQ_H +#define BWT_BNTSEQ_H + +#include +#include + +#ifndef BWA_UBYTE +#define BWA_UBYTE +typedef uint8_t ubyte_t; +#endif + +typedef struct { + int64_t offset; + int32_t len; + int32_t n_ambs; + uint32_t gi; + char *name, *anno; +} bntann1_t; + +typedef struct { + int64_t offset; + int32_t len; + char amb; +} bntamb1_t; + +typedef struct { + int64_t l_pac; + int32_t n_seqs; + uint32_t seed; + bntann1_t *anns; // n_seqs elements + int32_t n_holes; + bntamb1_t *ambs; // n_holes elements + FILE *fp_pac; +} bntseq_t; + +extern unsigned char nst_nt4_table[256]; + +#ifdef __cplusplus +extern "C" { +#endif + + void bns_dump(const bntseq_t *bns, const char *prefix); + bntseq_t *bns_restore(const char *prefix); + bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename); + void bns_destroy(bntseq_t *bns); + int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only); + int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id); + +#ifdef __cplusplus +} +#endif + +static inline int64_t bns_depos(const bntseq_t *bns, int64_t pos, int *is_rev) +{ + return (*is_rev = (pos >= bns->l_pac))? 
(bns->l_pac<<1) - 1 - pos : pos; +} + +#endif diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/bwa.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/bwa.1 Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,571 @@ +.TH bwa 1 "19 June 2012" "bwa-0.6.2" "Bioinformatics tools" +.SH NAME +.PP +bwa - Burrows-Wheeler Alignment Tool +.SH SYNOPSIS +.PP +bwa index -a bwtsw database.fasta +.PP +bwa aln database.fasta short_read.fastq > aln_sa.sai +.PP +bwa samse database.fasta aln_sa.sai short_read.fastq > aln.sam +.PP +bwa sampe database.fasta aln_sa1.sai aln_sa2.sai read1.fq read2.fq > aln.sam +.PP +bwa bwasw database.fasta long_read.fastq > aln.sam + +.SH DESCRIPTION +.PP +BWA is a fast light-weighted tool that aligns relatively short sequences +(queries) to a sequence database (targe), such as the human reference +genome. It implements two different algorithms, both based on +Burrows-Wheeler Transform (BWT). The first algorithm is designed for +short queries up to ~150bp with low error rate (<3%). It does gapped +global alignment w.r.t. queries, supports paired-end reads, and is one +of the fastest short read alignment algorithms to date while also +visiting suboptimal hits. The second algorithm, BWA-SW, is designed for +reads longer than 100bp with more errors. It performs a heuristic Smith-Waterman-like +alignment to find high-scoring local hits and split hits. On +low-error short queries, BWA-SW is a little slower and less accurate than the +first algorithm, but on long queries, it is better. +.PP +For both algorithms, the database file in the FASTA format must be +first indexed with the +.B `index' +command, which typically takes a few hours for a 3GB genome. The first algorithm is +implemented via the +.B `aln' +command, which finds the suffix array (SA) coordinates of good hits of +each individual read, and the +.B `samse/sampe' +command, which converts SA coordinates to chromosomal coordinate and +pairs reads (for `sampe'). 
The second algorithm is invoked by the +.B `bwasw' +command. It works for single-end reads only. + +.SH COMMANDS AND OPTIONS +.TP +.B index +bwa index [-p prefix] [-a algoType] <in.db.fasta> + +Index database sequences in the FASTA format. + +.B OPTIONS: +.RS +.TP 10 +.B -c +Build color-space index. The input fasta should be in nucleotide space. (Disabled since 0.6.x) +.TP +.BI -p \ STR +Prefix of the output database [same as db filename] +.TP +.BI -a \ STR +Algorithm for constructing BWT index. Available options are: +.RS +.TP +.B is +IS linear-time algorithm for constructing suffix array. It requires +5.37N memory where N is the size of the database. IS is moderately fast, +but does not work with databases larger than 2GB. IS is the default +algorithm due to its simplicity. The current codes for IS algorithm are +reimplemented by Yuta Mori. +.TP +.B bwtsw +Algorithm implemented in BWT-SW. This method works with the whole human +genome. +.RE +.RE + +.TP +.B aln +bwa aln [-n maxDiff] [-o maxGapO] [-e maxGapE] [-d nDelTail] [-i +nIndelEnd] [-k maxSeedDiff] [-l seedLen] [-t nThrds] [-cRN] [-M misMsc] +[-O gapOsc] [-E gapEsc] [-q trimQual] <in.db.fasta> <in.query.fq> > +<out.sai> + +Find the SA coordinates of the input reads. Maximum +.I maxSeedDiff +differences are allowed in the first +.I seedLen +subsequence and maximum +.I maxDiff +differences are allowed in the whole sequence. + +.B OPTIONS: +.RS +.TP 10 +.BI -n \ NUM +Maximum edit distance if the value is INT, or the fraction of missing +alignments given 2% uniform base error rate if FLOAT. In the latter +case, the maximum edit distance is automatically chosen for different +read lengths.
[0.04] +.TP +.BI -o \ INT +Maximum number of gap opens [1] +.TP +.BI -e \ INT +Maximum number of gap extensions, -1 for k-difference mode (disallowing +long gaps) [-1] +.TP +.BI -d \ INT +Disallow a long deletion within INT bp towards the 3'-end [16] +.TP +.BI -i \ INT +Disallow an indel within INT bp towards the ends [5] +.TP +.BI -l \ INT +Take the first INT subsequence as seed. If INT is larger than the query +sequence, seeding will be disabled. For long reads, this option is +typically ranged from 25 to 35 for `-k 2'. [inf] +.TP +.BI -k \ INT +Maximum edit distance in the seed [2] +.TP +.BI -t \ INT +Number of threads (multi-threading mode) [1] +.TP +.BI -M \ INT +Mismatch penalty. BWA will not search for suboptimal hits with a score +lower than (bestScore-misMsc). [3] +.TP +.BI -O \ INT +Gap open penalty [11] +.TP +.BI -E \ INT +Gap extension penalty [4] +.TP +.BI -R \ INT +Proceed with suboptimal alignments if there are no more than INT equally +best hits. This option only affects paired-end mapping. Increasing this +threshold helps to improve the pairing accuracy at the cost of speed, +especially for short reads (~32bp). +.TP +.B -c +Reverse query but not complement it, which is required for alignment in +the color space. (Disabled since 0.6.x) +.TP +.B -N +Disable iterative search. All hits with no more than +.I maxDiff +differences will be found. This mode is much slower than the default. +.TP +.BI -q \ INT +Parameter for read trimming. BWA trims a read down to +argmax_x{\\sum_{i=x+1}^l(INT-q_i)} if q_l 1.sai + bwa aln ref.fa -b2 reads.bam > 2.sai + bwa sampe ref.fa 1.sai 2.sai reads.bam reads.bam > aln.sam +.TP +.B -0 +When +.B -b +is specified, only use single-end reads in mapping. +.TP +.B -1 +When +.B -b +is specified, only use the first read in a read pair in mapping (skip +single-end reads and the second reads). +.TP +.B -2 +When +.B -b +is specified, only use the second read in a read pair in mapping. 
+.B +.RE + +.TP +.B samse +bwa samse [-n maxOcc] <in.db.fasta> <in.sai> <in.fq> > <out.sam> + +Generate alignments in the SAM format given single-end reads. Repetitive +hits will be randomly chosen. + +.B OPTIONS: +.RS +.TP 10 +.BI -n \ INT +Maximum number of alignments to output in the XA tag for reads paired +properly. If a read has more than INT hits, the XA tag will not be +written. [3] +.TP +.BI -r \ STR +Specify the read group in a format like `@RG\\tID:foo\\tSM:bar'. [null] +.RE + +.TP +.B sampe +bwa sampe [-a maxInsSize] [-o maxOcc] [-n maxHitPaired] [-N maxHitDis] +[-P] <in.db.fasta> <in1.sai> <in2.sai> <in1.fq> <in2.fq> > <out.sam> + +Generate alignments in the SAM format given paired-end reads. Repetitive +read pairs will be placed randomly. + +.B OPTIONS: +.RS +.TP 8 +.BI -a \ INT +Maximum insert size for a read pair to be considered being mapped +properly. Since 0.4.5, this option is only used when there are not +enough good alignments to infer the distribution of insert sizes. [500] +.TP +.BI -o \ INT +Maximum occurrences of a read for pairing. A read with more occurrences +will be treated as a single-end read. Reducing this parameter helps +faster pairing. [100000] +.TP +.B -P +Load the entire FM-index into memory to reduce disk operations +(base-space reads only). With this option, at least 1.25N bytes of +memory are required, where N is the length of the genome. +.TP +.BI -n \ INT +Maximum number of alignments to output in the XA tag for reads paired +properly. If a read has more than INT hits, the XA tag will not be +written. [3] +.TP +.BI -N \ INT +Maximum number of alignments to output in the XA tag for discordant +read pairs (excluding singletons). If a read has more than INT hits, the +XA tag will not be written. [10] +.TP +.BI -r \ STR +Specify the read group in a format like `@RG\\tID:foo\\tSM:bar'.
[null] +.RE + +.TP +.B bwasw +bwa bwasw [-a matchScore] [-b mmPen] [-q gapOpenPen] [-r gapExtPen] [-t +nThreads] [-w bandWidth] [-T thres] [-s hspIntv] [-z zBest] [-N +nHspRev] [-c thresCoef] <in.db.fasta> <in.fq> [mate.fq] + +Align query sequences in the +.I in.fq +file. When +.I mate.fq +is present, perform paired-end alignment. The paired-end mode only works +for reads from Illumina short-insert libraries. In the paired-end mode, BWA-SW +may still output split alignments but they are all marked as not properly +paired; the mate positions will not be written if the mate has multiple +local hits. + +.B OPTIONS: +.RS +.TP 10 +.BI -a \ INT +Score of a match [1] +.TP +.BI -b \ INT +Mismatch penalty [3] +.TP +.BI -q \ INT +Gap open penalty [5] +.TP +.BI -r \ INT +Gap extension penalty. The penalty for a contiguous gap of size k is +q+k*r. [2] +.TP +.BI -t \ INT +Number of threads in the multi-threading mode [1] +.TP +.BI -w \ INT +Band width in the banded alignment [33] +.TP +.BI -T \ INT +Minimum score threshold divided by a [37] +.TP +.BI -c \ FLOAT +Coefficient for threshold adjustment according to query length. Given an +l-long query, the threshold for a hit to be retained is +a*max{T,c*log(l)}. [5.5] +.TP +.BI -z \ INT +Z-best heuristics. Higher -z increases accuracy at the cost of speed. [1] +.TP +.BI -s \ INT +Maximum SA interval size for initiating a seed. Higher -s increases +accuracy at the cost of speed. [3] +.TP +.BI -N \ INT +Minimum number of seeds supporting the resultant alignment to skip +reverse alignment. [5] +.RE + +.SH SAM ALIGNMENT FORMAT +.PP +The output of the +.B `aln' +command is binary and designed for BWA use only. BWA outputs the final +alignment in the SAM (Sequence Alignment/Map) format. Each line consists +of: + +.TS +center box; +cb | cb | cb +n | l | l .
+Col Field Description +_ +1 QNAME Query (pair) NAME +2 FLAG bitwise FLAG +3 RNAME Reference sequence NAME +4 POS 1-based leftmost POSition/coordinate of clipped sequence +5 MAPQ MAPping Quality (Phred-scaled) +6 CIGAR extended CIGAR string +7 MRNM Mate Reference sequence NaMe (`=' if same as RNAME) +8 MPOS 1-based Mate POSition +9 ISIZE Inferred insert SIZE +10 SEQ query SEQuence on the same strand as the reference +11 QUAL query QUALity (ASCII-33 gives the Phred base quality) +12 OPT variable OPTional fields in the format TAG:VTYPE:VALUE +.TE + +.PP +Each bit in the FLAG field is defined as: + +.TS +center box; +cb | cb | cb +c | l | l . +Chr Flag Description +_ +p 0x0001 the read is paired in sequencing +P 0x0002 the read is mapped in a proper pair +u 0x0004 the query sequence itself is unmapped +U 0x0008 the mate is unmapped +r 0x0010 strand of the query (1 for reverse) +R 0x0020 strand of the mate +1 0x0040 the read is the first read in a pair +2 0x0080 the read is the second read in a pair +s 0x0100 the alignment is not primary +f 0x0200 QC failure +d 0x0400 optical or PCR duplicate +.TE + +.PP +Please check <http://samtools.sourceforge.net> for the format +specification and the tools for post-processing the alignment. + +BWA generates the following optional fields. Tags starting with `X' are +specific to BWA. + +.TS +center box; +cb | cb +cB | l . +Tag Meaning +_ +NM Edit distance +MD Mismatching positions/bases +AS Alignment score +BC Barcode sequence +_ +X0 Number of best hits +X1 Number of suboptimal hits found by BWA +XN Number of ambiguous bases in the reference +XM Number of mismatches in the alignment +XO Number of gap opens +XG Number of gap extensions +XT Type: Unique/Repeat/N/Mate-sw +XA Alternative hits; format: (chr,pos,CIGAR,NM;)* +_ +XS Suboptimal alignment score +XF Support from forward/reverse alignment +XE Number of supporting seeds +.TE + +.PP +Note that XO and XG are generated by BWT search while the CIGAR string +by Smith-Waterman alignment.
These two tags may be inconsistent with the +CIGAR string. This is not a bug. + +.SH NOTES ON SHORT-READ ALIGNMENT +.SS Alignment Accuracy +.PP +When seeding is disabled, BWA guarantees to find an alignment +containing maximum +.I maxDiff +differences including +.I maxGapO +gap opens which do not occur within +.I nIndelEnd +bp towards either end of the query. Longer gaps may be found if +.I maxGapE +is positive, but it is not guaranteed to find all hits. When seeding is +enabled, BWA further requires that the first +.I seedLen +subsequence contains no more than +.I maxSeedDiff +differences. +.PP +When gapped alignment is disabled, BWA is expected to generate the same +alignment as Eland version 1, the Illumina alignment program. However, as BWA +changes `N' in the database sequence to random nucleotides, hits to these +random sequences will also be counted. As a consequence, BWA may mark a +unique hit as a repeat, if the random sequences happen to be identical +to the sequences which should be unique in the database. +.PP +By default, if the best hit is not highly repetitive (controlled by -R), BWA +also finds all hits containing one more mismatch; otherwise, BWA finds all +equally best hits only. Base quality is NOT considered in evaluating +hits. In the paired-end mode, BWA pairs all hits it found. It further +performs Smith-Waterman alignment for unmapped reads to rescue reads with a +high error rate, and for high-quality anomalous pairs to fix potential alignment +errors. + +.SS Estimating Insert Size Distribution +.PP +BWA estimates the insert size distribution per 256*1024 read pairs. It +first collects pairs of reads with both ends mapped with a single-end +quality 20 or higher and then calculates median (Q2), lower and higher +quartile (Q1 and Q3). It estimates the mean and the variance of the +insert size distribution from pairs whose insert sizes are within +interval [Q1-2(Q3-Q1), Q3+2(Q3-Q1)].
The maximum distance x for a pair +considered to be properly paired (SAM flag 0x2) is calculated by solving +equation Phi((x-mu)/sigma)=x/L*p0, where mu is the mean, sigma is the +standard deviation of the insert size distribution, L is the length of the +genome, p0 is the prior of an anomalous pair and Phi() is the standard +cumulative distribution function. For mapping Illumina short-insert +reads to the human genome, x is about 6-7 sigma away from the +mean. Quartiles, mean, variance and x will be printed to the standard +error output. + +.SS Memory Requirement +.PP +With the bwtsw algorithm, 5GB memory is required for indexing the complete +human genome sequences. For short reads, the +.B aln +command uses ~3.2GB memory and the +.B sampe +command uses ~5.4GB. + +.SS Speed +.PP +Indexing the human genome sequences takes 3 hours with the bwtsw +algorithm. Indexing smaller genomes with the IS algorithm is +faster, but requires more memory. +.PP +The speed of alignment is largely determined by the error rate of the query +sequences (r). Firstly, BWA runs much faster for near perfect hits than +for hits with many differences, and it stops searching for a hit with +l+2 differences if an l-difference hit is found. This means BWA will be +very slow if r is high because in this case BWA has to visit hits with +many differences and looking for these hits is expensive. Secondly, the +underlying alignment algorithm makes the speed sensitive to [k log(N)/m], +where k is the maximum allowed differences, N the size of database and m +the length of a query. In practice, we choose k w.r.t. r and therefore r +is the leading factor. I would not recommend using BWA on data with +r>0.02. +.PP +Pairing is slower for shorter reads. This is mainly because shorter +reads have more spurious hits and converting SA coordinates to +chromosomal coordinates is very costly. + +.SH NOTES ON LONG-READ ALIGNMENT +.PP +Command +.B bwasw +is designed for long-read alignment. 
BWA-SW essentially aligns the trie +of the reference genome against the directed acyclic word graph (DAWG) of a +read to find seeds not highly repetitive in the genome, and then performs a +standard Smith-Waterman algorithm to extend the seeds. A key heuristic, called +the Z-best heuristic, is that at each vertex in the DAWG, BWA-SW only keeps the +top Z reference suffix intervals that match the vertex. BWA-SW is more accurate +if the resultant alignment is supported by more seeds, and therefore BWA-SW +usually performs better on long queries or queries with low divergence to the +reference genome. + +BWA-SW is perhaps a better choice than BWA-short for 100bp single-end HiSeq reads +mainly because it gives better gapped alignment. For paired-end reads, it is not yet +known whether BWA-short or BWA-SW yields overall better results. + +.SH CHANGES IN BWA-0.6 +.PP +Since version 0.6, BWA has been able to work with a reference genome longer than 4GB. +This feature makes it possible to integrate the forward and reverse complemented +genome in one FM-index, which speeds up both BWA-short and BWA-SW. As a tradeoff, +BWA uses more memory because it has to keep all positions and ranks in 64-bit +integers, twice as large as the 32-bit integers used in the previous versions. + +The latest BWA-SW also works for paired-end reads longer than 100bp. In +comparison to BWA-short, BWA-SW tends to be more accurate for highly unique +reads and more robust to relatively long INDELs and structural variants. +Nonetheless, BWA-short usually has higher power to distinguish the optimal hit +from many suboptimal hits. The choice of the mapping algorithm may depend on +the application. 
+ +.SH SEE ALSO +BWA website , Samtools website + + +.SH AUTHOR +Heng Li at the Sanger Institute wrote the key source codes and +integrated the following codes for BWT construction: bwtsw +, implemented by Chi-Kwong Wong at +the University of Hong Kong and IS + originally proposed by Nong Ge + at the Sun Yat-Sen University and +implemented by Yuta Mori. + +.SH LICENSE AND CITATION +.PP +The full BWA package is distributed under GPLv3 as it uses source codes +from BWT-SW which is covered by GPL. Sorting, hash table, BWT and IS +libraries are distributed under the MIT license. +.PP +If you use the short-read alignment component, please cite the following +paper: +.PP +Li H. and Durbin R. (2009) Fast and accurate short read alignment with +Burrows-Wheeler transform. Bioinformatics, 25, 1754-1760. [PMID: 19451168] +.PP +If you use the long-read component (BWA-SW), please cite: +.PP +Li H. and Durbin R. (2010) Fast and accurate long-read alignment with +Burrows-Wheeler transform. Bioinformatics, 26, 589-595. [PMID: 20080505] + +.SH HISTORY +BWA is largely influenced by BWT-SW. It uses source codes from BWT-SW +and mimics its binary file formats; BWA-SW resembles BWT-SW in several +ways. The initial idea about BWT-based alignment also came from the +group who developed BWT-SW. At the same time, BWA is different enough +from BWT-SW. The short-read alignment algorithm bears no similarity to +Smith-Waterman algorithm any more. While BWA-SW learns from BWT-SW, it +introduces heuristics that can hardly be applied to the original +algorithm. In all, BWA does not guarantee to find all local hits as what +BWT-SW is designed to do, but it is much faster than BWT-SW on both +short and long query sequences. + +I started to write the first piece of codes on 24 May 2008 and got the +initial stable version on 02 June 2008. 
During this period, I was +acquainted that Professor Tak-Wah Lam, the first author of BWT-SW paper, +was collaborating with Beijing Genomics Institute on SOAP2, the successor +to SOAP (Short Oligonucleotide Analysis Package). SOAP2 has come out in +November 2008. According to the SourceForge download page, the third +BWT-based short read aligner, bowtie, was first released in August +2008. At the time of writing this manual, at least three more BWT-based +short-read aligners are being implemented. + +The BWA-SW algorithm is a new component of BWA. It was conceived in +November 2008 and implemented ten months later. diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/bwa.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/bwa.c Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,272 @@ +#include +#include +#include +#include +#include "bwa.h" +#include "bwt.h" +#include "bwtgap.h" +#include "bntseq.h" + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +extern unsigned char nst_nt4_table[256]; +extern void seq_reverse(int len, uint8_t *seq, int is_comp); + +bwa_opt_t bwa_def_opt = { 11, 4, -1, 1, 6, 32, 2, 0.04 }; + +struct bwa_idx_t { + bwt_t *bwt; + bntseq_t *bns; + uint8_t *pac; +}; + +struct bwa_buf_t { + int max_buf; + bwa_pestat_t pes; + gap_stack_t *stack; + gap_opt_t *opt; + int *diff_tab; + uint8_t *buf; + int *logn; +}; + +bwa_idx_t *bwa_idx_load(const char *prefix) +{ + bwa_idx_t *p; + int l; + char *str; + l = strlen(prefix); + p = calloc(1, sizeof(bwa_idx_t)); + str = malloc(l + 10); + strcpy(str, prefix); + p->bns = bns_restore(str); + strcpy(str + l, ".bwt"); + p->bwt = bwt_restore_bwt(str); + str[l] = 0; + strcpy(str + l, ".sa"); + bwt_restore_sa(str, p->bwt); + free(str); + p->pac = calloc(p->bns->l_pac/4+1, 1); + fread(p->pac, 1, p->bns->l_pac/4+1, p->bns->fp_pac); + fclose(p->bns->fp_pac); + p->bns->fp_pac = 0; + return p; +} + +void bwa_idx_destroy(bwa_idx_t *p) +{ + 
bns_destroy(p->bns); + bwt_destroy(p->bwt); + free(p->pac); + free(p); +} + +bwa_buf_t *bwa_buf_init(const bwa_opt_t *opt, int max_score) +{ + extern gap_opt_t *gap_init_opt(void); + extern int bwa_cal_maxdiff(int l, double err, double thres); + int i; + bwa_buf_t *p; + p = malloc(sizeof(bwa_buf_t)); + p->stack = gap_init_stack2(max_score); + p->opt = gap_init_opt(); + p->opt->s_gapo = opt->s_gapo; + p->opt->s_gape = opt->s_gape; + p->opt->max_diff = opt->max_diff; + p->opt->max_gapo = opt->max_gapo; + p->opt->max_gape = opt->max_gape; + p->opt->seed_len = opt->seed_len; + p->opt->max_seed_diff = opt->max_seed_diff; + p->opt->fnr = opt->fnr; + p->diff_tab = calloc(BWA_MAX_QUERY_LEN, sizeof(int)); + for (i = 1; i < BWA_MAX_QUERY_LEN; ++i) + p->diff_tab[i] = bwa_cal_maxdiff(i, BWA_AVG_ERR, opt->fnr); + p->logn = calloc(256, sizeof(int)); + for (i = 1; i != 256; ++i) + p->logn[i] = (int)(4.343 * log(i) + 0.499); + return p; +} + +void bwa_buf_destroy(bwa_buf_t *p) +{ + gap_destroy_stack(p->stack); + free(p->diff_tab); free(p->logn); free(p->opt); + free(p); +} + +bwa_sai_t bwa_sai(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq) +{ + extern int bwt_cal_width(const bwt_t *bwt, int len, const ubyte_t *str, bwt_width_t *width); + int i, seq_len, buf_len; + bwt_width_t *w, *seed_w; + uint8_t *s; + gap_opt_t opt2 = *buf->opt; + bwa_sai_t sai; + + seq_len = strlen(seq); + // estimate the buffer length + buf_len = (buf->opt->seed_len + seq_len + 1) * sizeof(bwt_width_t) + seq_len; + if (buf_len > buf->max_buf) { + buf->max_buf = buf_len; + kroundup32(buf->max_buf); + buf->buf = realloc(buf->buf, buf->max_buf); + } + memset(buf->buf, 0, buf_len); + seed_w = (bwt_width_t*)buf->buf; + w = seed_w + buf->opt->seed_len; + s = (uint8_t*)(w + seq_len + 1); + if (opt2.fnr > 0.) 
opt2.max_diff = buf->diff_tab[seq_len]; + // copy the sequence + for (i = 0; i < seq_len; ++i) + s[i] = nst_nt4_table[(int)seq[i]]; + seq_reverse(seq_len, s, 0); + // mapping + bwt_cal_width(idx->bwt, seq_len, s, w); + if (opt2.seed_len >= seq_len) opt2.seed_len = 0x7fffffff; + if (seq_len > buf->opt->seed_len) + bwt_cal_width(idx->bwt, buf->opt->seed_len, s + (seq_len - buf->opt->seed_len), seed_w); + for (i = 0; i < seq_len; ++i) // complement; I forgot why... + s[i] = s[i] > 3? 4 : 3 - s[i]; + sai.sai = (bwa_sai1_t*)bwt_match_gap(idx->bwt, seq_len, s, w, seq_len <= buf->opt->seed_len? 0 : seed_w, &opt2, &sai.n, buf->stack); + return sai; +} + +static void compute_NM(const uint8_t *pac, uint64_t l_pac, uint8_t *seq, int64_t pos, int n_cigar, uint32_t *cigar, int *n_mm, int *n_gaps) +{ + uint64_t x = pos, z; + int k, y = 0; + *n_mm = *n_gaps = 0; + for (k = 0; k < n_cigar; ++k) { + int l = cigar[k]>>4; + int op = cigar[k]&0xf; + if (op == 0) { // match/mismatch + for (z = 0; z < l && x + z < l_pac; ++z) { + int c = pac[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3; + if (c > 3 || seq[y+z] > 3 || c != seq[y+z]) ++(*n_mm); + } + } + if (op == 1 || op == 2) (*n_gaps) += l; + if (op == 0 || op == 2) x += l; + if (op == 0 || op == 1 || op == 4) y += l; + } +} + +void bwa_sa2aln(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, uint64_t sa, int n_gaps, bwa_aln_t *aln) +{ + extern bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int len, int *strand); + extern bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const uint8_t *seq, bwtint_t *_pos, int ext, int *n_cigar, int is_end_correct); + int strand, seq_len, i, n_gap, n_mm; + uint64_t pos3, pac_pos; + uint8_t *s[2]; + + memset(aln, 0, sizeof(bwa_aln_t)); + seq_len = strlen(seq); + if (seq_len<<1 > buf->max_buf) { + buf->max_buf = seq_len<<1; + kroundup32(buf->max_buf); + buf->buf = realloc(buf->buf, buf->max_buf); + } + s[0] = buf->buf; + s[1] = s[0] + seq_len; + for 
(i = 0; i < seq_len; ++i) + s[0][i] = s[1][i] = nst_nt4_table[(int)seq[i]]; + seq_reverse(seq_len, s[1], 1); + pac_pos = bwa_sa2pos(idx->bns, idx->bwt, sa, seq_len, &strand); + if (strand) aln->flag |= 16; + if (n_gaps) { // only for gapped alignment + int n_cigar; + bwa_cigar_t *cigar16; + cigar16 = bwa_refine_gapped_core(idx->bns->l_pac, idx->pac, seq_len, s[strand], &pac_pos, strand? n_gaps : -n_gaps, &n_cigar, 1); + aln->n_cigar = n_cigar; + aln->cigar = malloc(n_cigar * 4); + for (i = 0, pos3 = pac_pos; i < n_cigar; ++i) { + int op = cigar16[i]>>14; + int len = cigar16[i]&0x3fff; + if (op == 3) op = 4; // the 16-bit CIGAR is different from the 32-bit CIGAR + aln->cigar[i] = len<<4 | op; + if (op == 0 || op == 2) pos3 += len; + } + free(cigar16); + } else { // ungapped + aln->n_cigar = 1; + aln->cigar = malloc(4); + aln->cigar[0] = seq_len<<4 | 0; + pos3 = pac_pos + seq_len; + } + aln->n_n = bns_cnt_ambi(idx->bns, pac_pos, pos3 - pac_pos, &aln->ref_id); + aln->offset = pac_pos - idx->bns->anns[aln->ref_id].offset; + if (pos3 - idx->bns->anns[aln->ref_id].offset > idx->bns->anns[aln->ref_id].len) // read mapped beyond the end of a sequence + aln->flag |= 4; // read unmapped + compute_NM(idx->pac, idx->bns->l_pac, s[strand], pac_pos, aln->n_cigar, aln->cigar, &n_mm, &n_gap); + aln->n_mm = n_mm; + aln->n_gap = n_gap; +} + +/************************ + * Single-end alignment * + ************************/ + +bwa_one_t *bwa_se(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, int gen_cigar) +{ + bwa_one_t *one; + int best, cnt, i, seq_len; + + seq_len = strlen(seq); + one = calloc(1, sizeof(bwa_one_t)); + one->sai = bwa_sai(idx, buf, seq); + if (one->sai.n == 0) return one; + // count number of hits; randomly select one alignment + best = one->sai.sai[0].score; + for (i = cnt = 0; i < one->sai.n; ++i) { + bwa_sai1_t *p = &one->sai.sai[i]; + if (p->score > best) break; + if (drand48() * (p->l - p->k + 1 + cnt) > (double)cnt) { + one->which = p; + one->sa = p->k + 
(bwtint_t)((p->l - p->k + 1) * drand48()); + } + cnt += p->l - p->k + 1; + } + one->c1 = cnt; + for (; i < one->sai.n; ++i) + cnt += one->sai.sai[i].l - one->sai.sai[i].k + 1; + one->c2 = cnt - one->c1; + // estimate single-end mapping quality + one->mapQs = -1; + if (one->c1 == 0) one->mapQs = 23; // FIXME: is it possible? + else if (one->c1 > 1) one->mapQs = 0; + else { + int diff = one->which->n_mm + one->which->n_gapo + one->which->n_gape; + if (diff >= buf->diff_tab[seq_len]) one->mapQs = 25; + else if (one->c2 == 0) one->mapQs = 37; + } + if (one->mapQs < 0) { + cnt = (one->c2 >= 255)? 255 : one->c2; + one->mapQs = 23 < buf->logn[cnt]? 0 : 23 - buf->logn[cnt]; + } + one->mapQ = one->mapQs; + // compute CIGAR on request + one->one.ref_id = -1; + if (gen_cigar) bwa_sa2aln(idx, buf, seq, one->sa, one->which->n_gapo + one->which->n_gape, &one->one); + return one; +} + +void bwa_one_destroy(bwa_one_t *one) +{ + free(one->sai.sai); + free(one->one.cigar); + free(one); +} + +/************************ + * Paired-end alignment * + ************************/ + +void bwa_pestat(bwa_buf_t *buf, int n, bwa_one_t **o[2]) +{ +} + +void bwa_pe(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq[2], bwa_one_t *o[2]) +{ +} diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/bwa.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/bwa.h Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,107 @@ +#ifndef BWA_H_ +#define BWA_H_ + +#include + +#define BWA_DEF_MAX_SCORE 2048 +#define BWA_MAX_QUERY_LEN 1024 + +// BWA index +struct bwa_idx_t; +typedef struct bwa_idx_t bwa_idx_t; + +// Buffer for BWA alignment +struct bwa_buf_t; +typedef struct bwa_buf_t bwa_buf_t; + +// BWA alignment options +typedef struct { + int s_gapo, s_gape; // gap open and extension penalties; the mismatch penalty is fixed at 3 + int max_diff, max_gapo, max_gape; // max differences (-1 to use fnr for length-adjusted max diff), gap opens and gap extensions + int seed_len, max_seed_diff; // seed length and max 
differences allowed in the seed + float fnr; // parameter for automatic length-adjusted max differences +} bwa_opt_t; + +// default BWA alignment options +extern bwa_opt_t bwa_def_opt; // = { 11, 4, -1, 1, 6, 32, 2, 0.04 } + +// an interval hit in the SA coordinate; basic unit in .sai files +typedef struct { + uint32_t n_mm:16, n_gapo:8, n_gape:8; + int score; + uint64_t k, l; // [k,l] is the SA interval; each interval has l-k+1 hits +} bwa_sai1_t; + +// all interval hits in the SA coordinate +typedef struct { + int n; // number of interval hits + bwa_sai1_t *sai; +} bwa_sai_t; + +// an alignment +typedef struct { + uint32_t n_n:8, n_gap:12, n_mm:12; // number of ambiguous bases, gaps and mismatches in the alignment + int32_t ref_id; // referece sequence index (the first seq is indexed by 0) + uint32_t offset; // coordinate on the reference; zero-based + uint32_t n_cigar:16, flag:16; // number of CIGAR operations; SAM flag + uint32_t *cigar; // CIGAR in the BAM 28+4 encoding; having n_cigar operations +} bwa_aln_t; + +typedef struct { + int mapQs, mapQ, c1, c2; + uint64_t sa; + bwa_sai1_t *which; + bwa_sai_t sai; + bwa_aln_t one; +} bwa_one_t; + +typedef struct { + double avg, std, ap_prior; + uint64_t low, high, high_bayesian; +} bwa_pestat_t; + +#ifdef __cplusplus +extern "C" { +#endif + + // load a BWA index + bwa_idx_t *bwa_idx_load(const char *prefix); + void bwa_idx_destroy(bwa_idx_t *p); + + // allocate a BWA alignment buffer; if unsure, set opt to &bwa_def_opt and max_score to BWA_DEF_MAX_SCORE + bwa_buf_t *bwa_buf_init(const bwa_opt_t *opt, int max_score); + void bwa_buf_destroy(bwa_buf_t *p); + + /** + * Find all the SA intervals + * + * @param idx BWA index; multiple threads can share the same index + * @param buf BWA alignment buffer; each thread should have its own buffer + * @param seq NULL terminated C string, consisting of A/C/G/T/N only + * + * @return SA intervals seq is matched to + */ + bwa_sai_t bwa_sai(const bwa_idx_t *idx, bwa_buf_t *buf, 
const char *seq); + + /** + * Construct an alignment in the base-pair coordinate + * + * @param idx BWA index + * @param buf BWA alignment buffer + * @param seq NULL terinated C string + * @param sa Suffix array value + * @param n_gaps Number of gaps (typically equal to bwa_sai1_t::n_gapo + bwa_sai1_t::n_gape + * + * @return An alignment + */ + void bwa_sa2aln(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, uint64_t sa, int n_gaps, bwa_aln_t *aln); + + bwa_one_t *bwa_se(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, int gen_cigar); + + void bwa_one_destroy(bwa_one_t *one); + +#ifdef __cplusplus +} +#endif + +#endif diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/bwape.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/bwape.c Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,824 @@ +#include +#include +#include +#include +#include +#include +#include "bwtaln.h" +#include "kvec.h" +#include "bntseq.h" +#include "utils.h" +#include "stdaln.h" +#include "bwase.h" + +typedef struct { + int n; + bwtint_t *a; +} poslist_t; + +typedef struct { + double avg, std, ap_prior; + bwtint_t low, high, high_bayesian; +} isize_info_t; + +typedef struct { + uint64_t x, y; +} b128_t; + +#define b128_lt(a, b) ((a).x < (b).x) +#define b128_eq(a, b) ((a).x == (b).x && (a).y == (b).y) +#define b128_hash(a) ((uint32_t)(a).x) + +#include "khash.h" +KHASH_INIT(b128, b128_t, poslist_t, 1, b128_hash, b128_eq) + +#include "ksort.h" +KSORT_INIT(b128, b128_t, b128_lt) +KSORT_INIT_GENERIC(uint64_t) + +typedef struct { + kvec_t(b128_t) arr; + kvec_t(b128_t) pos[2]; + kvec_t(bwt_aln1_t) aln[2]; +} pe_data_t; + +#define MIN_HASH_WIDTH 1000 + +extern int g_log_n[256]; // in bwase.c +static kh_b128_t *g_hash; + +void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi); +void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s); +int bwa_approx_mapQ(const bwa_seq_t *p, int mm); +void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const 
bwa_seq_t *mate, int mode, int max_top2); +bntseq_t *bwa_open_nt(const char *prefix); +void bwa_print_sam_SQ(const bntseq_t *bns); +void bwa_print_sam_PG(); + +pe_opt_t *bwa_init_pe_opt() +{ + pe_opt_t *po; + po = (pe_opt_t*)calloc(1, sizeof(pe_opt_t)); + po->max_isize = 500; + po->force_isize = 0; + po->max_occ = 100000; + po->n_multi = 3; + po->N_multi = 10; + po->type = BWA_PET_STD; + po->is_sw = 1; + po->ap_prior = 1e-5; + return po; +} + +static inline uint64_t hash_64(uint64_t key) +{ + key += ~(key << 32); + key ^= (key >> 22); + key += ~(key << 13); + key ^= (key >> 8); + key += (key << 3); + key ^= (key >> 15); + key += ~(key << 27); + key ^= (key >> 31); + return key; +} +/* +static double ierfc(double x) // inverse erfc(); iphi(x) = M_SQRT2 *ierfc(2 * x); +{ + const double a = 0.140012; + double b, c; + b = log(x * (2 - x)); + c = 2./M_PI/a + b / 2.; + return sqrt(sqrt(c * c - b / a) - c); +} +*/ + +// for normal distribution, this is about 3std +#define OUTLIER_BOUND 2.0 + +static int infer_isize(int n_seqs, bwa_seq_t *seqs[2], isize_info_t *ii, double ap_prior, int64_t L) +{ + uint64_t x, *isizes, n_ap = 0; + int n, i, tot, p25, p75, p50, max_len = 1, tmp; + double skewness = 0.0, kurtosis = 0.0, y; + + ii->avg = ii->std = -1.0; + ii->low = ii->high = ii->high_bayesian = 0; + isizes = (uint64_t*)calloc(n_seqs, 8); + for (i = 0, tot = 0; i != n_seqs; ++i) { + bwa_seq_t *p[2]; + p[0] = seqs[0] + i; p[1] = seqs[1] + i; + if (p[0]->mapQ >= 20 && p[1]->mapQ >= 20) { + x = (p[0]->pos < p[1]->pos)? 
p[1]->pos + p[1]->len - p[0]->pos : p[0]->pos + p[0]->len - p[1]->pos; + if (x < 100000) isizes[tot++] = x; + } + if (p[0]->len > max_len) max_len = p[0]->len; + if (p[1]->len > max_len) max_len = p[1]->len; + } + if (tot < 20) { + fprintf(stderr, "[infer_isize] fail to infer insert size: too few good pairs\n"); + free(isizes); + return -1; + } + ks_introsort(uint64_t, tot, isizes); + p25 = isizes[(int)(tot*0.25 + 0.5)]; + p50 = isizes[(int)(tot*0.50 + 0.5)]; + p75 = isizes[(int)(tot*0.75 + 0.5)]; + tmp = (int)(p25 - OUTLIER_BOUND * (p75 - p25) + .499); + ii->low = tmp > max_len? tmp : max_len; // ii->low is unsigned + ii->high = (int)(p75 + OUTLIER_BOUND * (p75 - p25) + .499); + for (i = 0, x = n = 0; i < tot; ++i) + if (isizes[i] >= ii->low && isizes[i] <= ii->high) + ++n, x += isizes[i]; + ii->avg = (double)x / n; + for (i = 0; i < tot; ++i) { + if (isizes[i] >= ii->low && isizes[i] <= ii->high) { + double tmp = (isizes[i] - ii->avg) * (isizes[i] - ii->avg); + ii->std += tmp; + skewness += tmp * (isizes[i] - ii->avg); + kurtosis += tmp * tmp; + } + } + kurtosis = kurtosis/n / (ii->std / n * ii->std / n) - 3; + ii->std = sqrt(ii->std / n); // it would be better as n-1, but n is usually very large + skewness = skewness / n / (ii->std * ii->std * ii->std); + for (y = 1.0; y < 10.0; y += 0.01) + if (.5 * erfc(y / M_SQRT2) < ap_prior / L * (y * ii->std + ii->avg)) break; + ii->high_bayesian = (bwtint_t)(y * ii->std + ii->avg + .499); + for (i = 0; i < tot; ++i) + if (isizes[i] > ii->high_bayesian) ++n_ap; + ii->ap_prior = .01 * (n_ap + .01) / tot; + if (ii->ap_prior < ap_prior) ii->ap_prior = ap_prior; + free(isizes); + fprintf(stderr, "[infer_isize] (25, 50, 75) percentile: (%d, %d, %d)\n", p25, p50, p75); + if (isnan(ii->std) || p75 > 100000) { + ii->low = ii->high = ii->high_bayesian = 0; ii->avg = ii->std = -1.0; + fprintf(stderr, "[infer_isize] fail to infer insert size: weird pairing\n"); + return -1; + } + for (y = 1.0; y < 10.0; y += 0.01) + if (.5 * erfc(y / 
M_SQRT2) < ap_prior / L * (y * ii->std + ii->avg)) break; + ii->high_bayesian = (bwtint_t)(y * ii->std + ii->avg + .499); + fprintf(stderr, "[infer_isize] low and high boundaries: %ld and %ld for estimating avg and std\n", (long)ii->low, (long)ii->high); + fprintf(stderr, "[infer_isize] inferred external isize from %d pairs: %.3lf +/- %.3lf\n", n, ii->avg, ii->std); + fprintf(stderr, "[infer_isize] skewness: %.3lf; kurtosis: %.3lf; ap_prior: %.2e\n", skewness, kurtosis, ii->ap_prior); + fprintf(stderr, "[infer_isize] inferred maximum insert size: %ld (%.2lf sigma)\n", (long)ii->high_bayesian, y); + return 0; +} + +static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm, const isize_info_t *ii) +{ + int i, j, o_n, subo_n, cnt_chg = 0, low_bound = ii->low, max_len; + uint64_t o_score, subo_score; + b128_t last_pos[2][2], o_pos[2]; + max_len = p[0]->full_len; + if (max_len < p[1]->full_len) max_len = p[1]->full_len; + if (low_bound < max_len) low_bound = max_len; + + // here v>=u. 
When ii is set, we check insert size with ii; otherwise with opt->max_isize +#define __pairing_aux(u,v) do { \ + bwtint_t l = (v).x + p[(v).y&1]->len - ((u).x); \ + if ((u).x != (uint64_t)-1 && (v).x > (u).x && l >= max_len \ + && ((ii->high && l <= ii->high_bayesian) || (ii->high == 0 && l <= opt->max_isize))) \ + { \ + uint64_t s = d->aln[(v).y&1].a[(v).y>>2].score + d->aln[(u).y&1].a[(u).y>>2].score; \ + s *= 10; \ + if (ii->high) s += (int)(-4.343 * log(.5 * erfc(M_SQRT1_2 * fabs(l - ii->avg) / ii->std)) + .499); \ + s = s<<32 | (uint32_t)hash_64((u).x<<32 | (v).x); \ + if (s>>32 == o_score>>32) ++o_n; \ + else if (s>>32 < o_score>>32) { subo_n += o_n; o_n = 1; } \ + else ++subo_n; \ + if (s < o_score) subo_score = o_score, o_score = s, o_pos[(u).y&1] = (u), o_pos[(v).y&1] = (v); \ + else if (s < subo_score) subo_score = s; \ + } \ + } while (0) + +#define __pairing_aux2(q, w) do { \ + const bwt_aln1_t *r = d->aln[(w).y&1].a + ((w).y>>2); \ + (q)->extra_flag |= SAM_FPP; \ + if ((q)->pos != (w).x || (q)->strand != ((w).y>>1&1)) { \ + (q)->n_mm = r->n_mm; (q)->n_gapo = r->n_gapo; (q)->n_gape = r->n_gape; (q)->strand = (w).y>>1&1; \ + (q)->score = r->score; \ + (q)->pos = (w).x; \ + if ((q)->mapQ > 0) ++cnt_chg; \ + } \ + } while (0) + + o_score = subo_score = (uint64_t)-1; + o_n = subo_n = 0; + ks_introsort(b128, d->arr.n, d->arr.a); + for (j = 0; j < 2; ++j) last_pos[j][0].x = last_pos[j][0].y = last_pos[j][1].x = last_pos[j][1].y = (uint64_t)-1; + if (opt->type == BWA_PET_STD) { + for (i = 0; i < d->arr.n; ++i) { + b128_t x = d->arr.a[i]; + int strand = x.y>>1&1; + if (strand == 1) { // reverse strand, then check + int y = 1 - (x.y&1); + __pairing_aux(last_pos[y][1], x); + __pairing_aux(last_pos[y][0], x); + } else { // forward strand, then push + last_pos[x.y&1][0] = last_pos[x.y&1][1]; + last_pos[x.y&1][1] = x; + } + } + } else if (opt->type == BWA_PET_SOLID) { + for (i = 0; i < d->arr.n; ++i) { + b128_t x = d->arr.a[i]; + int strand = x.y>>1&1; + if 
((strand^x.y)&1) { // push + int y = 1 - (x.y&1); + __pairing_aux(last_pos[y][1], x); + __pairing_aux(last_pos[y][0], x); + } else { // check + last_pos[x.y&1][0] = last_pos[x.y&1][1]; + last_pos[x.y&1][1] = x; + } + } + } else { + fprintf(stderr, "[paring] not implemented yet!\n"); + exit(1); + } + // set pairing + //fprintf(stderr, "[%ld, %d, %d, %d]\n", d->arr.n, (int)(o_score>>32), (int)(subo_score>>32), o_n); + if (o_score != (uint64_t)-1) { + int mapQ_p = 0; // this is the maximum mapping quality when one end is moved + //fprintf(stderr, "%d, %d\n", o_n, subo_n); + if (o_n == 1) { + if (subo_score == (uint64_t)-1) mapQ_p = 29; // no sub-optimal pair + else if ((subo_score>>32) - (o_score>>32) > s_mm * 10) mapQ_p = 23; // poor sub-optimal pair + else { + int n = subo_n > 255? 255 : subo_n; + mapQ_p = ((subo_score>>32) - (o_score>>32)) / 2 - g_log_n[n]; + if (mapQ_p < 0) mapQ_p = 0; + } + } + if ((p[0]->pos == o_pos[0].x && p[0]->strand == (o_pos[0].y>>1&1)) && (p[1]->pos == o_pos[1].x && p[1]->strand == (o_pos[1].y>>1&1))) { // both ends not moved + if (p[0]->mapQ > 0 && p[1]->mapQ > 0) { + int mapQ = p[0]->mapQ + p[1]->mapQ; + if (mapQ > 60) mapQ = 60; + p[0]->mapQ = p[1]->mapQ = mapQ; + } else { + if (p[0]->mapQ == 0) p[0]->mapQ = (mapQ_p + 7 < p[1]->mapQ)? mapQ_p + 7 : p[1]->mapQ; + if (p[1]->mapQ == 0) p[1]->mapQ = (mapQ_p + 7 < p[0]->mapQ)? 
mapQ_p + 7 : p[0]->mapQ; + } + } else if (p[0]->pos == o_pos[0].x && p[0]->strand == (o_pos[0].y>>1&1)) { // [1] moved + p[1]->seQ = 0; p[1]->mapQ = p[0]->mapQ; + if (p[1]->mapQ > mapQ_p) p[1]->mapQ = mapQ_p; + } else if (p[1]->pos == o_pos[1].x && p[1]->strand == (o_pos[1].y>>1&1)) { // [0] moved + p[0]->seQ = 0; p[0]->mapQ = p[1]->mapQ; + if (p[0]->mapQ > mapQ_p) p[0]->mapQ = mapQ_p; + } else { // both ends moved + p[0]->seQ = p[1]->seQ = 0; + mapQ_p -= 20; + if (mapQ_p < 0) mapQ_p = 0; + p[0]->mapQ = p[1]->mapQ = mapQ_p; + } + __pairing_aux2(p[0], o_pos[0]); + __pairing_aux2(p[1], o_pos[1]); + } + return cnt_chg; +} + +typedef struct { + kvec_t(bwt_aln1_t) aln; +} aln_buf_t; + +int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bwt, int n_seqs, bwa_seq_t *seqs[2], FILE *fp_sa[2], isize_info_t *ii, + const pe_opt_t *opt, const gap_opt_t *gopt, const isize_info_t *last_ii) +{ + int i, j, cnt_chg = 0; + char str[1024]; + bwt_t *bwt; + pe_data_t *d; + aln_buf_t *buf[2]; + + d = (pe_data_t*)calloc(1, sizeof(pe_data_t)); + buf[0] = (aln_buf_t*)calloc(n_seqs, sizeof(aln_buf_t)); + buf[1] = (aln_buf_t*)calloc(n_seqs, sizeof(aln_buf_t)); + + if (_bwt == 0) { // load forward SA + strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str); + strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt); + } else bwt = _bwt; + + // SE + for (i = 0; i != n_seqs; ++i) { + bwa_seq_t *p[2]; + for (j = 0; j < 2; ++j) { + int n_aln; + p[j] = seqs[j] + i; + p[j]->n_multi = 0; + p[j]->extra_flag |= SAM_FPD | (j == 0? 
SAM_FR1 : SAM_FR2); + fread(&n_aln, 4, 1, fp_sa[j]); + if (n_aln > kv_max(d->aln[j])) + kv_resize(bwt_aln1_t, d->aln[j], n_aln); + d->aln[j].n = n_aln; + fread(d->aln[j].a, sizeof(bwt_aln1_t), n_aln, fp_sa[j]); + kv_copy(bwt_aln1_t, buf[j][i].aln, d->aln[j]); // backup d->aln[j] + // generate SE alignment and mapping quality + bwa_aln2seq(n_aln, d->aln[j].a, p[j]); + if (p[j]->type == BWA_TYPE_UNIQUE || p[j]->type == BWA_TYPE_REPEAT) { + int strand; + int max_diff = gopt->fnr > 0.0? bwa_cal_maxdiff(p[j]->len, BWA_AVG_ERR, gopt->fnr) : gopt->max_diff; + p[j]->seQ = p[j]->mapQ = bwa_approx_mapQ(p[j], max_diff); + p[j]->pos = bwa_sa2pos(bns, bwt, p[j]->sa, p[j]->len, &strand); + p[j]->strand = strand; + } + } + } + + // infer isize + infer_isize(n_seqs, seqs, ii, opt->ap_prior, bwt->seq_len/2); + if (ii->avg < 0.0 && last_ii->avg > 0.0) *ii = *last_ii; + if (opt->force_isize) { + fprintf(stderr, "[%s] discard insert size estimate as user's request.\n", __func__); + ii->low = ii->high = 0; ii->avg = ii->std = -1.0; + } + + // PE + for (i = 0; i != n_seqs; ++i) { + bwa_seq_t *p[2]; + for (j = 0; j < 2; ++j) { + p[j] = seqs[j] + i; + kv_copy(bwt_aln1_t, d->aln[j], buf[j][i].aln); + } + if ((p[0]->type == BWA_TYPE_UNIQUE || p[0]->type == BWA_TYPE_REPEAT) + && (p[1]->type == BWA_TYPE_UNIQUE || p[1]->type == BWA_TYPE_REPEAT)) + { // only when both ends mapped + b128_t x; + int j, k; + long long n_occ[2]; + for (j = 0; j < 2; ++j) { + n_occ[j] = 0; + for (k = 0; k < d->aln[j].n; ++k) + n_occ[j] += d->aln[j].a[k].l - d->aln[j].a[k].k + 1; + } + if (n_occ[0] > opt->max_occ || n_occ[1] > opt->max_occ) continue; + d->arr.n = 0; + for (j = 0; j < 2; ++j) { + for (k = 0; k < d->aln[j].n; ++k) { + bwt_aln1_t *r = d->aln[j].a + k; + bwtint_t l; + if (0 && r->l - r->k + 1 >= MIN_HASH_WIDTH) { // then check hash table + b128_t key; + int ret; + key.x = r->k; key.y = r->l; + khint_t iter = kh_put(b128, g_hash, key, &ret); + if (ret) { // not in the hash table; ret must equal 1 as we 
never remove elements + poslist_t *z = &kh_val(g_hash, iter); + z->n = r->l - r->k + 1; + z->a = (bwtint_t*)malloc(sizeof(bwtint_t) * z->n); + for (l = r->k; l <= r->l; ++l) { + int strand; + z->a[l - r->k] = bwa_sa2pos(bns, bwt, l, p[j]->len, &strand)<<1; + z->a[l - r->k] |= strand; + } + } + for (l = 0; l < kh_val(g_hash, iter).n; ++l) { + x.x = kh_val(g_hash, iter).a[l]>>1; + x.y = k<<2 | (kh_val(g_hash, iter).a[l]&1)<<1 | j; + kv_push(b128_t, d->arr, x); + } + } else { // then calculate on the fly + for (l = r->k; l <= r->l; ++l) { + int strand; + x.x = bwa_sa2pos(bns, bwt, l, p[j]->len, &strand); + x.y = k<<2 | strand<<1 | j; + kv_push(b128_t, d->arr, x); + } + } + } + } + cnt_chg += pairing(p, d, opt, gopt->s_mm, ii); + } + + if (opt->N_multi || opt->n_multi) { + for (j = 0; j < 2; ++j) { + if (p[j]->type != BWA_TYPE_NO_MATCH) { + int k, n_multi; + if (!(p[j]->extra_flag&SAM_FPP) && p[1-j]->type != BWA_TYPE_NO_MATCH) { + bwa_aln2seq_core(d->aln[j].n, d->aln[j].a, p[j], 0, p[j]->c1+p[j]->c2-1 > opt->N_multi? 
opt->n_multi : opt->N_multi); + } else bwa_aln2seq_core(d->aln[j].n, d->aln[j].a, p[j], 0, opt->n_multi); + for (k = 0, n_multi = 0; k < p[j]->n_multi; ++k) { + int strand; + bwt_multi1_t *q = p[j]->multi + k; + q->pos = bwa_sa2pos(bns, bwt, q->pos, p[j]->len, &strand); + q->strand = strand; + if (q->pos != p[j]->pos) + p[j]->multi[n_multi++] = *q; + } + p[j]->n_multi = n_multi; + } + } + } + } + + // free + for (i = 0; i < n_seqs; ++i) { + kv_destroy(buf[0][i].aln); + kv_destroy(buf[1][i].aln); + } + free(buf[0]); free(buf[1]); + if (_bwt == 0) bwt_destroy(bwt); + kv_destroy(d->arr); + kv_destroy(d->pos[0]); kv_destroy(d->pos[1]); + kv_destroy(d->aln[0]); kv_destroy(d->aln[1]); + free(d); + return cnt_chg; +} + +#define SW_MIN_MATCH_LEN 20 +#define SW_MIN_MAPQ 17 + +// cnt = n_mm<<16 | n_gapo<<8 | n_gape +bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyte_t *seq, int64_t *beg, int reglen, + int *n_cigar, uint32_t *_cnt) +{ + bwa_cigar_t *cigar = 0; + ubyte_t *ref_seq; + bwtint_t k, x, y, l; + int path_len, ret, subo; + AlnParam ap = aln_param_bwa; + path_t *path, *p; + + // check whether there are too many N's + if (reglen < SW_MIN_MATCH_LEN || (int64_t)l_pac - *beg < len) return 0; + for (k = 0, x = 0; k < len; ++k) + if (seq[k] >= 4) ++x; + if ((float)x/len >= 0.25 || len - x < SW_MIN_MATCH_LEN) return 0; + + // get reference subsequence + ref_seq = (ubyte_t*)calloc(reglen, 1); + for (k = *beg, l = 0; l < reglen && k < l_pac; ++k) + ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3; + path = (path_t*)calloc(l+len, sizeof(path_t)); + + // do alignment + ret = aln_local_core(ref_seq, l, (ubyte_t*)seq, len, &ap, path, &path_len, 1, &subo); + if (ret < 0 || subo == ret) { // no hit or tandem hits + free(path); free(cigar); free(ref_seq); *n_cigar = 0; + return 0; + } + cigar = bwa_aln_path2cigar(path, path_len, n_cigar); + + // check whether the alignment is good enough + for (k = 0, x = y = 0; k < *n_cigar; ++k) { + bwa_cigar_t c = 
cigar[k]; + if (__cigar_op(c) == FROM_M) x += __cigar_len(c), y += __cigar_len(c); + else if (__cigar_op(c) == FROM_D) x += __cigar_len(c); + else y += __cigar_len(c); + } + if (x < SW_MIN_MATCH_LEN || y < SW_MIN_MATCH_LEN) { // not good enough + free(path); free(cigar); free(ref_seq); + *n_cigar = 0; + return 0; + } + + { // update cigar and coordinate; + int start, end; + p = path + path_len - 1; + *beg += (p->i? p->i : 1) - 1; + start = (p->j? p->j : 1) - 1; + end = path->j; + cigar = (bwa_cigar_t*)realloc(cigar, sizeof(bwa_cigar_t) * (*n_cigar + 2)); + if (start) { + memmove(cigar + 1, cigar, sizeof(bwa_cigar_t) * (*n_cigar)); + cigar[0] = __cigar_create(3, start); + ++(*n_cigar); + } + if (end < len) { + /*cigar[*n_cigar] = 3<<14 | (len - end);*/ + cigar[*n_cigar] = __cigar_create(3, (len - end)); + ++(*n_cigar); + } + } + + { // set *cnt + int n_mm, n_gapo, n_gape; + n_mm = n_gapo = n_gape = 0; + p = path + path_len - 1; + x = p->i? p->i - 1 : 0; y = p->j? p->j - 1 : 0; + for (k = 0; k < *n_cigar; ++k) { + bwa_cigar_t c = cigar[k]; + if (__cigar_op(c) == FROM_M) { + for (l = 0; l < (__cigar_len(c)); ++l) + if (ref_seq[x+l] < 4 && seq[y+l] < 4 && ref_seq[x+l] != seq[y+l]) ++n_mm; + x += __cigar_len(c), y += __cigar_len(c); + } else if (__cigar_op(c) == FROM_D) { + x += __cigar_len(c), ++n_gapo, n_gape += (__cigar_len(c)) - 1; + } else if (__cigar_op(c) == FROM_I) { + y += __cigar_len(c), ++n_gapo, n_gape += (__cigar_len(c)) - 1; + } + } + *_cnt = (uint32_t)n_mm<<16 | n_gapo<<8 | n_gape; + } + + free(ref_seq); free(path); + return cigar; +} + +ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs, bwa_seq_t *seqs[2], const pe_opt_t *popt, const isize_info_t *ii) +{ + ubyte_t *pacseq; + int i; + uint64_t n_tot[2], n_mapped[2]; + + // load reference sequence + if (_pacseq == 0) { + pacseq = (ubyte_t*)calloc(bns->l_pac/4+1, 1); + rewind(bns->fp_pac); + fread(pacseq, 1, bns->l_pac/4+1, bns->fp_pac); + } else pacseq = (ubyte_t*)_pacseq; + if 
(!popt->is_sw || ii->avg < 0.0) return pacseq; + + // perform mate alignment + n_tot[0] = n_tot[1] = n_mapped[0] = n_mapped[1] = 0; + for (i = 0; i != n_seqs; ++i) { + bwa_seq_t *p[2]; + p[0] = seqs[0] + i; p[1] = seqs[1] + i; + if ((p[0]->mapQ >= SW_MIN_MAPQ || p[1]->mapQ >= SW_MIN_MAPQ) && (p[0]->extra_flag&SAM_FPP) == 0) { // unpaired and one read has high mapQ + int k, n_cigar[2], is_singleton, mapQ = 0, mq_adjust[2]; + int64_t beg[2], end[2]; + bwa_cigar_t *cigar[2]; + uint32_t cnt[2]; + + /* In the following, _pref points to the reference read + * which must be aligned; _pmate points to its mate which is + * considered to be modified. */ + +#define __set_rght_coor(_a, _b, _pref, _pmate) do { \ + (_a) = (int64_t)_pref->pos + ii->avg - 3 * ii->std - _pmate->len * 1.5; \ + (_b) = (_a) + 6 * ii->std + 2 * _pmate->len; \ + if ((_a) < (int64_t)_pref->pos + _pref->len) (_a) = _pref->pos + _pref->len; \ + if ((_b) > bns->l_pac) (_b) = bns->l_pac; \ + } while (0) + +#define __set_left_coor(_a, _b, _pref, _pmate) do { \ + (_a) = (int64_t)_pref->pos + _pref->len - ii->avg - 3 * ii->std - _pmate->len * 0.5; \ + (_b) = (_a) + 6 * ii->std + 2 * _pmate->len; \ + if ((_a) < 0) (_a) = 0; \ + if ((_b) > _pref->pos) (_b) = _pref->pos; \ + } while (0) + +#define __set_fixed(_pref, _pmate, _beg, _cnt) do { \ + _pmate->type = BWA_TYPE_MATESW; \ + _pmate->pos = _beg; \ + _pmate->seQ = _pref->seQ; \ + _pmate->strand = (popt->type == BWA_PET_STD)? 1 - _pref->strand : _pref->strand; \ + _pmate->n_mm = _cnt>>16; _pmate->n_gapo = _cnt>>8&0xff; _pmate->n_gape = _cnt&0xff; \ + _pmate->extra_flag |= SAM_FPP; \ + _pref->extra_flag |= SAM_FPP; \ + } while (0) + + mq_adjust[0] = mq_adjust[1] = 255; // not effective + is_singleton = (p[0]->type == BWA_TYPE_NO_MATCH || p[1]->type == BWA_TYPE_NO_MATCH)? 
1 : 0; + + ++n_tot[is_singleton]; + cigar[0] = cigar[1] = 0; + n_cigar[0] = n_cigar[1] = 0; + if (popt->type != BWA_PET_STD && popt->type != BWA_PET_SOLID) continue; // other types of pairing is not considered + for (k = 0; k < 2; ++k) { // p[1-k] is the reference read and p[k] is the read considered to be modified + ubyte_t *seq; + if (p[1-k]->type == BWA_TYPE_NO_MATCH) continue; // if p[1-k] is unmapped, skip + if (popt->type == BWA_PET_STD) { + if (p[1-k]->strand == 0) { // then the mate is on the reverse strand and has larger coordinate + __set_rght_coor(beg[k], end[k], p[1-k], p[k]); + seq = p[k]->rseq; + } else { // then the mate is on forward stand and has smaller coordinate + __set_left_coor(beg[k], end[k], p[1-k], p[k]); + seq = p[k]->seq; + seq_reverse(p[k]->len, seq, 0); // because ->seq is reversed; this will reversed back shortly + } + } else { // BWA_PET_SOLID + if (p[1-k]->strand == 0) { // R3-F3 pairing + if (k == 0) __set_left_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is R3 + else __set_rght_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is F3 + seq = p[k]->rseq; + seq_reverse(p[k]->len, seq, 0); // because ->seq is reversed + } else { // F3-R3 pairing + if (k == 0) __set_rght_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is R3 + else __set_left_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is F3 + seq = p[k]->seq; + } + } + // perform SW alignment + cigar[k] = bwa_sw_core(bns->l_pac, pacseq, p[k]->len, seq, &beg[k], end[k] - beg[k], &n_cigar[k], &cnt[k]); + if (cigar[k] && p[k]->type != BWA_TYPE_NO_MATCH) { // re-evaluate cigar[k] + int s_old, clip = 0, s_new; + if (__cigar_op(cigar[k][0]) == 3) clip += __cigar_len(cigar[k][0]); + if (__cigar_op(cigar[k][n_cigar[k]-1]) == 3) clip += __cigar_len(cigar[k][n_cigar[k]-1]); + s_old = (int)((p[k]->n_mm * 9 + p[k]->n_gapo * 13 + p[k]->n_gape * 2) / 3. * 8. + .499); + s_new = (int)(((cnt[k]>>16) * 9 + (cnt[k]>>8&0xff) * 13 + (cnt[k]&0xff) * 2 + clip * 3) / 3. * 8. 
+ .499); + s_old += -4.343 * log(ii->ap_prior / bns->l_pac); + s_new += (int)(-4.343 * log(.5 * erfc(M_SQRT1_2 * 1.5) + .499)); // assume the mapped isize is 1.5\sigma + if (s_old < s_new) { // reject SW alignment + mq_adjust[k] = s_new - s_old; + free(cigar[k]); cigar[k] = 0; n_cigar[k] = 0; + } else mq_adjust[k] = s_old - s_new; + } + // now revserse sequence back such that p[*]->seq looks untouched + if (popt->type == BWA_PET_STD) { + if (p[1-k]->strand == 1) seq_reverse(p[k]->len, seq, 0); + } else { + if (p[1-k]->strand == 0) seq_reverse(p[k]->len, seq, 0); + } + } + k = -1; // no read to be changed + if (cigar[0] && cigar[1]) { + k = p[0]->mapQ < p[1]->mapQ? 0 : 1; // p[k] to be fixed + mapQ = abs(p[1]->mapQ - p[0]->mapQ); + } else if (cigar[0]) k = 0, mapQ = p[1]->mapQ; + else if (cigar[1]) k = 1, mapQ = p[0]->mapQ; + if (k >= 0 && p[k]->pos != beg[k]) { + ++n_mapped[is_singleton]; + { // recalculate mapping quality + int tmp = (int)p[1-k]->mapQ - p[k]->mapQ/2 - 8; + if (tmp <= 0) tmp = 1; + if (mapQ > tmp) mapQ = tmp; + p[k]->mapQ = p[1-k]->mapQ = mapQ; + p[k]->seQ = p[1-k]->seQ = p[1-k]->seQ < mapQ? 
p[1-k]->seQ : mapQ; + if (p[k]->mapQ > mq_adjust[k]) p[k]->mapQ = mq_adjust[k]; + if (p[k]->seQ > mq_adjust[k]) p[k]->seQ = mq_adjust[k]; + } + // update CIGAR + free(p[k]->cigar); p[k]->cigar = cigar[k]; cigar[k] = 0; + p[k]->n_cigar = n_cigar[k]; + // update the rest of information + __set_fixed(p[1-k], p[k], beg[k], cnt[k]); + } + free(cigar[0]); free(cigar[1]); + } + } + fprintf(stderr, "[bwa_paired_sw] %lld out of %lld Q%d singletons are mated.\n", + (long long)n_mapped[1], (long long)n_tot[1], SW_MIN_MAPQ); + fprintf(stderr, "[bwa_paired_sw] %lld out of %lld Q%d discordant pairs are fixed.\n", + (long long)n_mapped[0], (long long)n_tot[0], SW_MIN_MAPQ); + return pacseq; +} + +void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const fn_fa[2], pe_opt_t *popt) +{ + extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa); + int i, j, n_seqs, tot_seqs = 0; + bwa_seq_t *seqs[2]; + bwa_seqio_t *ks[2]; + clock_t t; + bntseq_t *bns, *ntbns = 0; + FILE *fp_sa[2]; + gap_opt_t opt, opt0; + khint_t iter; + isize_info_t last_ii; // this is for the last batch of reads + char str[1024]; + bwt_t *bwt; + uint8_t *pac; + + // initialization + bwase_initialize(); // initialize g_log_n[] in bwase.c + pac = 0; bwt = 0; + for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5); + bns = bns_restore(prefix); + srand48(bns->seed); + fp_sa[0] = xopen(fn_sa[0], "r"); + fp_sa[1] = xopen(fn_sa[1], "r"); + g_hash = kh_init(b128); + last_ii.avg = -1.0; + + fread(&opt, sizeof(gap_opt_t), 1, fp_sa[0]); + ks[0] = bwa_open_reads(opt.mode, fn_fa[0]); + opt0 = opt; + fread(&opt, sizeof(gap_opt_t), 1, fp_sa[1]); // overwritten! 
+ ks[1] = bwa_open_reads(opt.mode, fn_fa[1]); + if (!(opt.mode & BWA_MODE_COMPREAD)) { + popt->type = BWA_PET_SOLID; + ntbns = bwa_open_nt(prefix); + } else { // for Illumina alignment only + if (popt->is_preload) { + snprintf(str, sizeof(str), "%s.bwt", prefix); bwt = bwt_restore_bwt(str); /* bounded write: a long prefix can no longer overrun str[] */ + snprintf(str, sizeof(str), "%s.sa", prefix); bwt_restore_sa(str, bwt); + pac = (ubyte_t*)calloc(bns->l_pac/4+1, 1); + rewind(bns->fp_pac); + fread(pac, 1, bns->l_pac/4+1, bns->fp_pac); + } + } + + // core loop + bwa_print_sam_SQ(bns); + bwa_print_sam_PG(); + while ((seqs[0] = bwa_read_seq(ks[0], 0x40000, &n_seqs, opt0.mode, opt0.trim_qual)) != 0) { + int cnt_chg; + isize_info_t ii; + ubyte_t *pacseq; + + seqs[1] = bwa_read_seq(ks[1], 0x40000, &n_seqs, opt.mode, opt.trim_qual); + tot_seqs += n_seqs; + t = clock(); + + fprintf(stderr, "[bwa_sai2sam_pe_core] convert to sequence coordinate... \n"); + cnt_chg = bwa_cal_pac_pos_pe(bns, prefix, bwt, n_seqs, seqs, fp_sa, &ii, popt, &opt, &last_ii); + fprintf(stderr, "[bwa_sai2sam_pe_core] time elapses: %.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); + fprintf(stderr, "[bwa_sai2sam_pe_core] changing coordinates of %d alignments.\n", cnt_chg); + + fprintf(stderr, "[bwa_sai2sam_pe_core] align unmapped mate...\n"); + pacseq = bwa_paired_sw(bns, pac, n_seqs, seqs, popt, &ii); + fprintf(stderr, "[bwa_sai2sam_pe_core] time elapses: %.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); + + fprintf(stderr, "[bwa_sai2sam_pe_core] refine gapped alignments... "); + for (j = 0; j < 2; ++j) + bwa_refine_gapped(bns, n_seqs, seqs[j], pacseq, ntbns); + fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); + if (pac == 0) free(pacseq); + + fprintf(stderr, "[bwa_sai2sam_pe_core] print alignments... 
"); + for (i = 0; i < n_seqs; ++i) { + bwa_seq_t *p[2]; + p[0] = seqs[0] + i; p[1] = seqs[1] + i; + if (p[0]->bc[0] || p[1]->bc[0]) { + strcat(p[0]->bc, p[1]->bc); + strcpy(p[1]->bc, p[0]->bc); + } + bwa_print_sam1(bns, p[0], p[1], opt.mode, opt.max_top2); + bwa_print_sam1(bns, p[1], p[0], opt.mode, opt.max_top2); + } + fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); + + for (j = 0; j < 2; ++j) + bwa_free_read_seq(n_seqs, seqs[j]); + fprintf(stderr, "[bwa_sai2sam_pe_core] %d sequences have been processed.\n", tot_seqs); + last_ii = ii; + } + + // destroy + bns_destroy(bns); + if (ntbns) bns_destroy(ntbns); + for (i = 0; i < 2; ++i) { + bwa_seq_close(ks[i]); + fclose(fp_sa[i]); + } + for (iter = kh_begin(g_hash); iter != kh_end(g_hash); ++iter) + if (kh_exist(g_hash, iter)) free(kh_val(g_hash, iter).a); + kh_destroy(b128, g_hash); + if (pac) { + free(pac); bwt_destroy(bwt); + } +} + /* Command-line entry point for `bwa sampe`: parses the options, resolves the index prefix from argv[optind], then hands the next two arguments (fn_sa) and the two after them (fn_fa) to bwa_sai2sam_pe_core(). Returns 0 on success and 1 on a bad option, missing arguments, or an index that cannot be located. */ +int bwa_sai2sam_pe(int argc, char *argv[]) +{ + extern char *bwa_rg_line, *bwa_rg_id; + extern int bwa_set_rg(const char *s); + extern char *bwa_infer_prefix(const char *hint); + int c; + pe_opt_t *popt; + char *prefix; + + popt = bwa_init_pe_opt(); + while ((c = getopt(argc, argv, "a:o:sPn:N:c:f:Ar:")) >= 0) { + switch (c) { + case 'r': + if (bwa_set_rg(optarg) < 0) { + fprintf(stderr, "[%s] malformated @RG line\n", __func__); + return 1; + } + break; + case 'a': popt->max_isize = atoi(optarg); break; + case 'o': popt->max_occ = atoi(optarg); break; + case 's': popt->is_sw = 0; break; + case 'P': popt->is_preload = 1; break; + case 'n': popt->n_multi = atoi(optarg); break; + case 'N': popt->N_multi = atoi(optarg); break; + case 'c': popt->ap_prior = atof(optarg); break; + case 'f': xreopen(optarg, "w", stdout); break; + case 'A': popt->force_isize = 1; break; + default: return 1; + } + } + + if (optind + 5 > argc) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: bwa sampe [options] <prefix> <in1.sai> <in2.sai> <in1.fq> <in2.fq>\n\n"); + fprintf(stderr, "Options: -a INT maximum insert size 
[%d]\n", popt->max_isize); + fprintf(stderr, " -o INT maximum occurrences for one end [%d]\n", popt->max_occ); + fprintf(stderr, " -n INT maximum hits to output for paired reads [%d]\n", popt->n_multi); + fprintf(stderr, " -N INT maximum hits to output for discordant pairs [%d]\n", popt->N_multi); + fprintf(stderr, " -c FLOAT prior of chimeric rate (lower bound) [%.1le]\n", popt->ap_prior); + fprintf(stderr, " -f FILE sam file to output results to [stdout]\n"); + fprintf(stderr, " -r STR read group header line such as `@RG\\tID:foo\\tSM:bar' [null]\n"); + fprintf(stderr, " -P preload index into memory (for base-space reads only)\n"); + fprintf(stderr, " -s disable Smith-Waterman for the unmapped mate\n"); + fprintf(stderr, " -A disable insert size estimate (force -s)\n\n"); + fprintf(stderr, "Notes: 1. For SOLiD reads, <in1.fq> corresponds R3 reads and <in2.fq> to F3.\n"); + fprintf(stderr, " 2. For reads shorter than 30bp, applying a smaller -o is recommended to\n"); + fprintf(stderr, " to get a sensible speed at the cost of pairing accuracy.\n"); + fprintf(stderr, "\n"); + return 1; + } + if ((prefix = bwa_infer_prefix(argv[optind])) == 0) { + fprintf(stderr, "[%s] fail to locate the index\n", __func__); + free(bwa_rg_line); free(bwa_rg_id); free(popt); + return 1; /* failure must not be reported as success */ + } + bwa_sai2sam_pe_core(prefix, argv + optind + 1, argv + optind+3, popt); + free(bwa_rg_line); free(bwa_rg_id); free(prefix); + free(popt); + return 0; +} diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/bwase.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/bwase.c Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,683 @@ +#include <unistd.h> +#include <string.h> +#include <stdio.h> +#include <stdlib.h> +#include <math.h> +#include <time.h> +#include "stdaln.h" +#include "bwase.h" +#include "bwtaln.h" +#include "bntseq.h" +#include "utils.h" +#include "kstring.h" + +int g_log_n[256]; +char *bwa_rg_line, *bwa_rg_id; + +void bwa_print_sam_PG(); + +void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi) +{ + int i, cnt, best; + if (n_aln == 0) { + 
s->type = BWA_TYPE_NO_MATCH; + s->c1 = s->c2 = 0; + return; + } + + if (set_main) { + best = aln[0].score; + for (i = cnt = 0; i < n_aln; ++i) { + const bwt_aln1_t *p = aln + i; + if (p->score > best) break; + if (drand48() * (p->l - p->k + 1 + cnt) > (double)cnt) { + s->n_mm = p->n_mm; s->n_gapo = p->n_gapo; s->n_gape = p->n_gape; + s->score = p->score; + s->sa = p->k + (bwtint_t)((p->l - p->k + 1) * drand48()); + } + cnt += p->l - p->k + 1; + } + s->c1 = cnt; + for (; i < n_aln; ++i) cnt += aln[i].l - aln[i].k + 1; + s->c2 = cnt - s->c1; + s->type = s->c1 > 1? BWA_TYPE_REPEAT : BWA_TYPE_UNIQUE; + } + + if (n_multi) { + int k, rest, n_occ, z = 0; + for (k = n_occ = 0; k < n_aln; ++k) { + const bwt_aln1_t *q = aln + k; + n_occ += q->l - q->k + 1; + } + if (s->multi) free(s->multi); + if (n_occ > n_multi + 1) { // if there are too many hits, generate none of them + s->multi = 0; s->n_multi = 0; + return; + } + /* The following code is more flexible than what is required + * here. In principle, due to the requirement above, we can + * simply output all hits, but the following samples "rest" + * number of random hits. */ + rest = n_occ > n_multi + 1? n_multi + 1 : n_occ; // find one additional for ->sa + s->multi = calloc(rest, sizeof(bwt_multi1_t)); + for (k = 0; k < n_aln; ++k) { + const bwt_aln1_t *q = aln + k; + if (q->l - q->k + 1 <= rest) { + bwtint_t l; + for (l = q->k; l <= q->l; ++l) { + s->multi[z].pos = l; + s->multi[z].gap = q->n_gapo + q->n_gape; + s->multi[z++].mm = q->n_mm; + } + rest -= q->l - q->k + 1; + } else { // Random sampling (http://code.activestate.com/recipes/272884/). In fact, we never come here. 
int j, i, k; + for (j = rest, i = q->l - q->k + 1, k = 0; j > 0; --j) { + double p = 1.0, x = drand48(); + while (x < p) p -= p * j / (i--); + s->multi[z].pos = q->l - i; + s->multi[z].gap = q->n_gapo + q->n_gape; + s->multi[z++].mm = q->n_mm; + } + rest = 0; + break; + } + } + s->n_multi = z; + } +} + /* Convenience wrapper: runs bwa_aln2seq_core() with set_main = 1 and n_multi = 0, i.e. only the main hit of s is filled in. */ +void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s) +{ + bwa_aln2seq_core(n_aln, aln, s, 1, 0); +} + /* Approximate mapping quality from the hit counts stored in p: 23 when c1 == 0, 0 when c1 > 1, 25 when n_mm equals mm, 37 when c2 == 0; otherwise 23 - g_log_n[min(c2, 255)], floored at 0. */ +int bwa_approx_mapQ(const bwa_seq_t *p, int mm) +{ + int n; + if (p->c1 == 0) return 23; + if (p->c1 > 1) return 0; + if (p->n_mm == mm) return 25; + if (p->c2 == 0) return 37; + n = (p->c2 >= 255)? 255 : p->c2; + return (23 < g_log_n[n])? 0 : 23 - g_log_n[n]; +} + /* Convert suffix-array index sapos into a position: bwt_sa() is passed through bns_depos(), *strand receives !is_rev, and when is_rev is set the result is moved back by len - 1 (clamped at 0). */ +bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int len, int *strand) +{ + bwtint_t pos_f; + int is_rev; + pos_f = bns_depos(bns, bwt_sa(bwt, sapos), &is_rev); // pos_f + *strand = !is_rev; + /* NB: For gapped alignment, pacpos may not be correct, which will be fixed + * in bwa_refine_gapped_core(). This line also determines the way "x" is + * calculated in bwa_refine_gapped_core() when (ext < 0 && is_end == 0). */ + if (is_rev) pos_f = pos_f + 1 < len? 0 : pos_f - len + 1; // mapped to the forward strand + return pos_f; // FIXME: it is possible that pos_f < bns->anns[ref_id].offset +} + +/** + * Derive the actual position in the read from the given suffix array + * coordinates. Note that the position will be approximate based on + * whether indels appear in the read and whether calculations are + * performed from the start or end of the read. + */ +void bwa_cal_pac_pos_core(const bntseq_t *bns, const bwt_t *bwt, bwa_seq_t *seq, const int max_mm, const float fnr) +{ + int max_diff, strand; + if (seq->type != BWA_TYPE_UNIQUE && seq->type != BWA_TYPE_REPEAT) return; + max_diff = fnr > 0.0? 
bwa_cal_maxdiff(seq->len, BWA_AVG_ERR, fnr) : max_mm; + seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff); + seq->pos = bwa_sa2pos(bns, bwt, seq->sa, seq->len, &strand); + seq->strand = strand; + seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff); +} + /* Load <prefix>.bwt and <prefix>.sa, convert the suffix-array coordinate of every read in seqs (main hit and each multi hit) to a position, drop multi hits whose position equals the main hit's, then destroy the loaded BWT. */ +void bwa_cal_pac_pos(const bntseq_t *bns, const char *prefix, int n_seqs, bwa_seq_t *seqs, int max_mm, float fnr) +{ + int i, j, strand, n_multi; + char str[1024]; + bwt_t *bwt; + // load forward SA + snprintf(str, sizeof(str), "%s.bwt", prefix); bwt = bwt_restore_bwt(str); /* bounded write: a long prefix can no longer overrun str[] */ + snprintf(str, sizeof(str), "%s.sa", prefix); bwt_restore_sa(str, bwt); + for (i = 0; i != n_seqs; ++i) { + bwa_seq_t *p = &seqs[i]; + bwa_cal_pac_pos_core(bns, bwt, p, max_mm, fnr); + for (j = n_multi = 0; j < p->n_multi; ++j) { + bwt_multi1_t *q = p->multi + j; + q->pos = bwa_sa2pos(bns, bwt, q->pos, p->len, &strand); + q->strand = strand; + if (q->pos != p->pos) + p->multi[n_multi++] = *q; + } + p->n_multi = n_multi; + } + bwt_destroy(bwt); +} + +/* is_end_correct == 1 if (*pos+len) gives the correct coordinate on + * forward strand. This happens when p->pos is calculated by + * bwa_cal_pac_pos(). is_end_correct==0 if (*pos) gives the correct + * coordinate. This happens only for color-converted alignment. */ +bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyte_t *seq, bwtint_t *_pos, + int ext, int *n_cigar, int is_end_correct) +{ + bwa_cigar_t *cigar = 0; + ubyte_t *ref_seq; + int l = 0, path_len, ref_len; + AlnParam ap = aln_param_bwa; + path_t *path; + int64_t k, __pos = *_pos; + + ref_len = len + abs(ext); + if (ext > 0) { + ref_seq = (ubyte_t*)calloc(ref_len, 1); + for (k = __pos; k < __pos + ref_len && k < l_pac; ++k) + ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3; + } else { + int64_t x = __pos + (is_end_correct? len : ref_len); + ref_seq = (ubyte_t*)calloc(ref_len, 1); + for (l = 0, k = x - ref_len > 0? 
x - ref_len : 0; k < x && k < l_pac; ++k) + ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3; + } + path = (path_t*)calloc(l+len, sizeof(path_t)); + + aln_global_core(ref_seq, l, (ubyte_t*)seq, len, &ap, path, &path_len); + cigar = bwa_aln_path2cigar(path, path_len, n_cigar); + + if (ext < 0 && is_end_correct) { // fix coordinate for reads mapped to the forward strand + for (l = k = 0; k < *n_cigar; ++k) { + if (__cigar_op(cigar[k]) == FROM_D) l -= __cigar_len(cigar[k]); + else if (__cigar_op(cigar[k]) == FROM_I) l += __cigar_len(cigar[k]); + } + __pos += l; + } + + if (__cigar_op(cigar[0]) == FROM_D) { // deletion at the 5'-end + __pos += __cigar_len(cigar[0]); + for (k = 0; k < *n_cigar - 1; ++k) cigar[k] = cigar[k+1]; + --(*n_cigar); + } + if (__cigar_op(cigar[*n_cigar-1]) == FROM_D) --(*n_cigar); // deletion at the 3'-end + + // change "I" at either end of the read to S. just in case. This should rarely happen... + if (__cigar_op(cigar[*n_cigar-1]) == FROM_I) cigar[*n_cigar-1] = __cigar_create(3, (__cigar_len(cigar[*n_cigar-1]))); + if (__cigar_op(cigar[0]) == FROM_I) cigar[0] = __cigar_create(3, (__cigar_len(cigar[0]))); + + *_pos = (bwtint_t)__pos; + free(ref_seq); free(path); + return cigar; +} + +char *bwa_cal_md1(int n_cigar, bwa_cigar_t *cigar, int len, bwtint_t pos, ubyte_t *seq, + bwtint_t l_pac, ubyte_t *pacseq, kstring_t *str, int *_nm) +{ + bwtint_t x, y; + int z, u, c, nm = 0; + str->l = 0; // reset + x = pos; y = 0; + if (cigar) { + int k, l; + for (k = u = 0; k < n_cigar; ++k) { + l = __cigar_len(cigar[k]); + if (__cigar_op(cigar[k]) == FROM_M) { + for (z = 0; z < l && x+z < l_pac; ++z) { + c = pacseq[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3; + if (c > 3 || seq[y+z] > 3 || c != seq[y+z]) { + ksprintf(str, "%d", u); + kputc("ACGTN"[c], str); + ++nm; + u = 0; + } else ++u; + } + x += l; y += l; + } else if (__cigar_op(cigar[k]) == FROM_I || __cigar_op(cigar[k]) == FROM_S) { + y += l; + if (__cigar_op(cigar[k]) == FROM_I) nm += l; + } else if 
(__cigar_op(cigar[k]) == FROM_D) { + ksprintf(str, "%d", u); + kputc('^', str); + for (z = 0; z < l && x+z < l_pac; ++z) + kputc("ACGT"[pacseq[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3], str); + u = 0; + x += l; nm += l; + } + } + } else { // no gaps + for (z = u = 0; z < (bwtint_t)len && x+z < l_pac; ++z) { + c = pacseq[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3; + if (c > 3 || seq[y+z] > 3 || c != seq[y+z]) { + ksprintf(str, "%d", u); + kputc("ACGTN"[c], str); + ++nm; + u = 0; + } else ++u; + } + } + ksprintf(str, "%d", u); + *_nm = nm; + return strdup(str->s); +} + /* Bring a trimmed read back to full_len: no-op when len == full_len; otherwise a clip entry (CIGAR op 3) of full_len - len is appended when strand == 0 or prepended when strand != 0, extending an existing FROM_S entry at that end if there is one and creating an op-0 entry of length len first when the read has no CIGAR; finally len is set to full_len. */ +void bwa_correct_trimmed(bwa_seq_t *s) +{ + if (s->len == s->full_len) return; + if (s->strand == 0) { // forward + if (s->cigar && __cigar_op(s->cigar[s->n_cigar-1]) == FROM_S) { // the last is S + s->cigar[s->n_cigar-1] += s->full_len - s->len; + } else { + if (s->cigar == 0) { + s->n_cigar = 2; + s->cigar = calloc(s->n_cigar, sizeof(bwa_cigar_t)); + s->cigar[0] = __cigar_create(0, s->len); + } else { + ++s->n_cigar; + s->cigar = realloc(s->cigar, s->n_cigar * sizeof(bwa_cigar_t)); + } + s->cigar[s->n_cigar-1] = __cigar_create(3, (s->full_len - s->len)); + } + } else { // reverse + if (s->cigar && __cigar_op(s->cigar[0]) == FROM_S) { // the first is S + s->cigar[0] += s->full_len - s->len; + } else { + if (s->cigar == 0) { + s->n_cigar = 2; + s->cigar = calloc(s->n_cigar, sizeof(bwa_cigar_t)); + s->cigar[1] = __cigar_create(0, s->len); + } else { + ++s->n_cigar; + s->cigar = realloc(s->cigar, s->n_cigar * sizeof(bwa_cigar_t)); + memmove(s->cigar + 1, s->cigar, (s->n_cigar-1) * sizeof(bwa_cigar_t)); + } + s->cigar[0] = __cigar_create(3, (s->full_len - s->len)); + } + } + s->len = s->full_len; +} + +void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns) +{ + ubyte_t *pacseq, *ntpac = 0; + int i, j; + kstring_t *str; + + if (ntbns) { // in color space + ntpac = (ubyte_t*)calloc(ntbns->l_pac/4+1, 1); + rewind(ntbns->fp_pac); + fread(ntpac, 1, ntbns->l_pac/4 + 1, 
ntbns->fp_pac); + } + + if (!_pacseq) { + pacseq = (ubyte_t*)calloc(bns->l_pac/4+1, 1); + rewind(bns->fp_pac); + fread(pacseq, 1, bns->l_pac/4+1, bns->fp_pac); + } else pacseq = _pacseq; + for (i = 0; i != n_seqs; ++i) { + bwa_seq_t *s = seqs + i; + seq_reverse(s->len, s->seq, 0); // IMPORTANT: s->seq is reversed here!!! + for (j = 0; j < s->n_multi; ++j) { + bwt_multi1_t *q = s->multi + j; + int n_cigar; + if (q->gap == 0) continue; + q->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, q->strand? s->rseq : s->seq, &q->pos, + (q->strand? 1 : -1) * q->gap, &n_cigar, 1); + q->n_cigar = n_cigar; + } + if (s->type == BWA_TYPE_NO_MATCH || s->type == BWA_TYPE_MATESW || s->n_gapo == 0) continue; + s->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, &s->pos, + (s->strand? 1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 1); + } +#if 0 + if (ntbns) { // in color space + for (i = 0; i < n_seqs; ++i) { + bwa_seq_t *s = seqs + i; + bwa_cs2nt_core(s, bns->l_pac, ntpac); + for (j = 0; j < s->n_multi; ++j) { + bwt_multi1_t *q = s->multi + j; + int n_cigar; + if (q->gap == 0) continue; + free(q->cigar); + q->cigar = bwa_refine_gapped_core(bns->l_pac, ntpac, s->len, q->strand? s->rseq : s->seq, &q->pos, + (q->strand? 1 : -1) * q->gap, &n_cigar, 0); + q->n_cigar = n_cigar; + } + if (s->type != BWA_TYPE_NO_MATCH && s->cigar) { // update cigar again + free(s->cigar); + s->cigar = bwa_refine_gapped_core(bns->l_pac, ntpac, s->len, s->strand? s->rseq : s->seq, &s->pos, + (s->strand? 1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 0); + } + } + } +#endif + // generate MD tag + str = (kstring_t*)calloc(1, sizeof(kstring_t)); + for (i = 0; i != n_seqs; ++i) { + bwa_seq_t *s = seqs + i; + if (s->type != BWA_TYPE_NO_MATCH) { + int nm; + s->md = bwa_cal_md1(s->n_cigar, s->cigar, s->len, s->pos, s->strand? s->rseq : s->seq, + bns->l_pac, ntbns? 
ntpac : pacseq, str, &nm); + s->nm = nm; + } + } + free(str->s); free(str); + + // correct for trimmed reads + if (!ntbns) // trimming is only enabled for Illumina reads + for (i = 0; i < n_seqs; ++i) bwa_correct_trimmed(seqs + i); + + if (!_pacseq) free(pacseq); + free(ntpac); +} + /* End coordinate of the alignment of p: pos plus the lengths of all CIGAR entries with op 0 or 2; pos + len when p has no CIGAR. */ +int64_t pos_end(const bwa_seq_t *p) +{ + if (p->cigar) { + int j; + int64_t x = p->pos; + for (j = 0; j != p->n_cigar; ++j) { + int op = __cigar_op(p->cigar[j]); + if (op == 0 || op == 2) x += __cigar_len(p->cigar[j]); + } + return x; + } else return p->pos + p->len; +} + /* Same computation for a multi-hit record; the read length is passed in as len and is used when the hit has no CIGAR. */ +int64_t pos_end_multi(const bwt_multi1_t *p, int len) // analogy to pos_end() +{ + if (p->cigar) { + int j; + int64_t x = p->pos; + for (j = 0; j != p->n_cigar; ++j) { + int op = __cigar_op(p->cigar[j]); + if (op == 0 || op == 2) x += __cigar_len(p->cigar[j]); + } + return x; + } else return p->pos + len; +} + /* Returns pos_end(p) when strand is non-zero and pos otherwise; -1 when the read type is BWA_TYPE_NO_MATCH. */ +static int64_t pos_5(const bwa_seq_t *p) +{ + if (p->type != BWA_TYPE_NO_MATCH) + return p->strand? pos_end(p) : p->pos; + return -1; +} + +void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, int mode, int max_top2) +{ + int j; + if (p->type != BWA_TYPE_NO_MATCH || (mate && mate->type != BWA_TYPE_NO_MATCH)) { + int seqid, nn, am = 0, flag = p->extra_flag; + char XT; + + if (p->type == BWA_TYPE_NO_MATCH) { + p->pos = mate->pos; + p->strand = mate->strand; + flag |= SAM_FSU; + j = 1; + } else j = pos_end(p) - p->pos; // j is the length of the reference in the alignment + + // get seqid + nn = bns_cnt_ambi(bns, p->pos, j, &seqid); + if (p->type != BWA_TYPE_NO_MATCH && p->pos + j - bns->anns[seqid].offset > bns->anns[seqid].len) + flag |= SAM_FSU; // flag UNMAP as this alignment bridges two adjacent reference sequences + + // update flag and print it + if (p->strand) flag |= SAM_FSR; + if (mate) { + if (mate->type != BWA_TYPE_NO_MATCH) { + if (mate->strand) flag |= SAM_FMR; + } else flag |= SAM_FMU; + } + err_printf("%s\t%d\t%s\t", p->name, flag, bns->anns[seqid].name); + 
err_printf("%d\t%d\t", (int)(p->pos - bns->anns[seqid].offset + 1), p->mapQ); + + // print CIGAR + if (p->cigar) { + for (j = 0; j != p->n_cigar; ++j) + err_printf("%d%c", __cigar_len(p->cigar[j]), "MIDS"[__cigar_op(p->cigar[j])]); + } else if (p->type == BWA_TYPE_NO_MATCH) err_printf("*"); + else err_printf("%dM", p->len); + + // print mate coordinate + if (mate && mate->type != BWA_TYPE_NO_MATCH) { + int m_seqid, m_is_N; + long long isize; + am = mate->seQ < p->seQ? mate->seQ : p->seQ; // smaller single-end mapping quality + // redundant calculation here, but should not matter too much + m_is_N = bns_cnt_ambi(bns, mate->pos, mate->len, &m_seqid); + err_printf("\t%s\t", (seqid == m_seqid)? "=" : bns->anns[m_seqid].name); + isize = (seqid == m_seqid)? pos_5(mate) - pos_5(p) : 0; + if (p->type == BWA_TYPE_NO_MATCH) isize = 0; + err_printf("%d\t%lld\t", (int)(mate->pos - bns->anns[m_seqid].offset + 1), isize); + } else if (mate) err_printf("\t=\t%d\t0\t", (int)(p->pos - bns->anns[seqid].offset + 1)); + else err_printf("\t*\t0\t0\t"); + + // print sequence and quality + if (p->strand == 0) + for (j = 0; j != p->full_len; ++j) putchar("ACGTN"[(int)p->seq[j]]); + else for (j = 0; j != p->full_len; ++j) putchar("TGCAN"[p->seq[p->full_len - 1 - j]]); + putchar('\t'); + if (p->qual) { + if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality + err_printf("%s", p->qual); + } else err_printf("*"); + + if (bwa_rg_id) err_printf("\tRG:Z:%s", bwa_rg_id); + if (p->bc[0]) err_printf("\tBC:Z:%s", p->bc); + if (p->clip_len < p->full_len) err_printf("\tXC:i:%d", p->clip_len); + if (p->type != BWA_TYPE_NO_MATCH) { + int i; + // calculate XT tag + XT = "NURM"[p->type]; + if (nn > 10) XT = 'N'; + // print tags + err_printf("\tXT:A:%c\t%s:i:%d", XT, (mode & BWA_MODE_COMPREAD)? 
"NM" : "CM", p->nm); + if (nn) err_printf("\tXN:i:%d", nn); + if (mate) err_printf("\tSM:i:%d\tAM:i:%d", p->seQ, am); + if (p->type != BWA_TYPE_MATESW) { // X0 and X1 are not available for this type of alignment + err_printf("\tX0:i:%d", p->c1); + if (p->c1 <= max_top2) err_printf("\tX1:i:%d", p->c2); + } + err_printf("\tXM:i:%d\tXO:i:%d\tXG:i:%d", p->n_mm, p->n_gapo, p->n_gapo+p->n_gape); + if (p->md) err_printf("\tMD:Z:%s", p->md); + // print multiple hits + if (p->n_multi) { + err_printf("\tXA:Z:"); + for (i = 0; i < p->n_multi; ++i) { + bwt_multi1_t *q = p->multi + i; + int k; + j = pos_end_multi(q, p->len) - q->pos; + nn = bns_cnt_ambi(bns, q->pos, j, &seqid); + err_printf("%s,%c%d,", bns->anns[seqid].name, q->strand? '-' : '+', + (int)(q->pos - bns->anns[seqid].offset + 1)); + if (q->cigar) { + for (k = 0; k < q->n_cigar; ++k) + err_printf("%d%c", __cigar_len(q->cigar[k]), "MIDS"[__cigar_op(q->cigar[k])]); + } else err_printf("%dM", p->len); + err_printf(",%d;", q->gap + q->mm); + } + } + } + putchar('\n'); + } else { // this read has no match + ubyte_t *s = p->strand? 
p->rseq : p->seq; + int flag = p->extra_flag | SAM_FSU; + if (mate && mate->type == BWA_TYPE_NO_MATCH) flag |= SAM_FMU; + err_printf("%s\t%d\t*\t0\t0\t*\t*\t0\t0\t", p->name, flag); + for (j = 0; j != p->len; ++j) putchar("ACGTN"[(int)s[j]]); + putchar('\t'); + if (p->qual) { + if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality + err_printf("%s", p->qual); + } else err_printf("*"); + if (bwa_rg_id) err_printf("\tRG:Z:%s", bwa_rg_id); + if (p->bc[0]) err_printf("\tBC:Z:%s", p->bc); + if (p->clip_len < p->full_len) err_printf("\tXC:i:%d", p->clip_len); + putchar('\n'); + } +} + +bntseq_t *bwa_open_nt(const char *prefix) +{ + bntseq_t *ntbns; + char *str; + str = (char*)calloc(strlen(prefix) + 10, 1); + strcat(strcpy(str, prefix), ".nt"); + ntbns = bns_restore(str); + free(str); + return ntbns; +} + +void bwa_print_sam_SQ(const bntseq_t *bns) +{ + int i; + for (i = 0; i < bns->n_seqs; ++i) + err_printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len); + if (bwa_rg_line) err_printf("%s\n", bwa_rg_line); +} + +void bwase_initialize() +{ + int i; + for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5); +} + /* Decode backslash escapes in s in place and return s. Recognised pairs are backslash-t, backslash-n, backslash-r and a doubled backslash; any other escaped character is dropped together with its backslash, and a lone backslash at the very end of the string is dropped as well. */ +char *bwa_escape(char *s) +{ + char *p, *q; + for (p = q = s; *p; ++p) { + if (*p == '\\') { + if (*++p == '\0') break; /* trailing backslash: stop here, otherwise the loop's ++p steps past the terminator */ + if (*p == 't') *q++ = '\t'; + else if (*p == 'n') *q++ = '\n'; + else if (*p == 'r') *q++ = '\r'; + else if (*p == '\\') *q++ = '\\'; + } else *q++ = *p; + } + *q = '\0'; + return s; +} + +int bwa_set_rg(const char *s) +{ + char *p, *q, *r; + if (strstr(s, "@RG") != s) return -1; + if (bwa_rg_line) free(bwa_rg_line); + if (bwa_rg_id) free(bwa_rg_id); + bwa_rg_line = strdup(s); + bwa_rg_id = 0; + bwa_escape(bwa_rg_line); + p = strstr(bwa_rg_line, "\tID:"); + if (p == 0) return -1; + p += 4; + for (q = p; *q && *q != '\t' && *q != '\n'; ++q); + bwa_rg_id = calloc(q - p + 1, 1); + for (q = p, r = bwa_rg_id; *q && *q != '\t' && *q != '\n'; ++q) + *r++ = *q; + return 0; +} + +void bwa_sai2sam_se_core(const char *prefix, const char 
*fn_sa, const char *fn_fa, int n_occ) +{ + extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa); + int i, n_seqs, tot_seqs = 0, m_aln; + bwt_aln1_t *aln = 0; + bwa_seq_t *seqs; + bwa_seqio_t *ks; + clock_t t; + bntseq_t *bns, *ntbns = 0; + FILE *fp_sa; + gap_opt_t opt; + + // initialization + bwase_initialize(); + bns = bns_restore(prefix); + srand48(bns->seed); + fp_sa = xopen(fn_sa, "r"); + + m_aln = 0; + fread(&opt, sizeof(gap_opt_t), 1, fp_sa); + if (!(opt.mode & BWA_MODE_COMPREAD)) // in color space; initialize ntpac + ntbns = bwa_open_nt(prefix); + bwa_print_sam_SQ(bns); + //bwa_print_sam_PG(); + // set ks + ks = bwa_open_reads(opt.mode, fn_fa); + // core loop + while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt.mode, opt.trim_qual)) != 0) { + tot_seqs += n_seqs; + t = clock(); + + // read alignment + for (i = 0; i < n_seqs; ++i) { + bwa_seq_t *p = seqs + i; + int n_aln; + fread(&n_aln, 4, 1, fp_sa); + if (n_aln > m_aln) { + m_aln = n_aln; + aln = (bwt_aln1_t*)realloc(aln, sizeof(bwt_aln1_t) * m_aln); + } + fread(aln, sizeof(bwt_aln1_t), n_aln, fp_sa); + bwa_aln2seq_core(n_aln, aln, p, 1, n_occ); + } + + fprintf(stderr, "[bwa_aln_core] convert to sequence coordinate... "); + bwa_cal_pac_pos(bns, prefix, n_seqs, seqs, opt.max_diff, opt.fnr); // forward bwt will be destroyed here + fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); + + fprintf(stderr, "[bwa_aln_core] refine gapped alignments... "); + bwa_refine_gapped(bns, n_seqs, seqs, 0, ntbns); + fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); + + fprintf(stderr, "[bwa_aln_core] print alignments... 
"); + for (i = 0; i < n_seqs; ++i) + bwa_print_sam1(bns, seqs + i, 0, opt.mode, opt.max_top2); + fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); + + bwa_free_read_seq(n_seqs, seqs); + fprintf(stderr, "[bwa_aln_core] %d sequences have been processed.\n", tot_seqs); + } + + // destroy + bwa_seq_close(ks); + if (ntbns) bns_destroy(ntbns); + bns_destroy(bns); + fclose(fp_sa); + free(aln); +} + +int bwa_sai2sam_se(int argc, char *argv[]) +{ + extern char *bwa_infer_prefix(const char *hint); + int c, n_occ = 3; + char *prefix; + while ((c = getopt(argc, argv, "hn:f:r:")) >= 0) { + switch (c) { + case 'h': break; + case 'r': + if (bwa_set_rg(optarg) < 0) { + fprintf(stderr, "[%s] malformated @RG line\n", __func__); + return 1; + } + break; + case 'n': n_occ = atoi(optarg); break; + case 'f': xreopen(optarg, "w", stdout); break; + default: return 1; + } + } + + if (optind + 3 > argc) { + fprintf(stderr, "Usage: bwa samse [-n max_occ] [-f out.sam] [-r RG_line] \n"); + return 1; + } + if ((prefix = bwa_infer_prefix(argv[optind])) == 0) { + fprintf(stderr, "[%s] fail to locate the index\n", __func__); + free(bwa_rg_line); free(bwa_rg_id); + return 0; + } + bwa_sai2sam_se_core(prefix, argv[optind+1], argv[optind+2], n_occ); + free(bwa_rg_line); free(bwa_rg_id); + return 0; +} diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/bwase.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/bwase.h Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,29 @@ +#ifndef BWASE_H +#define BWASE_H + +#include "bntseq.h" +#include "bwt.h" +#include "bwtaln.h" + +#ifdef __cplusplus +extern "C" { +#endif + + // Initialize mapping tables in the bwa single-end mapper. + void bwase_initialize(); + // Calculate the approximate position of the sequence from the specified bwt with loaded suffix array. 
+ void bwa_cal_pac_pos_core(const bntseq_t *bns, const bwt_t* bwt, bwa_seq_t* seq, const int max_mm, const float fnr); + // Refine the approximate position of the sequence to an actual placement for the sequence. + void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns); + // Backfill certain alignment properties mainly centering around number of matches. + void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s); + // Calculate the end position of a read given a certain sequence. + int64_t pos_end(const bwa_seq_t *p); + // + bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int len, int *strand); + +#ifdef __cplusplus +} +#endif + +#endif // BWASE_H diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/bwaseqio.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/bwaseqio.c Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,227 @@ +#include +#include +#include "bwtaln.h" +#include "utils.h" +#include "bamlite.h" + +#include "kseq.h" +KSEQ_INIT(gzFile, gzread) + +extern unsigned char nst_nt4_table[256]; +static char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 }; + +struct __bwa_seqio_t { + // for BAM input + int is_bam, which; // 1st bit: read1, 2nd bit: read2, 3rd: SE + bamFile fp; + // for fastq input + kseq_t *ks; +}; + +bwa_seqio_t *bwa_bam_open(const char *fn, int which) +{ + bwa_seqio_t *bs; + bam_header_t *h; + bs = (bwa_seqio_t*)calloc(1, sizeof(bwa_seqio_t)); + bs->is_bam = 1; + bs->which = which; + bs->fp = bam_open(fn, "r"); + h = bam_header_read(bs->fp); + bam_header_destroy(h); + return bs; +} + +bwa_seqio_t *bwa_seq_open(const char *fn) +{ + gzFile fp; + bwa_seqio_t *bs; + bs = (bwa_seqio_t*)calloc(1, sizeof(bwa_seqio_t)); + fp = xzopen(fn, "r"); + bs->ks = kseq_init(fp); + return bs; +} + +void bwa_seq_close(bwa_seqio_t *bs) +{ + if (bs == 0) return; + if (bs->is_bam) bam_close(bs->fp); + else { + gzclose(bs->ks->f->f); + kseq_destroy(bs->ks); + 
} + free(bs); +} + +void seq_reverse(int len, ubyte_t *seq, int is_comp) +{ + int i; + if (is_comp) { + for (i = 0; i < len>>1; ++i) { + char tmp = seq[len-1-i]; + if (tmp < 4) tmp = 3 - tmp; + seq[len-1-i] = (seq[i] >= 4)? seq[i] : 3 - seq[i]; + seq[i] = tmp; + } + if (len&1) seq[i] = (seq[i] >= 4)? seq[i] : 3 - seq[i]; + } else { + for (i = 0; i < len>>1; ++i) { + char tmp = seq[len-1-i]; + seq[len-1-i] = seq[i]; seq[i] = tmp; + } + } +} + +int bwa_trim_read(int trim_qual, bwa_seq_t *p) +{ + int s = 0, l, max = 0, max_l = p->len; + if (trim_qual < 1 || p->qual == 0) return 0; + for (l = p->len - 1; l >= BWA_MIN_RDLEN; --l) { + s += trim_qual - (p->qual[l] - 33); + if (s < 0) break; + if (s > max) max = s, max_l = l; + } + p->clip_len = p->len = max_l; + return p->full_len - p->len; +} + +static bwa_seq_t *bwa_read_bam(bwa_seqio_t *bs, int n_needed, int *n, int is_comp, int trim_qual) +{ + bwa_seq_t *seqs, *p; + int n_seqs, l, i; + long n_trimmed = 0, n_tot = 0; + bam1_t *b; + + b = bam_init1(); + n_seqs = 0; + seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t)); + while (bam_read1(bs->fp, b) >= 0) { + uint8_t *s, *q; + int go = 0; + if ((bs->which & 1) && (b->core.flag & BAM_FREAD1)) go = 1; + if ((bs->which & 2) && (b->core.flag & BAM_FREAD2)) go = 1; + if ((bs->which & 4) && !(b->core.flag& BAM_FREAD1) && !(b->core.flag& BAM_FREAD2))go = 1; + if (go == 0) continue; + l = b->core.l_qseq; + p = &seqs[n_seqs++]; + p->tid = -1; // no assigned to a thread + p->qual = 0; + p->full_len = p->clip_len = p->len = l; + n_tot += p->full_len; + s = bam1_seq(b); q = bam1_qual(b); + p->seq = (ubyte_t*)calloc(p->len + 1, 1); + p->qual = (ubyte_t*)calloc(p->len + 1, 1); + for (i = 0; i != p->full_len; ++i) { + p->seq[i] = bam_nt16_nt4_table[(int)bam1_seqi(s, i)]; + p->qual[i] = q[i] + 33 < 126? 
q[i] + 33 : 126; + } + if (bam1_strand(b)) { // then reverse + seq_reverse(p->len, p->seq, 1); + seq_reverse(p->len, p->qual, 0); + } + if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p); + p->rseq = (ubyte_t*)calloc(p->full_len, 1); + memcpy(p->rseq, p->seq, p->len); + seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped() + seq_reverse(p->len, p->rseq, is_comp); + p->name = strdup((const char*)bam1_qname(b)); + if (n_seqs == n_needed) break; + } + *n = n_seqs; + if (n_seqs && trim_qual >= 1) + fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot); + if (n_seqs == 0) { + free(seqs); + bam_destroy1(b); + return 0; + } + bam_destroy1(b); + return seqs; +} + +#define BARCODE_LOW_QUAL 13 + +bwa_seq_t *bwa_read_seq(bwa_seqio_t *bs, int n_needed, int *n, int mode, int trim_qual) +{ + bwa_seq_t *seqs, *p; + kseq_t *seq = bs->ks; + int n_seqs, l, i, is_comp = mode&BWA_MODE_COMPREAD, is_64 = mode&BWA_MODE_IL13, l_bc = mode>>24; + long n_trimmed = 0, n_tot = 0; + + if (l_bc > BWA_MAX_BCLEN) { + fprintf(stderr, "[%s] the maximum barcode length is %d.\n", __func__, BWA_MAX_BCLEN); + return 0; + } + if (bs->is_bam) return bwa_read_bam(bs, n_needed, n, is_comp, trim_qual); // l_bc has no effect for BAM input + n_seqs = 0; + seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t)); + while ((l = kseq_read(seq)) >= 0) { + if ((mode & BWA_MODE_CFY) && (seq->comment.l != 0)) { + // skip reads that are marked to be filtered by Casava + char *s = index(seq->comment.s, ':'); + if (s && *(++s) == 'Y') { + continue; + } + } + if (is_64 && seq->qual.l) + for (i = 0; i < seq->qual.l; ++i) seq->qual.s[i] -= 31; + if (seq->seq.l <= l_bc) continue; // sequence length equals or smaller than the barcode length + p = &seqs[n_seqs++]; + if (l_bc) { // then trim barcode + for (i = 0; i < l_bc; ++i) + p->bc[i] = (seq->qual.l && seq->qual.s[i]-33 < BARCODE_LOW_QUAL)? 
tolower(seq->seq.s[i]) : toupper(seq->seq.s[i]); + p->bc[i] = 0; + for (; i < seq->seq.l; ++i) + seq->seq.s[i - l_bc] = seq->seq.s[i]; + seq->seq.l -= l_bc; seq->seq.s[seq->seq.l] = 0; + if (seq->qual.l) { + for (i = l_bc; i < seq->qual.l; ++i) + seq->qual.s[i - l_bc] = seq->qual.s[i]; + seq->qual.l -= l_bc; seq->qual.s[seq->qual.l] = 0; + } + l = seq->seq.l; + } else p->bc[0] = 0; + p->tid = -1; // no assigned to a thread + p->qual = 0; + p->full_len = p->clip_len = p->len = l; + n_tot += p->full_len; + p->seq = (ubyte_t*)calloc(p->len, 1); + for (i = 0; i != p->full_len; ++i) + p->seq[i] = nst_nt4_table[(int)seq->seq.s[i]]; + if (seq->qual.l) { // copy quality + p->qual = (ubyte_t*)strdup((char*)seq->qual.s); + if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p); + } + p->rseq = (ubyte_t*)calloc(p->full_len, 1); + memcpy(p->rseq, p->seq, p->len); + seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped() + seq_reverse(p->len, p->rseq, is_comp); + p->name = strdup((const char*)seq->name.s); + { // trim /[12]$ + int t = strlen(p->name); + if (t > 2 && p->name[t-2] == '/' && (p->name[t-1] == '1' || p->name[t-1] == '2')) p->name[t-2] = '\0'; + } + if (n_seqs == n_needed) break; + } + *n = n_seqs; + if (n_seqs && trim_qual >= 1) + fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot); + if (n_seqs == 0) { + free(seqs); + return 0; + } + return seqs; +} + +void bwa_free_read_seq(int n_seqs, bwa_seq_t *seqs) +{ + int i, j; + for (i = 0; i != n_seqs; ++i) { + bwa_seq_t *p = seqs + i; + for (j = 0; j < p->n_multi; ++j) + if (p->multi[j].cigar) free(p->multi[j].cigar); + free(p->name); + free(p->seq); free(p->rseq); free(p->qual); free(p->aln); free(p->md); free(p->multi); + free(p->cigar); + } + free(seqs); +} diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/bwt.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/bwt.c Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,339 @@ +/* The MIT 
License + + Copyright (c) 2008 Genome Research Ltd (GRL). + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
+*/ + +/* Contact: Heng Li */ + +#include +#include +#include +#include +#include +#include "utils.h" +#include "bwt.h" +#include "kvec.h" + +void bwt_gen_cnt_table(bwt_t *bwt) +{ + int i, j; + for (i = 0; i != 256; ++i) { + uint32_t x = 0; + for (j = 0; j != 4; ++j) + x |= (((i&3) == j) + ((i>>2&3) == j) + ((i>>4&3) == j) + (i>>6 == j)) << (j<<3); + bwt->cnt_table[i] = x; + } +} + +// bwt->bwt and bwt->occ must be precalculated +void bwt_cal_sa(bwt_t *bwt, int intv) +{ + bwtint_t isa, sa, i; // S(isa) = sa + int intv_round = intv; + + kv_roundup32(intv_round); + xassert(intv_round == intv, "SA sample interval is not a power of 2."); + xassert(bwt->bwt, "bwt_t::bwt is not initialized."); + + if (bwt->sa) free(bwt->sa); + bwt->sa_intv = intv; + bwt->n_sa = (bwt->seq_len + intv) / intv; + bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t)); + if (bwt->sa == 0) { + fprintf(stderr, "[%s] Fail to allocate %.3fMB memory. Abort!\n", __func__, bwt->n_sa * sizeof(bwtint_t) / 1024.0/1024.0); + abort(); + } + // calculate SA value + isa = 0; sa = bwt->seq_len; + for (i = 0; i < bwt->seq_len; ++i) { + if (isa % intv == 0) bwt->sa[isa/intv] = sa; + --sa; + isa = bwt_invPsi(bwt, isa); + } + if (isa % intv == 0) bwt->sa[isa/intv] = sa; + bwt->sa[0] = (bwtint_t)-1; // before this line, bwt->sa[0] = bwt->seq_len +} + +bwtint_t bwt_sa(const bwt_t *bwt, bwtint_t k) +{ + bwtint_t sa = 0, mask = bwt->sa_intv - 1; + while (k & mask) { + ++sa; + k = bwt_invPsi(bwt, k); + } + /* without setting bwt->sa[0] = -1, the following line should be + changed to (sa + bwt->sa[k/bwt->sa_intv]) % (bwt->seq_len + 1) */ + return sa + bwt->sa[k/bwt->sa_intv]; +} + +static inline int __occ_aux(uint64_t y, int c) +{ + // reduce nucleotide counting to bits counting + y = ((c&2)? y : ~y) >> 1 & ((c&1)? 
y : ~y) & 0x5555555555555555ull; + // count the number of 1s in y + y = (y & 0x3333333333333333ull) + (y >> 2 & 0x3333333333333333ull); + return ((y + (y >> 4)) & 0xf0f0f0f0f0f0f0full) * 0x101010101010101ull >> 56; +} + +inline bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c) +{ + bwtint_t n, l, j; + uint32_t *p; + + if (k == bwt->seq_len) return bwt->L2[c+1] - bwt->L2[c]; + if (k == (bwtint_t)(-1)) return 0; + if (k >= bwt->primary) --k; // because $ is not in bwt + + // retrieve Occ at k/OCC_INTERVAL + n = ((bwtint_t*)(p = bwt_occ_intv(bwt, k)))[c]; + p += sizeof(bwtint_t); // jump to the start of the first BWT cell + + // calculate Occ up to the last k/32 + j = k >> 5 << 5; + for (l = k/OCC_INTERVAL*OCC_INTERVAL; l < j; l += 32, p += 2) + n += __occ_aux((uint64_t)p[0]<<32 | p[1], c); + + // calculate Occ + n += __occ_aux(((uint64_t)p[0]<<32 | p[1]) & ~((1ull<<((~k&31)<<1)) - 1), c); + if (c == 0) n -= ~k&31; // corrected for the masked bits + + return n; +} + +// an analogy to bwt_occ() but more efficient, requiring k <= l +inline void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, bwtint_t *ol) +{ + bwtint_t _k, _l; + _k = (k >= bwt->primary)? k-1 : k; + _l = (l >= bwt->primary)? 
l-1 : l; + if (_l/OCC_INTERVAL != _k/OCC_INTERVAL || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) { + *ok = bwt_occ(bwt, k, c); + *ol = bwt_occ(bwt, l, c); + } else { + bwtint_t m, n, i, j; + uint32_t *p; + if (k >= bwt->primary) --k; + if (l >= bwt->primary) --l; + n = ((bwtint_t*)(p = bwt_occ_intv(bwt, k)))[c]; + p += sizeof(bwtint_t); + // calculate *ok + j = k >> 5 << 5; + for (i = k/OCC_INTERVAL*OCC_INTERVAL; i < j; i += 32, p += 2) + n += __occ_aux((uint64_t)p[0]<<32 | p[1], c); + m = n; + n += __occ_aux(((uint64_t)p[0]<<32 | p[1]) & ~((1ull<<((~k&31)<<1)) - 1), c); + if (c == 0) n -= ~k&31; // corrected for the masked bits + *ok = n; + // calculate *ol + j = l >> 5 << 5; + for (; i < j; i += 32, p += 2) + m += __occ_aux((uint64_t)p[0]<<32 | p[1], c); + m += __occ_aux(((uint64_t)p[0]<<32 | p[1]) & ~((1ull<<((~l&31)<<1)) - 1), c); + if (c == 0) m -= ~l&31; // corrected for the masked bits + *ol = m; + } +} + +#define __occ_aux4(bwt, b) \ + ((bwt)->cnt_table[(b)&0xff] + (bwt)->cnt_table[(b)>>8&0xff] \ + + (bwt)->cnt_table[(b)>>16&0xff] + (bwt)->cnt_table[(b)>>24]) + +inline void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]) +{ + bwtint_t l, j, x; + uint32_t *p; + if (k == (bwtint_t)(-1)) { + memset(cnt, 0, 4 * sizeof(bwtint_t)); + return; + } + if (k >= bwt->primary) --k; // because $ is not in bwt + p = bwt_occ_intv(bwt, k); + memcpy(cnt, p, 4 * sizeof(bwtint_t)); + p += sizeof(bwtint_t); + j = k >> 4 << 4; + for (l = k / OCC_INTERVAL * OCC_INTERVAL, x = 0; l < j; l += 16, ++p) + x += __occ_aux4(bwt, *p); + x += __occ_aux4(bwt, *p & ~((1U<<((~k&15)<<1)) - 1)) - (~k&15); + cnt[0] += x&0xff; cnt[1] += x>>8&0xff; cnt[2] += x>>16&0xff; cnt[3] += x>>24; +} + +// an analogy to bwt_occ4() but more efficient, requiring k <= l +inline void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4]) +{ + bwtint_t _k, _l; + _k = (k >= bwt->primary)? k-1 : k; + _l = (l >= bwt->primary)? 
l-1 : l; + if (_l/OCC_INTERVAL != _k/OCC_INTERVAL || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) { + bwt_occ4(bwt, k, cntk); + bwt_occ4(bwt, l, cntl); + } else { + bwtint_t i, j, x, y; + uint32_t *p; + if (k >= bwt->primary) --k; // because $ is not in bwt + if (l >= bwt->primary) --l; + p = bwt_occ_intv(bwt, k); + memcpy(cntk, p, 4 * sizeof(bwtint_t)); + p += sizeof(bwtint_t); + // prepare cntk[] + j = k >> 4 << 4; + for (i = k / OCC_INTERVAL * OCC_INTERVAL, x = 0; i < j; i += 16, ++p) + x += __occ_aux4(bwt, *p); + y = x; + x += __occ_aux4(bwt, *p & ~((1U<<((~k&15)<<1)) - 1)) - (~k&15); + // calculate cntl[] and finalize cntk[] + j = l >> 4 << 4; + for (; i < j; i += 16, ++p) y += __occ_aux4(bwt, *p); + y += __occ_aux4(bwt, *p & ~((1U<<((~l&15)<<1)) - 1)) - (~l&15); + memcpy(cntl, cntk, 4 * sizeof(bwtint_t)); + cntk[0] += x&0xff; cntk[1] += x>>8&0xff; cntk[2] += x>>16&0xff; cntk[3] += x>>24; + cntl[0] += y&0xff; cntl[1] += y>>8&0xff; cntl[2] += y>>16&0xff; cntl[3] += y>>24; + } +} + +int bwt_match_exact(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *sa_begin, bwtint_t *sa_end) +{ + bwtint_t k, l, ok, ol; + int i; + k = 0; l = bwt->seq_len; + for (i = len - 1; i >= 0; --i) { + ubyte_t c = str[i]; + if (c > 3) return 0; // no match + bwt_2occ(bwt, k - 1, l, c, &ok, &ol); + k = bwt->L2[c] + ok + 1; + l = bwt->L2[c] + ol; + if (k > l) break; // no match + } + if (k > l) return 0; // no match + if (sa_begin) *sa_begin = k; + if (sa_end) *sa_end = l; + return l - k + 1; +} + +int bwt_match_exact_alt(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *k0, bwtint_t *l0) +{ + int i; + bwtint_t k, l, ok, ol; + k = *k0; l = *l0; + for (i = len - 1; i >= 0; --i) { + ubyte_t c = str[i]; + if (c > 3) return 0; // there is an N here. 
no match + bwt_2occ(bwt, k - 1, l, c, &ok, &ol); + k = bwt->L2[c] + ok + 1; + l = bwt->L2[c] + ol; + if (k > l) return 0; // no match + } + *k0 = k; *l0 = l; + return l - k + 1; +} + +/********************* + * Bidirectional BWT * + *********************/ + +void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_back) +{ + bwtint_t tk[4], tl[4]; + int i; + bwt_2occ4(bwt, ik->x[!is_back] - 1, ik->x[!is_back] - 1 + ik->x[2], tk, tl); + for (i = 0; i != 4; ++i) { + ok[i].x[!is_back] = bwt->L2[i] + 1 + tk[i]; + ok[i].x[2] = tl[i] - tk[i]; + } + ok[3].x[is_back] = ik->x[is_back] + (ik->x[!is_back] <= bwt->primary && ik->x[!is_back] + ik->x[2] - 1 >= bwt->primary); + ok[2].x[is_back] = ok[3].x[is_back] + ok[3].x[2]; + ok[1].x[is_back] = ok[2].x[is_back] + ok[2].x[2]; + ok[0].x[is_back] = ok[1].x[is_back] + ok[1].x[2]; +} + +static void bwt_reverse_intvs(bwtintv_v *p) +{ + if (p->n > 1) { + int j; + for (j = 0; j < p->n>>1; ++j) { + bwtintv_t tmp = p->a[p->n - 1 - j]; + p->a[p->n - 1 - j] = p->a[j]; + p->a[j] = tmp; + } + } +} + +int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem, bwtintv_v *tmpvec[2]) +{ + int i, j, c, ret; + bwtintv_t ik, ok[4]; + bwtintv_v a[2], *prev, *curr, *swap; + + mem->n = 0; + if (q[x] > 3) return x + 1; + kv_init(a[0]); kv_init(a[1]); + prev = tmpvec[0]? tmpvec[0] : &a[0]; + curr = tmpvec[1]? 
tmpvec[1] : &a[1]; + bwt_set_intv(bwt, q[x], ik); + ik.info = x + 1; + + for (i = x + 1, curr->n = 0; i < len; ++i) { // forward search + if (q[i] < 4) { + c = 3 - q[i]; + bwt_extend(bwt, &ik, ok, 0); + if (ok[c].x[2] != ik.x[2]) // change of the interval size + kv_push(bwtintv_t, *curr, ik); + if (ok[c].x[2] == 0) break; // cannot be extended + ik = ok[c]; ik.info = i + 1; + } else { // an ambiguous base + kv_push(bwtintv_t, *curr, ik); + break; // cannot be extended; in this case, i < len always stands + } + } + if (i == len) kv_push(bwtintv_t, *curr, ik); // push the last interval if we reach the end + bwt_reverse_intvs(curr); // s.t. smaller intervals visited first + ret = curr->a[0].info; // this will be the returned value + swap = curr; curr = prev; prev = swap; + + for (i = x - 1; i >= -1; --i) { // backward search for MEMs + if (i >= 0 && q[i] > 3) break; /* i == -1 is the virtual position before the query: q[-1] must never be read */ + c = i < 0? 0 : q[i]; + for (j = 0, curr->n = 0; j < prev->n; ++j) { + bwtintv_t *p = &prev->a[j]; + bwt_extend(bwt, p, ok, 1); + if (ok[c].x[2] == 0 || i == -1) { // keep the hit if reaching the beginning or not extended further + if (curr->n == 0) { // curr->n to make sure there is no longer matches + if (mem->n == 0 || i + 1 < mem->a[mem->n-1].info>>32) { // skip contained matches + ik = *p; ik.info |= (uint64_t)(i + 1)<<32; + kv_push(bwtintv_t, *mem, ik); + } + } // otherwise the match is contained in another longer match + } + if (ok[c].x[2] && (curr->n == 0 || ok[c].x[2] != curr->a[curr->n-1].x[2])) { + ok[c].info = p->info; + kv_push(bwtintv_t, *curr, ok[c]); + } + } + if (curr->n == 0) break; + swap = curr; curr = prev; prev = swap; + } + bwt_reverse_intvs(mem); // s.t. sorted by the start coordinate + + if (tmpvec[0] == 0) free(a[0].a); + if (tmpvec[1] == 0) free(a[1].a); + return ret; +} diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/bwt.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/bwt.h Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,130 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). 
+ + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
+*/ + +/* Contact: Heng Li */ + +#ifndef BWA_BWT_H +#define BWA_BWT_H + +#include + +// requirement: (OCC_INTERVAL%16 == 0); please DO NOT change this line +#define OCC_INTERVAL 0x80 + +#ifndef BWA_UBYTE +#define BWA_UBYTE +typedef unsigned char ubyte_t; +#endif + +typedef uint64_t bwtint_t; + +typedef struct { + bwtint_t primary; // S^{-1}(0), or the primary index of BWT + bwtint_t L2[5]; // C(), cumulative count + bwtint_t seq_len; // sequence length + bwtint_t bwt_size; // size of bwt, about seq_len/4 + uint32_t *bwt; // BWT + // occurance array, separated to two parts + uint32_t cnt_table[256]; + // suffix array + int sa_intv; + bwtint_t n_sa; + bwtint_t *sa; +} bwt_t; + +typedef struct { + bwtint_t x[3], info; +} bwtintv_t; + +typedef struct { size_t n, m; bwtintv_t *a; } bwtintv_v; + +/* For general OCC_INTERVAL, the following is correct: +#define bwt_bwt(b, k) ((b)->bwt[(k)/OCC_INTERVAL * (OCC_INTERVAL/(sizeof(uint32_t)*8/2) + sizeof(bwtint_t)/4*4) + sizeof(bwtint_t)/4*4 + (k)%OCC_INTERVAL/16]) +#define bwt_occ_intv(b, k) ((b)->bwt + (k)/OCC_INTERVAL * (OCC_INTERVAL/(sizeof(uint32_t)*8/2) + sizeof(bwtint_t)/4*4) +*/ + +// The following two lines are ONLY correct when OCC_INTERVAL==0x80 +#define bwt_bwt(b, k) ((b)->bwt[((k)>>7<<4) + sizeof(bwtint_t) + (((k)&0x7f)>>4)]) +#define bwt_occ_intv(b, k) ((b)->bwt + ((k)>>7<<4)) + +/* retrieve a character from the $-removed BWT string. Note that + * bwt_t::bwt is not exactly the BWT string and therefore this macro is + * called bwt_B0 instead of bwt_B */ +#define bwt_B0(b, k) (bwt_bwt(b, k)>>((~(k)&0xf)<<1)&3) + +// inverse Psi function +#define bwt_invPsi(bwt, k) \ + (((k) == (bwt)->primary)? 0 : \ + ((k) < (bwt)->primary)? 
\ + (bwt)->L2[bwt_B0(bwt, k)] + bwt_occ(bwt, k, bwt_B0(bwt, k)) \ + : (bwt)->L2[bwt_B0(bwt, (k)-1)] + bwt_occ(bwt, k, bwt_B0(bwt, (k)-1))) + +#define bwt_set_intv(bwt, c, ik) ((ik).x[0] = (bwt)->L2[(int)(c)]+1, (ik).x[2] = (bwt)->L2[(int)(c)+1]-(bwt)->L2[(int)(c)], (ik).x[1] = (bwt)->L2[3-(c)]+1, (ik).info = 0) + +#ifdef __cplusplus +extern "C" { +#endif + + void bwt_dump_bwt(const char *fn, const bwt_t *bwt); + void bwt_dump_sa(const char *fn, const bwt_t *bwt); + + bwt_t *bwt_restore_bwt(const char *fn); + void bwt_restore_sa(const char *fn, bwt_t *bwt); + + void bwt_destroy(bwt_t *bwt); + + void bwt_bwtgen(const char *fn_pac, const char *fn_bwt); // from BWT-SW + void bwt_cal_sa(bwt_t *bwt, int intv); + + void bwt_bwtupdate_core(bwt_t *bwt); + + inline bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c); + inline void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]); + bwtint_t bwt_sa(const bwt_t *bwt, bwtint_t k); + + // more efficient version of bwt_occ/bwt_occ4 for retrieving two close Occ values + void bwt_gen_cnt_table(bwt_t *bwt); + inline void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, bwtint_t *ol); + inline void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4]); + + int bwt_match_exact(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *sa_begin, bwtint_t *sa_end); + int bwt_match_exact_alt(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *k0, bwtint_t *l0); + + /** + * Extend bi-SA-interval _ik_ + */ + void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_back); + + /** + * Given a query _q_, collect potential SMEMs covering position _x_ and store them in _mem_. + * Return the end of the longest exact match starting from _x_. 
+ */ + int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem, bwtintv_v *tmpvec[2]); + +#ifdef __cplusplus +} +#endif + +#endif diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/bwt_gen.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/bwt_gen.c Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,1566 @@ +/* + + BWTConstruct.c BWT-Index Construction + + This module constructs BWT and auxiliary data structures. + + Copyright (C) 2004, Wong Chi Kwong. + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; either version 2 + of the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +*/ + +#include +#include +#include +#include +#include +#include "QSufSort.h" + +typedef uint64_t bgint_t; +typedef int64_t sbgint_t; + +#define ALPHABET_SIZE 4 +#define BIT_PER_CHAR 2 +#define CHAR_PER_WORD 16 +#define CHAR_PER_BYTE 4 + +#define BITS_IN_WORD 32 +#define BITS_IN_BYTE 8 +#define BYTES_IN_WORD 4 + +#define ALL_ONE_MASK 0xFFFFFFFF +#define DNA_OCC_CNT_TABLE_SIZE_IN_WORD 65536 + +#define BITS_PER_OCC_VALUE 16 +#define OCC_VALUE_PER_WORD 2 +#define OCC_INTERVAL 256 +#define OCC_INTERVAL_MAJOR 65536 + +#define TRUE 1 +#define FALSE 0 + +#define BWTINC_INSERT_SORT_NUM_ITEM 7 + +#define MIN_AVAILABLE_WORD 0x10000 + +#define average(value1, value2) ( ((value1) & (value2)) + ((value1) ^ (value2)) / 2 ) +#define min(value1, value2) ( ((value1) < (value2)) ? 
(value1) : (value2) ) +#define max(value1, value2) ( ((value1) > (value2)) ? (value1) : (value2) ) +#define med3(a, b, c) ( ac ? b : a>c ? c : a)) +#define swap(a, b, t); t = a; a = b; b = t; +#define truncateLeft(value, offset) ( (value) << (offset) >> (offset) ) +#define truncateRight(value, offset) ( (value) >> (offset) << (offset) ) +#define DNA_OCC_SUM_EXCEPTION(sum) ((sum & 0xfefefeff) == 0) + +typedef struct BWT { + bgint_t textLength; // length of the text + bgint_t inverseSa0; // SA-1[0] + bgint_t *cumulativeFreq; // cumulative frequency + unsigned int *bwtCode; // BWT code + unsigned int *occValue; // Occurrence values stored explicitly + bgint_t *occValueMajor; // Occurrence values stored explicitly + unsigned int *decodeTable; // For decoding BWT by table lookup + bgint_t bwtSizeInWord; // Temporary variable to hold the memory allocated + bgint_t occSizeInWord; // Temporary variable to hold the memory allocated + bgint_t occMajorSizeInWord; // Temporary variable to hold the memory allocated +} BWT; + +typedef struct BWTInc { + BWT *bwt; + unsigned int numberOfIterationDone; + bgint_t *cumulativeCountInCurrentBuild; + bgint_t availableWord; + bgint_t buildSize; + bgint_t initialMaxBuildSize; + bgint_t incMaxBuildSize; + unsigned int firstCharInLastIteration; + unsigned int *workingMemory; + unsigned int *packedText; + unsigned char *textBuffer; + unsigned int *packedShift; +} BWTInc; + +static bgint_t TextLengthFromBytePacked(bgint_t bytePackedLength, unsigned int bitPerChar, + unsigned int lastByteLength) +{ + return (bytePackedLength - 1) * (BITS_IN_BYTE / bitPerChar) + lastByteLength; +} + +static void initializeVAL(unsigned int *startAddr, const bgint_t length, const unsigned int initValue) +{ + bgint_t i; + for (i=0; i>= 2; + } + } + +} +// for BWTIncCreate() +static bgint_t BWTOccValueMajorSizeInWord(const bgint_t numChar) +{ + bgint_t numOfOccValue; + unsigned numOfOccIntervalPerMajor; + numOfOccValue = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL 
+ 1; // Value at both end for bi-directional encoding + numOfOccIntervalPerMajor = OCC_INTERVAL_MAJOR / OCC_INTERVAL; + return (numOfOccValue + numOfOccIntervalPerMajor - 1) / numOfOccIntervalPerMajor * ALPHABET_SIZE; +} +// for BWTIncCreate() +static bgint_t BWTOccValueMinorSizeInWord(const bgint_t numChar) +{ + bgint_t numOfOccValue; + numOfOccValue = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; // Value at both end for bi-directional encoding + return (numOfOccValue + OCC_VALUE_PER_WORD - 1) / OCC_VALUE_PER_WORD * ALPHABET_SIZE; +} +// for BWTIncCreate() +static bgint_t BWTResidentSizeInWord(const bgint_t numChar) { + + bgint_t numCharRoundUpToOccInterval; + + // The $ in BWT at the position of inverseSa0 is not encoded + numCharRoundUpToOccInterval = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL * OCC_INTERVAL; + + return (numCharRoundUpToOccInterval + CHAR_PER_WORD - 1) / CHAR_PER_WORD; + +} + +static void BWTIncSetBuildSizeAndTextAddr(BWTInc *bwtInc) +{ + bgint_t maxBuildSize; + + if (bwtInc->bwt->textLength == 0) { + // initial build + // Minus 2 because n+1 entries of seq and rank needed for n char + maxBuildSize = (bwtInc->availableWord - (2 + OCC_INTERVAL / CHAR_PER_WORD) * (sizeof(bgint_t) / 4)) + / (2 * CHAR_PER_WORD + 1) * CHAR_PER_WORD / (sizeof(bgint_t) / 4); + if (bwtInc->initialMaxBuildSize > 0) { + bwtInc->buildSize = min(bwtInc->initialMaxBuildSize, maxBuildSize); + } else { + bwtInc->buildSize = maxBuildSize; + } + } else { + // Minus 3 because n+1 entries of sorted rank, seq and rank needed for n char + // Minus numberOfIterationDone because bwt slightly shift to left in each iteration + maxBuildSize = (bwtInc->availableWord - bwtInc->bwt->bwtSizeInWord - bwtInc->bwt->occSizeInWord + - (3 + bwtInc->numberOfIterationDone * OCC_INTERVAL / BIT_PER_CHAR) * (sizeof(bgint_t) / 4)) + / 3 / (sizeof(bgint_t) / 4); + if (maxBuildSize < CHAR_PER_WORD) { + fprintf(stderr, "BWTIncSetBuildSizeAndTextAddr(): Not enough space allocated to continue 
construction!\n"); + exit(1); + } + if (bwtInc->incMaxBuildSize > 0) { + bwtInc->buildSize = min(bwtInc->incMaxBuildSize, maxBuildSize); + } else { + bwtInc->buildSize = maxBuildSize; + } + if (bwtInc->buildSize < CHAR_PER_WORD) + bwtInc->buildSize = CHAR_PER_WORD; + } + + if (bwtInc->buildSize < CHAR_PER_WORD) { + fprintf(stderr, "BWTIncSetBuildSizeAndTextAddr(): Not enough space allocated to continue construction!\n"); + exit(1); + } + + bwtInc->buildSize = bwtInc->buildSize / CHAR_PER_WORD * CHAR_PER_WORD; + + bwtInc->packedText = bwtInc->workingMemory + 2 * (bwtInc->buildSize + 1) * (sizeof(bgint_t) / 4); + bwtInc->textBuffer = (unsigned char*)(bwtInc->workingMemory + (bwtInc->buildSize + 1) * (sizeof(bgint_t) / 4)); +} + +// for ceilLog2() +unsigned int leadingZero(const unsigned int input) +{ + unsigned int l; + const static unsigned int leadingZero8bit[256] = {8,7,6,6,5,5,5,5,4,4,4,4,4,4,4,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; + + if (input & 0xFFFF0000) { + if (input & 0xFF000000) { + l = leadingZero8bit[input >> 24]; + } else { + l = 8 + leadingZero8bit[input >> 16]; + } + } else { + if (input & 0x0000FF00) { + l = 16 + leadingZero8bit[input >> 8]; + } else { + l = 24 + leadingZero8bit[input]; + } + } + return l; + +} +// for BitPerBytePackedChar() +static unsigned int ceilLog2(const unsigned int input) +{ + if (input <= 1) return 0; + return BITS_IN_WORD - leadingZero(input - 1); + +} +// for ConvertBytePackedToWordPacked() +static unsigned int BitPerBytePackedChar(const unsigned int alphabetSize) 
+{ + unsigned int bitPerChar; + bitPerChar = ceilLog2(alphabetSize); + // Return the largest number of bit that does not affect packing efficiency + if (BITS_IN_BYTE / (BITS_IN_BYTE / bitPerChar) > bitPerChar) + bitPerChar = BITS_IN_BYTE / (BITS_IN_BYTE / bitPerChar); + return bitPerChar; +} +// for ConvertBytePackedToWordPacked() +static unsigned int BitPerWordPackedChar(const unsigned int alphabetSize) +{ + return ceilLog2(alphabetSize); +} + +static void ConvertBytePackedToWordPacked(const unsigned char *input, unsigned int *output, const unsigned int alphabetSize, + const bgint_t textLength) +{ + bgint_t i; + unsigned int j, k, c; + unsigned int bitPerBytePackedChar; + unsigned int bitPerWordPackedChar; + unsigned int charPerWord; + unsigned int charPerByte; + unsigned int bytePerIteration; + bgint_t byteProcessed = 0; + bgint_t wordProcessed = 0; + unsigned int mask, shift; + + unsigned int buffer[BITS_IN_WORD]; + + bitPerBytePackedChar = BitPerBytePackedChar(alphabetSize); + bitPerWordPackedChar = BitPerWordPackedChar(alphabetSize); + charPerByte = BITS_IN_BYTE / bitPerBytePackedChar; + charPerWord = BITS_IN_WORD / bitPerWordPackedChar; + + bytePerIteration = charPerWord / charPerByte; + mask = truncateRight(ALL_ONE_MASK, BITS_IN_WORD - bitPerWordPackedChar); + shift = BITS_IN_WORD - BITS_IN_BYTE + bitPerBytePackedChar - bitPerWordPackedChar; + + while ((wordProcessed + 1) * charPerWord < textLength) { + + k = 0; + for (i=0; i> bitPerWordPackedChar * i; + } + output[wordProcessed] = c; + wordProcessed++; + + } + + k = 0; + for (i=0; i < (textLength - wordProcessed * charPerWord - 1) / charPerByte + 1; i++) { + c = (unsigned int)input[byteProcessed] << shift; + for (j=0; j> bitPerWordPackedChar * i; + } + output[wordProcessed] = c; +} + +BWT *BWTCreate(const bgint_t textLength, unsigned int *decodeTable) +{ + BWT *bwt; + + bwt = (BWT*)calloc(1, sizeof(BWT)); + + bwt->textLength = 0; + + bwt->cumulativeFreq = (bgint_t*)calloc((ALPHABET_SIZE + 1), 
sizeof(bgint_t)); + initializeVAL_bg(bwt->cumulativeFreq, ALPHABET_SIZE + 1, 0); + + bwt->bwtSizeInWord = 0; + + // Generate decode tables + if (decodeTable == NULL) { + bwt->decodeTable = (unsigned*)calloc(DNA_OCC_CNT_TABLE_SIZE_IN_WORD, sizeof(unsigned int)); + GenerateDNAOccCountTable(bwt->decodeTable); + } else { + bwt->decodeTable = decodeTable; + } + + bwt->occMajorSizeInWord = BWTOccValueMajorSizeInWord(textLength); + bwt->occValueMajor = (bgint_t*)calloc(bwt->occMajorSizeInWord, sizeof(bgint_t)); + + bwt->occSizeInWord = 0; + bwt->occValue = NULL; + + return bwt; +} + +BWTInc *BWTIncCreate(const bgint_t textLength, unsigned int initialMaxBuildSize, unsigned int incMaxBuildSize) +{ + BWTInc *bwtInc; + unsigned int i, n_iter; + + if (textLength < incMaxBuildSize) incMaxBuildSize = textLength; + if (textLength < initialMaxBuildSize) initialMaxBuildSize = textLength; + + bwtInc = (BWTInc*)calloc(1, sizeof(BWTInc)); + bwtInc->numberOfIterationDone = 0; + bwtInc->bwt = BWTCreate(textLength, NULL); + bwtInc->initialMaxBuildSize = initialMaxBuildSize; + bwtInc->incMaxBuildSize = incMaxBuildSize; + bwtInc->cumulativeCountInCurrentBuild = (bgint_t*)calloc((ALPHABET_SIZE + 1), sizeof(bgint_t)); + initializeVAL_bg(bwtInc->cumulativeCountInCurrentBuild, ALPHABET_SIZE + 1, 0); + + // Build frequently accessed data + bwtInc->packedShift = (unsigned*)calloc(CHAR_PER_WORD, sizeof(unsigned int)); + for (i=0; ipackedShift[i] = BITS_IN_WORD - (i+1) * BIT_PER_CHAR; + + n_iter = (textLength - initialMaxBuildSize) / incMaxBuildSize + 1; + bwtInc->availableWord = BWTResidentSizeInWord(textLength) + BWTOccValueMinorSizeInWord(textLength) // minimal memory requirement + + OCC_INTERVAL / BIT_PER_CHAR * n_iter * 2 * (sizeof(bgint_t) / 4) // buffer at the end of occ array + + incMaxBuildSize/5 * 3 * (sizeof(bgint_t) / 4); // space for the 3 temporary arrays in each iteration + if (bwtInc->availableWord < MIN_AVAILABLE_WORD) bwtInc->availableWord = MIN_AVAILABLE_WORD; // lh3: otherwise 
segfaul when availableWord is too small + fprintf(stderr, "[%s] textLength=%ld, availableWord=%ld\n", __func__, (long)textLength, (long)bwtInc->availableWord); + bwtInc->workingMemory = (unsigned*)calloc(bwtInc->availableWord, BYTES_IN_WORD); + + return bwtInc; +} +// for BWTIncConstruct() +static void BWTIncPutPackedTextToRank(const unsigned int *packedText, bgint_t* __restrict rank, + bgint_t* __restrict cumulativeCount, const bgint_t numChar) +{ + bgint_t i; + unsigned int j; + unsigned int c, t; + unsigned int packedMask; + bgint_t rankIndex; + bgint_t lastWord; + unsigned int numCharInLastWord; + + lastWord = (numChar - 1) / CHAR_PER_WORD; + numCharInLastWord = numChar - lastWord * CHAR_PER_WORD; + + packedMask = ALL_ONE_MASK >> (BITS_IN_WORD - BIT_PER_CHAR); + rankIndex = numChar - 1; + + t = packedText[lastWord] >> (BITS_IN_WORD - numCharInLastWord * BIT_PER_CHAR); + for (i=0; i>= BIT_PER_CHAR; + } + + for (i=lastWord; i--;) { // loop from lastWord - 1 to 0 + t = packedText[i]; + for (j=0; j>= BIT_PER_CHAR; + } + } + + // Convert occurrence to cumulativeCount + cumulativeCount[2] += cumulativeCount[1]; + cumulativeCount[3] += cumulativeCount[2]; + cumulativeCount[4] += cumulativeCount[3]; +} + + +static void ForwardDNAAllOccCountNoLimit(const unsigned int* dna, const bgint_t index, + bgint_t* __restrict occCount, const unsigned int* dnaDecodeTable) +{ + static const unsigned int truncateRightMask[16] = { 0x00000000, 0xC0000000, 0xF0000000, 0xFC000000, + 0xFF000000, 0xFFC00000, 0xFFF00000, 0xFFFC0000, + 0xFFFF0000, 0xFFFFC000, 0xFFFFF000, 0xFFFFFC00, + 0xFFFFFF00, 0xFFFFFFC0, 0xFFFFFFF0, 0xFFFFFFFC }; + + bgint_t iteration, i; + unsigned int wordToCount, charToCount; + unsigned int j, c, sum; + + occCount[0] = 0; + occCount[1] = 0; + occCount[2] = 0; + occCount[3] = 0; + + iteration = index / 256; + wordToCount = (index - iteration * 256) / 16; + charToCount = index - iteration * 256 - wordToCount * 16; + + for (i=0; i> 16]; + sum += dnaDecodeTable[*dna & 
0x0000FFFF]; + dna++; + } + if (!DNA_OCC_SUM_EXCEPTION(sum)) { + occCount[0] += sum & 0x000000FF; sum >>= 8; + occCount[1] += sum & 0x000000FF; sum >>= 8; + occCount[2] += sum & 0x000000FF; sum >>= 8; + occCount[3] += sum; + } else { + // only some or all of the 3 bits are on + // in reality, only one of the four cases are possible + if (sum == 0x00000100) { + occCount[0] += 256; + } else if (sum == 0x00010000) { + occCount[1] += 256; + } else if (sum == 0x01000000) { + occCount[2] += 256; + } else if (sum == 0x00000000) { + occCount[3] += 256; + } else { + fprintf(stderr, "ForwardDNAAllOccCountNoLimit(): DNA occ sum exception!\n"); + exit(1); + } + } + + } + + sum = 0; + for (j=0; j> 16]; + sum += dnaDecodeTable[*dna & 0x0000FFFF]; + dna++; + } + + if (charToCount > 0) { + c = *dna & truncateRightMask[charToCount]; // increase count of 'a' by 16 - c; + sum += dnaDecodeTable[c >> 16]; + sum += dnaDecodeTable[c & 0xFFFF]; + sum += charToCount - 16; // decrease count of 'a' by 16 - positionToProcess + } + + occCount[0] += sum & 0x000000FF; sum >>= 8; + occCount[1] += sum & 0x000000FF; sum >>= 8; + occCount[2] += sum & 0x000000FF; sum >>= 8; + occCount[3] += sum; +} + +static void BWTIncBuildPackedBwt(const bgint_t *relativeRank, unsigned int* __restrict bwt, const bgint_t numChar, + const bgint_t *cumulativeCount, const unsigned int *packedShift) { + + bgint_t i, r; + unsigned int c; + bgint_t previousRank, currentRank; + bgint_t wordIndex, charIndex; + bgint_t inverseSa0; + + inverseSa0 = previousRank = relativeRank[0]; + + for (i=1; i<=numChar; i++) { + currentRank = relativeRank[i]; + // previousRank > cumulativeCount[c] because $ is one of the char + c = (previousRank > cumulativeCount[1]) + (previousRank > cumulativeCount[2]) + + (previousRank > cumulativeCount[3]); + // set bwt for currentRank + if (c > 0) { + // c <> 'a' + r = currentRank; + if (r > inverseSa0) { + // - 1 because $ at inverseSa0 is not encoded + r--; + } + wordIndex = r / CHAR_PER_WORD; + 
charIndex = r - wordIndex * CHAR_PER_WORD; + bwt[wordIndex] |= c << packedShift[charIndex]; + } + previousRank = currentRank; + } +} + +static inline bgint_t BWTOccValueExplicit(const BWT *bwt, const bgint_t occIndexExplicit, + const unsigned int character) +{ + bgint_t occIndexMajor; + + occIndexMajor = occIndexExplicit * OCC_INTERVAL / OCC_INTERVAL_MAJOR; + + if (occIndexExplicit % OCC_VALUE_PER_WORD == 0) { + return bwt->occValueMajor[occIndexMajor * ALPHABET_SIZE + character] + + (bwt->occValue[occIndexExplicit / OCC_VALUE_PER_WORD * ALPHABET_SIZE + character] >> 16); + + } else { + return bwt->occValueMajor[occIndexMajor * ALPHABET_SIZE + character] + + (bwt->occValue[occIndexExplicit / OCC_VALUE_PER_WORD * ALPHABET_SIZE + character] & 0x0000FFFF); + } +} + + +static unsigned int ForwardDNAOccCount(const unsigned int* dna, const unsigned int index, const unsigned int character, + const unsigned int* dnaDecodeTable) +{ + static const unsigned int truncateRightMask[16] = { 0x00000000, 0xC0000000, 0xF0000000, 0xFC000000, + 0xFF000000, 0xFFC00000, 0xFFF00000, 0xFFFC0000, + 0xFFFF0000, 0xFFFFC000, 0xFFFFF000, 0xFFFFFC00, + 0xFFFFFF00, 0xFFFFFFC0, 0xFFFFFFF0, 0xFFFFFFFC }; + + unsigned int wordToCount, charToCount; + unsigned int i, c; + unsigned int sum = 0; + + wordToCount = index / 16; + charToCount = index - wordToCount * 16; + + for (i=0; i> 16]; + sum += dnaDecodeTable[dna[i] & 0x0000FFFF]; + } + + if (charToCount > 0) { + c = dna[i] & truncateRightMask[charToCount]; // increase count of 'a' by 16 - c; + sum += dnaDecodeTable[c >> 16]; + sum += dnaDecodeTable[c & 0xFFFF]; + sum += charToCount - 16; // decrease count of 'a' by 16 - positionToProcess + } + + return (sum >> (character * 8)) & 0x000000FF; + +} + +static unsigned int BackwardDNAOccCount(const unsigned int* dna, const unsigned int index, const unsigned int character, + const unsigned int* dnaDecodeTable) +{ + static const unsigned int truncateLeftMask[16] = { 0x00000000, 0x00000003, 0x0000000F, 
0x0000003F, + 0x000000FF, 0x000003FF, 0x00000FFF, 0x00003FFF, + 0x0000FFFF, 0x0003FFFF, 0x000FFFFF, 0x003FFFFF, + 0x00FFFFFF, 0x03FFFFFF, 0x0FFFFFFF, 0x3FFFFFFF }; + + unsigned int wordToCount, charToCount; + unsigned int i, c; + unsigned int sum = 0; + + wordToCount = index / 16; + charToCount = index - wordToCount * 16; + + dna -= wordToCount + 1; + + if (charToCount > 0) { + c = *dna & truncateLeftMask[charToCount]; // increase count of 'a' by 16 - c; + sum += dnaDecodeTable[c >> 16]; + sum += dnaDecodeTable[c & 0xFFFF]; + sum += charToCount - 16; // decrease count of 'a' by 16 - positionToProcess + } + + for (i=0; i> 16]; + sum += dnaDecodeTable[*dna & 0x0000FFFF]; + } + + return (sum >> (character * 8)) & 0x000000FF; + +} + +bgint_t BWTOccValue(const BWT *bwt, bgint_t index, const unsigned int character) +{ + bgint_t occValue; + bgint_t occExplicitIndex, occIndex; + + // $ is supposed to be positioned at inverseSa0 but it is not encoded + // therefore index is subtracted by 1 for adjustment + if (index > bwt->inverseSa0) + index--; + + occExplicitIndex = (index + OCC_INTERVAL / 2 - 1) / OCC_INTERVAL; // Bidirectional encoding + occIndex = occExplicitIndex * OCC_INTERVAL; + occValue = BWTOccValueExplicit(bwt, occExplicitIndex, character); + + if (occIndex == index) + return occValue; + + if (occIndex < index) { + return occValue + ForwardDNAOccCount(bwt->bwtCode + occIndex / CHAR_PER_WORD, index - occIndex, character, bwt->decodeTable); + } else { + return occValue - BackwardDNAOccCount(bwt->bwtCode + occIndex / CHAR_PER_WORD, occIndex - index, character, bwt->decodeTable); + } +} + +static bgint_t BWTIncGetAbsoluteRank(BWT *bwt, bgint_t* __restrict absoluteRank, bgint_t* __restrict seq, + const unsigned int *packedText, const bgint_t numChar, + const bgint_t* cumulativeCount, const unsigned int firstCharInLastIteration) +{ + bgint_t saIndex; + bgint_t lastWord; + unsigned int packedMask; + bgint_t i; + unsigned int c, t, j; + bgint_t rankIndex; + unsigned int 
shift; + bgint_t seqIndexFromStart[ALPHABET_SIZE]; + bgint_t seqIndexFromEnd[ALPHABET_SIZE]; + + for (i=0; i> shift; + saIndex = bwt->inverseSa0; + rankIndex = numChar - 1; + + lastWord = numChar / CHAR_PER_WORD; + for (i=lastWord; i--;) { // loop from lastWord - 1 to 0 + t = packedText[i]; + for (j=0; jcumulativeFreq[c] + BWTOccValue(bwt, saIndex, c) + 1; + // A counting sort using the first character of suffix is done here + // If rank > inverseSa0 -> fill seq from end, otherwise fill seq from start -> to leave the right entry for inverseSa0 + if (saIndex > bwt->inverseSa0) { + seq[seqIndexFromEnd[c]] = rankIndex; + absoluteRank[seqIndexFromEnd[c]] = saIndex; + seqIndexFromEnd[c]--; + } else { + seq[seqIndexFromStart[c]] = rankIndex; + absoluteRank[seqIndexFromStart[c]] = saIndex; + seqIndexFromStart[c]++; + } + rankIndex--; + t >>= BIT_PER_CHAR; + } + } + + absoluteRank[seqIndexFromStart[firstCharInLastIteration]] = bwt->inverseSa0; // representing the substring of all preceding characters + seq[seqIndexFromStart[firstCharInLastIteration]] = numChar; + + return seqIndexFromStart[firstCharInLastIteration]; +} + +static void BWTIncSortKey(bgint_t* __restrict key, bgint_t* __restrict seq, const bgint_t numItem) +{ + #define EQUAL_KEY_THRESHOLD 4 // Partition for equal key if data array size / the number of data with equal value with pivot < EQUAL_KEY_THRESHOLD + + int64_t lowIndex, highIndex, midIndex; + int64_t lowPartitionIndex, highPartitionIndex; + int64_t lowStack[32], highStack[32]; + int stackDepth; + int64_t i, j; + bgint_t tempSeq, tempKey; + int64_t numberOfEqualKey; + + if (numItem < 2) return; + + stackDepth = 0; + + lowIndex = 0; + highIndex = numItem - 1; + + for (;;) { + + for (;;) { + + // Sort small array of data + if (highIndex - lowIndex < BWTINC_INSERT_SORT_NUM_ITEM) { // Insertion sort on smallest arrays + for (i=lowIndex+1; i<=highIndex; i++) { + tempSeq = seq[i]; + tempKey = key[i]; + for (j = i; j > lowIndex && key[j-1] > tempKey; j--) { + 
seq[j] = seq[j-1]; + key[j] = key[j-1]; + } + if (j != i) { + seq[j] = tempSeq; + key[j] = tempKey; + } + } + break; + } + + // Choose pivot as median of the lowest, middle, and highest data; sort the three data + + midIndex = average(lowIndex, highIndex); + if (key[lowIndex] > key[midIndex]) { + tempSeq = seq[lowIndex]; + tempKey = key[lowIndex]; + seq[lowIndex] = seq[midIndex]; + key[lowIndex] = key[midIndex]; + seq[midIndex] = tempSeq; + key[midIndex] = tempKey; + } + if (key[lowIndex] > key[highIndex]) { + tempSeq = seq[lowIndex]; + tempKey = key[lowIndex]; + seq[lowIndex] = seq[highIndex]; + key[lowIndex] = key[highIndex]; + seq[highIndex] = tempSeq; + key[highIndex] = tempKey; + } + if (key[midIndex] > key[highIndex]) { + tempSeq = seq[midIndex]; + tempKey = key[midIndex]; + seq[midIndex] = seq[highIndex]; + key[midIndex] = key[highIndex]; + seq[highIndex] = tempSeq; + key[highIndex] = tempKey; + } + + // Partition data + + numberOfEqualKey = 0; + + lowPartitionIndex = lowIndex + 1; + highPartitionIndex = highIndex - 1; + + for (;;) { + while (lowPartitionIndex <= highPartitionIndex && key[lowPartitionIndex] <= key[midIndex]) { + numberOfEqualKey += (key[lowPartitionIndex] == key[midIndex]); + lowPartitionIndex++; + } + while (lowPartitionIndex < highPartitionIndex) { + if (key[midIndex] >= key[highPartitionIndex]) { + numberOfEqualKey += (key[midIndex] == key[highPartitionIndex]); + break; + } + highPartitionIndex--; + } + if (lowPartitionIndex >= highPartitionIndex) { + break; + } + tempSeq = seq[lowPartitionIndex]; + tempKey = key[lowPartitionIndex]; + seq[lowPartitionIndex] = seq[highPartitionIndex]; + key[lowPartitionIndex] = key[highPartitionIndex]; + seq[highPartitionIndex] = tempSeq; + key[highPartitionIndex] = tempKey; + if (highPartitionIndex == midIndex) { + // partition key has been moved + midIndex = lowPartitionIndex; + } + lowPartitionIndex++; + highPartitionIndex--; + } + + // Adjust the partition index + highPartitionIndex = 
lowPartitionIndex; + lowPartitionIndex--; + + // move the partition key to end of low partition + tempSeq = seq[midIndex]; + tempKey = key[midIndex]; + seq[midIndex] = seq[lowPartitionIndex]; + key[midIndex] = key[lowPartitionIndex]; + seq[lowPartitionIndex] = tempSeq; + key[lowPartitionIndex] = tempKey; + + if (highIndex - lowIndex + BWTINC_INSERT_SORT_NUM_ITEM <= EQUAL_KEY_THRESHOLD * numberOfEqualKey) { + + // Many keys = partition key; separate the equal key data from the lower partition + + midIndex = lowIndex; + + for (;;) { + while (midIndex < lowPartitionIndex && key[midIndex] < key[lowPartitionIndex]) { + midIndex++; + } + while (midIndex < lowPartitionIndex && key[lowPartitionIndex] == key[lowPartitionIndex - 1]) { + lowPartitionIndex--; + } + if (midIndex >= lowPartitionIndex) { + break; + } + tempSeq = seq[midIndex]; + tempKey = key[midIndex]; + seq[midIndex] = seq[lowPartitionIndex - 1]; + key[midIndex] = key[lowPartitionIndex - 1]; + seq[lowPartitionIndex - 1] = tempSeq; + key[lowPartitionIndex - 1] = tempKey; + midIndex++; + lowPartitionIndex--; + } + + } + + if (lowPartitionIndex - lowIndex > highIndex - highPartitionIndex) { + // put the larger partition to stack + lowStack[stackDepth] = lowIndex; + highStack[stackDepth] = lowPartitionIndex - 1; + stackDepth++; + // sort the smaller partition first + lowIndex = highPartitionIndex; + } else { + // put the larger partition to stack + lowStack[stackDepth] = highPartitionIndex; + highStack[stackDepth] = highIndex; + stackDepth++; + // sort the smaller partition first + if (lowPartitionIndex > lowIndex) { + highIndex = lowPartitionIndex - 1; + } else { + // all keys in the partition equals to the partition key + break; + } + } + continue; + } + + // Pop a range from stack + if (stackDepth > 0) { + stackDepth--; + lowIndex = lowStack[stackDepth]; + highIndex = highStack[stackDepth]; + continue; + } else return; + } +} + + +static void BWTIncBuildRelativeRank(bgint_t* __restrict sortedRank, bgint_t* 
__restrict seq, + bgint_t* __restrict relativeRank, const bgint_t numItem, + bgint_t oldInverseSa0, const bgint_t *cumulativeCount) +{ + bgint_t i, c; + bgint_t s, r; + bgint_t lastRank, lastIndex; + bgint_t oldInverseSa0RelativeRank = 0; + bgint_t freq; + + lastIndex = numItem; + lastRank = sortedRank[numItem]; + if (lastRank > oldInverseSa0) { + sortedRank[numItem]--; // to prepare for merging; $ is not encoded in bwt + } + s = seq[numItem]; + relativeRank[s] = numItem; + if (lastRank == oldInverseSa0) { + oldInverseSa0RelativeRank = numItem; + oldInverseSa0++; // so that this segment of code is not run again + lastRank++; // so that oldInverseSa0 become a sorted group with 1 item + } + + c = ALPHABET_SIZE - 1; + freq = cumulativeCount[c]; + + for (i=numItem; i--;) { // from numItem - 1 to 0 + r = sortedRank[i]; + if (r > oldInverseSa0) + sortedRank[i]--; // to prepare for merging; $ is not encoded in bwt + s = seq[i]; + if (i < freq) { + if (lastIndex >= freq) + lastRank++; // to trigger the group across alphabet boundary to be split + c--; + freq = cumulativeCount[c]; + } + if (r == lastRank) { + relativeRank[s] = lastIndex; + } else { + if (i == lastIndex - 1) { + if (lastIndex < numItem && (sbgint_t)seq[lastIndex + 1] < 0) { + seq[lastIndex] = seq[lastIndex + 1] - 1; + } else { + seq[lastIndex] = (bgint_t)-1; + } + } + lastIndex = i; + lastRank = r; + relativeRank[s] = i; + if (r == oldInverseSa0) { + oldInverseSa0RelativeRank = i; + oldInverseSa0++; // so that this segment of code is not run again + lastRank++; // so that oldInverseSa0 become a sorted group with 1 item + } + } + } + +} + +static void BWTIncBuildBwt(unsigned int* insertBwt, const bgint_t *relativeRank, const bgint_t numChar, + const bgint_t *cumulativeCount) +{ + unsigned int c; + bgint_t i; + bgint_t previousRank, currentRank; + + previousRank = relativeRank[0]; + + for (i=1; i<=numChar; i++) { + currentRank = relativeRank[i]; + c = (previousRank >= cumulativeCount[1]) + (previousRank >= 
cumulativeCount[2]) + + (previousRank >= cumulativeCount[3]); + insertBwt[currentRank] = c; + previousRank = currentRank; + } +} + +static void BWTIncMergeBwt(const bgint_t *sortedRank, const unsigned int* oldBwt, const unsigned int *insertBwt, + unsigned int* __restrict mergedBwt, const bgint_t numOldBwt, const bgint_t numInsertBwt) +{ + unsigned int bitsInWordMinusBitPerChar; + bgint_t leftShift, rightShift; + bgint_t o; + bgint_t oIndex, iIndex, mIndex; + bgint_t mWord, mChar, oWord, oChar; + bgint_t numInsert; + + bitsInWordMinusBitPerChar = BITS_IN_WORD - BIT_PER_CHAR; + + oIndex = 0; + iIndex = 0; + mIndex = 0; + + mWord = 0; + mChar = 0; + + mergedBwt[0] = 0; // this can be cleared as merged Bwt slightly shift to the left in each iteration + + while (oIndex < numOldBwt) { + + // copy from insertBwt + while (iIndex <= numInsertBwt && sortedRank[iIndex] <= oIndex) { + if (sortedRank[iIndex] != 0) { // special value to indicate that this is for new inverseSa0 + mergedBwt[mWord] |= insertBwt[iIndex] << (BITS_IN_WORD - (mChar + 1) * BIT_PER_CHAR); + mIndex++; + mChar++; + if (mChar == CHAR_PER_WORD) { + mChar = 0; + mWord++; + mergedBwt[mWord] = 0; // no need to worry about crossing mergedBwt boundary + } + } + iIndex++; + } + + // Copy from oldBwt to mergedBwt + if (iIndex <= numInsertBwt) { + o = sortedRank[iIndex]; + } else { + o = numOldBwt; + } + numInsert = o - oIndex; + + oWord = oIndex / CHAR_PER_WORD; + oChar = oIndex - oWord * CHAR_PER_WORD; + if (oChar > mChar) { + leftShift = (oChar - mChar) * BIT_PER_CHAR; + rightShift = (CHAR_PER_WORD + mChar - oChar) * BIT_PER_CHAR; + mergedBwt[mWord] = mergedBwt[mWord] + | (oldBwt[oWord] << (oChar * BIT_PER_CHAR) >> (mChar * BIT_PER_CHAR)) + | (oldBwt[oWord+1] >> rightShift); + oIndex += min(numInsert, CHAR_PER_WORD - mChar); + while (o > oIndex) { + oWord++; + mWord++; + mergedBwt[mWord] = (oldBwt[oWord] << leftShift) | (oldBwt[oWord+1] >> rightShift); + oIndex += CHAR_PER_WORD; + } + } else if (oChar < mChar) { 
+ rightShift = (mChar - oChar) * BIT_PER_CHAR; + leftShift = (CHAR_PER_WORD + oChar - mChar) * BIT_PER_CHAR; + mergedBwt[mWord] = mergedBwt[mWord] + | (oldBwt[oWord] << (oChar * BIT_PER_CHAR) >> (mChar * BIT_PER_CHAR)); + oIndex += min(numInsert, CHAR_PER_WORD - mChar); + while (o > oIndex) { + oWord++; + mWord++; + mergedBwt[mWord] = (oldBwt[oWord-1] << leftShift) | (oldBwt[oWord] >> rightShift); + oIndex += CHAR_PER_WORD; + } + } else { // oChar == mChar + mergedBwt[mWord] = mergedBwt[mWord] | truncateLeft(oldBwt[oWord], mChar * BIT_PER_CHAR); + oIndex += min(numInsert, CHAR_PER_WORD - mChar); + while (o > oIndex) { + oWord++; + mWord++; + mergedBwt[mWord] = oldBwt[oWord]; + oIndex += CHAR_PER_WORD; + } + } + oIndex = o; + mIndex += numInsert; + + // Clear the trailing garbage in mergedBwt + mWord = mIndex / CHAR_PER_WORD; + mChar = mIndex - mWord * CHAR_PER_WORD; + if (mChar == 0) { + mergedBwt[mWord] = 0; + } else { + mergedBwt[mWord] = truncateRight(mergedBwt[mWord], (BITS_IN_WORD - mChar * BIT_PER_CHAR)); + } + + } + + // copy from insertBwt + while (iIndex <= numInsertBwt) { + if (sortedRank[iIndex] != 0) { + mergedBwt[mWord] |= insertBwt[iIndex] << (BITS_IN_WORD - (mChar + 1) * BIT_PER_CHAR); + mIndex++; + mChar++; + if (mChar == CHAR_PER_WORD) { + mChar = 0; + mWord++; + mergedBwt[mWord] = 0; // no need to worry about crossing mergedBwt boundary + } + } + iIndex++; + } +} + +void BWTClearTrailingBwtCode(BWT *bwt) +{ + bgint_t bwtResidentSizeInWord; + bgint_t wordIndex, offset; + bgint_t i; + + bwtResidentSizeInWord = BWTResidentSizeInWord(bwt->textLength); + + wordIndex = bwt->textLength / CHAR_PER_WORD; + offset = (bwt->textLength - wordIndex * CHAR_PER_WORD) * BIT_PER_CHAR; + if (offset > 0) { + bwt->bwtCode[wordIndex] = truncateRight(bwt->bwtCode[wordIndex], BITS_IN_WORD - offset); + } else { + if (wordIndex < bwtResidentSizeInWord) { + bwt->bwtCode[wordIndex] = 0; + } + } + + for (i=wordIndex+1; ibwtCode[i] = 0; + } +} + + +void 
BWTGenerateOccValueFromBwt(const unsigned int* bwt, unsigned int* __restrict occValue, + bgint_t* __restrict occValueMajor, + const bgint_t textLength, const unsigned int* decodeTable) +{ + bgint_t numberOfOccValueMajor, numberOfOccValue; + unsigned int wordBetweenOccValue; + bgint_t numberOfOccIntervalPerMajor; + unsigned int c; + bgint_t i, j; + bgint_t occMajorIndex; + bgint_t occIndex, bwtIndex; + bgint_t sum; // perhaps unsigned is big enough + bgint_t tempOccValue0[ALPHABET_SIZE], tempOccValue1[ALPHABET_SIZE]; + + wordBetweenOccValue = OCC_INTERVAL / CHAR_PER_WORD; + + // Calculate occValue + numberOfOccValue = (textLength + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; // Value at both end for bi-directional encoding + numberOfOccIntervalPerMajor = OCC_INTERVAL_MAJOR / OCC_INTERVAL; + numberOfOccValueMajor = (numberOfOccValue + numberOfOccIntervalPerMajor - 1) / numberOfOccIntervalPerMajor; + + tempOccValue0[0] = 0; + tempOccValue0[1] = 0; + tempOccValue0[2] = 0; + tempOccValue0[3] = 0; + occValueMajor[0] = 0; + occValueMajor[1] = 0; + occValueMajor[2] = 0; + occValueMajor[3] = 0; + + occIndex = 0; + bwtIndex = 0; + for (occMajorIndex=1; occMajorIndex> 16]; + sum += decodeTable[c & 0x0000FFFF]; + bwtIndex++; + } + if (!DNA_OCC_SUM_EXCEPTION(sum)) { + tempOccValue1[0] += (sum & 0x000000FF); sum >>= 8; + tempOccValue1[1] += (sum & 0x000000FF); sum >>= 8; + tempOccValue1[2] += (sum & 0x000000FF); sum >>= 8; + tempOccValue1[3] += sum; + } else { + if (sum == 0x00000100) { + tempOccValue1[0] += 256; + } else if (sum == 0x00010000) { + tempOccValue1[1] += 256; + } else if (sum == 0x01000000) { + tempOccValue1[2] += 256; + } else { + tempOccValue1[3] += 256; + } + } + occValue[occIndex * 4 + 0] = (tempOccValue0[0] << 16) | tempOccValue1[0]; + occValue[occIndex * 4 + 1] = (tempOccValue0[1] << 16) | tempOccValue1[1]; + occValue[occIndex * 4 + 2] = (tempOccValue0[2] << 16) | tempOccValue1[2]; + occValue[occIndex * 4 + 3] = (tempOccValue0[3] << 16) | tempOccValue1[3]; + 
tempOccValue0[0] = tempOccValue1[0]; + tempOccValue0[1] = tempOccValue1[1]; + tempOccValue0[2] = tempOccValue1[2]; + tempOccValue0[3] = tempOccValue1[3]; + sum = 0; + + occIndex++; + + for (j=0; j> 16]; + sum += decodeTable[c & 0x0000FFFF]; + bwtIndex++; + } + if (!DNA_OCC_SUM_EXCEPTION(sum)) { + tempOccValue0[0] += (sum & 0x000000FF); sum >>= 8; + tempOccValue0[1] += (sum & 0x000000FF); sum >>= 8; + tempOccValue0[2] += (sum & 0x000000FF); sum >>= 8; + tempOccValue0[3] += sum; + } else { + if (sum == 0x00000100) { + tempOccValue0[0] += 256; + } else if (sum == 0x00010000) { + tempOccValue0[1] += 256; + } else if (sum == 0x01000000) { + tempOccValue0[2] += 256; + } else { + tempOccValue0[3] += 256; + } + } + } + + occValueMajor[occMajorIndex * 4 + 0] = occValueMajor[(occMajorIndex - 1) * 4 + 0] + tempOccValue0[0]; + occValueMajor[occMajorIndex * 4 + 1] = occValueMajor[(occMajorIndex - 1) * 4 + 1] + tempOccValue0[1]; + occValueMajor[occMajorIndex * 4 + 2] = occValueMajor[(occMajorIndex - 1) * 4 + 2] + tempOccValue0[2]; + occValueMajor[occMajorIndex * 4 + 3] = occValueMajor[(occMajorIndex - 1) * 4 + 3] + tempOccValue0[3]; + tempOccValue0[0] = 0; + tempOccValue0[1] = 0; + tempOccValue0[2] = 0; + tempOccValue0[3] = 0; + + } + + while (occIndex < (numberOfOccValue-1)/2) { + sum = 0; + tempOccValue1[0] = tempOccValue0[0]; + tempOccValue1[1] = tempOccValue0[1]; + tempOccValue1[2] = tempOccValue0[2]; + tempOccValue1[3] = tempOccValue0[3]; + for (j=0; j> 16]; + sum += decodeTable[c & 0x0000FFFF]; + bwtIndex++; + } + if (!DNA_OCC_SUM_EXCEPTION(sum)) { + tempOccValue1[0] += (sum & 0x000000FF); sum >>= 8; + tempOccValue1[1] += (sum & 0x000000FF); sum >>= 8; + tempOccValue1[2] += (sum & 0x000000FF); sum >>= 8; + tempOccValue1[3] += sum; + } else { + if (sum == 0x00000100) { + tempOccValue1[0] += 256; + } else if (sum == 0x00010000) { + tempOccValue1[1] += 256; + } else if (sum == 0x01000000) { + tempOccValue1[2] += 256; + } else { + tempOccValue1[3] += 256; + } + } + 
occValue[occIndex * 4 + 0] = (tempOccValue0[0] << 16) | tempOccValue1[0]; + occValue[occIndex * 4 + 1] = (tempOccValue0[1] << 16) | tempOccValue1[1]; + occValue[occIndex * 4 + 2] = (tempOccValue0[2] << 16) | tempOccValue1[2]; + occValue[occIndex * 4 + 3] = (tempOccValue0[3] << 16) | tempOccValue1[3]; + tempOccValue0[0] = tempOccValue1[0]; + tempOccValue0[1] = tempOccValue1[1]; + tempOccValue0[2] = tempOccValue1[2]; + tempOccValue0[3] = tempOccValue1[3]; + sum = 0; + occIndex++; + + for (j=0; j> 16]; + sum += decodeTable[c & 0x0000FFFF]; + bwtIndex++; + } + if (!DNA_OCC_SUM_EXCEPTION(sum)) { + tempOccValue0[0] += (sum & 0x000000FF); sum >>= 8; + tempOccValue0[1] += (sum & 0x000000FF); sum >>= 8; + tempOccValue0[2] += (sum & 0x000000FF); sum >>= 8; + tempOccValue0[3] += sum; + } else { + if (sum == 0x00000100) { + tempOccValue0[0] += 256; + } else if (sum == 0x00010000) { + tempOccValue0[1] += 256; + } else if (sum == 0x01000000) { + tempOccValue0[2] += 256; + } else { + tempOccValue0[3] += 256; + } + } + } + + sum = 0; + tempOccValue1[0] = tempOccValue0[0]; + tempOccValue1[1] = tempOccValue0[1]; + tempOccValue1[2] = tempOccValue0[2]; + tempOccValue1[3] = tempOccValue0[3]; + + if (occIndex * 2 < numberOfOccValue - 1) { + for (j=0; j> 16]; + sum += decodeTable[c & 0x0000FFFF]; + bwtIndex++; + } + if (!DNA_OCC_SUM_EXCEPTION(sum)) { + tempOccValue1[0] += (sum & 0x000000FF); sum >>= 8; + tempOccValue1[1] += (sum & 0x000000FF); sum >>= 8; + tempOccValue1[2] += (sum & 0x000000FF); sum >>= 8; + tempOccValue1[3] += sum; + } else { + if (sum == 0x00000100) { + tempOccValue1[0] += 256; + } else if (sum == 0x00010000) { + tempOccValue1[1] += 256; + } else if (sum == 0x01000000) { + tempOccValue1[2] += 256; + } else { + tempOccValue1[3] += 256; + } + } + } + + occValue[occIndex * 4 + 0] = (tempOccValue0[0] << 16) | tempOccValue1[0]; + occValue[occIndex * 4 + 1] = (tempOccValue0[1] << 16) | tempOccValue1[1]; + occValue[occIndex * 4 + 2] = (tempOccValue0[2] << 16) | 
tempOccValue1[2]; + occValue[occIndex * 4 + 3] = (tempOccValue0[3] << 16) | tempOccValue1[3]; + +} + +static void BWTIncConstruct(BWTInc *bwtInc, const bgint_t numChar) +{ + unsigned int i; + bgint_t mergedBwtSizeInWord, mergedOccSizeInWord; + unsigned int firstCharInThisIteration; + + bgint_t *relativeRank, *seq, *sortedRank; + unsigned int *insertBwt, *mergedBwt; + bgint_t newInverseSa0RelativeRank, oldInverseSa0RelativeRank, newInverseSa0; + + mergedBwtSizeInWord = BWTResidentSizeInWord(bwtInc->bwt->textLength + numChar); + mergedOccSizeInWord = BWTOccValueMinorSizeInWord(bwtInc->bwt->textLength + numChar); + + initializeVAL_bg(bwtInc->cumulativeCountInCurrentBuild, ALPHABET_SIZE + 1, 0); + + if (bwtInc->bwt->textLength == 0) { // Initial build + + // Set address + seq = (bgint_t*)bwtInc->workingMemory; + relativeRank = seq + bwtInc->buildSize + 1; + // mergedBwt and packedTex may share memory + mergedBwt = insertBwt = bwtInc->workingMemory + bwtInc->availableWord - mergedBwtSizeInWord; // build in place + + assert((void*)(relativeRank + bwtInc->buildSize + 1) <= (void*)bwtInc->packedText); + assert((void*)(relativeRank + bwtInc->buildSize + 1) <= (void*)mergedBwt); + + // ->packedText is not used any more and may be overwritten by mergedBwt + BWTIncPutPackedTextToRank(bwtInc->packedText, relativeRank, bwtInc->cumulativeCountInCurrentBuild, numChar); + + firstCharInThisIteration = relativeRank[0]; + relativeRank[numChar] = 0; + + // Sort suffix + QSufSortSuffixSort((qsint_t*)relativeRank, (qsint_t*)seq, (qsint_t)numChar, (qsint_t)ALPHABET_SIZE - 1, 0, FALSE); + newInverseSa0 = relativeRank[0]; + + // Clear BWT area + initializeVAL(insertBwt, mergedBwtSizeInWord, 0); + + // Build BWT + BWTIncBuildPackedBwt(relativeRank, insertBwt, numChar, bwtInc->cumulativeCountInCurrentBuild, bwtInc->packedShift); + + // so that the cumulativeCount is not deducted + bwtInc->firstCharInLastIteration = ALPHABET_SIZE; + + } else { // Incremental build + // Set address + sortedRank 
= (bgint_t*)bwtInc->workingMemory; + seq = sortedRank + bwtInc->buildSize + 1; + insertBwt = (unsigned*)seq; // insertBwt and seq share memory + // relativeRank and ->packedText may share memory + relativeRank = seq + bwtInc->buildSize + 1; + + assert((void*)relativeRank <= (void*)bwtInc->packedText); + + // Store the first character of this iteration + firstCharInThisIteration = bwtInc->packedText[0] >> (BITS_IN_WORD - BIT_PER_CHAR); + + // Count occurrence of input text + ForwardDNAAllOccCountNoLimit(bwtInc->packedText, numChar, bwtInc->cumulativeCountInCurrentBuild + 1, bwtInc->bwt->decodeTable); + // Add the first character of the previous iteration to represent the inverseSa0 of the previous iteration + bwtInc->cumulativeCountInCurrentBuild[bwtInc->firstCharInLastIteration + 1]++; + bwtInc->cumulativeCountInCurrentBuild[2] += bwtInc->cumulativeCountInCurrentBuild[1]; + bwtInc->cumulativeCountInCurrentBuild[3] += bwtInc->cumulativeCountInCurrentBuild[2]; + bwtInc->cumulativeCountInCurrentBuild[4] += bwtInc->cumulativeCountInCurrentBuild[3]; + + // Get rank of new suffix among processed suffix + // The seq array is built into ALPHABET_SIZE + 2 groups; ALPHABET_SIZE groups + 1 group divided into 2 by inverseSa0 + inverseSa0 as 1 group + // ->packedText is not used any more and will be overwritten by relativeRank + oldInverseSa0RelativeRank = BWTIncGetAbsoluteRank(bwtInc->bwt, sortedRank, seq, bwtInc->packedText, + numChar, bwtInc->cumulativeCountInCurrentBuild, bwtInc->firstCharInLastIteration); + + // Sort rank by ALPHABET_SIZE + 2 groups (or ALPHABET_SIZE + 1 groups when inverseSa0 sit on the border of a group) + for (i=0; icumulativeCountInCurrentBuild[i] > oldInverseSa0RelativeRank || + bwtInc->cumulativeCountInCurrentBuild[i+1] <= oldInverseSa0RelativeRank) { + BWTIncSortKey(sortedRank + bwtInc->cumulativeCountInCurrentBuild[i], seq + bwtInc->cumulativeCountInCurrentBuild[i], bwtInc->cumulativeCountInCurrentBuild[i+1] - 
bwtInc->cumulativeCountInCurrentBuild[i]); + } else { + if (bwtInc->cumulativeCountInCurrentBuild[i] < oldInverseSa0RelativeRank) { + BWTIncSortKey(sortedRank + bwtInc->cumulativeCountInCurrentBuild[i], seq + bwtInc->cumulativeCountInCurrentBuild[i], oldInverseSa0RelativeRank - bwtInc->cumulativeCountInCurrentBuild[i]); + } + if (bwtInc->cumulativeCountInCurrentBuild[i+1] > oldInverseSa0RelativeRank + 1) { + BWTIncSortKey(sortedRank + oldInverseSa0RelativeRank + 1, seq + oldInverseSa0RelativeRank + 1, bwtInc->cumulativeCountInCurrentBuild[i+1] - oldInverseSa0RelativeRank - 1); + } + } + } + + // build relative rank; sortedRank is updated for merging to cater for the fact that $ is not encoded in bwt + // the cumulative freq information is used to make sure that inverseSa0 and suffix beginning with different characters are kept in different unsorted groups) + BWTIncBuildRelativeRank(sortedRank, seq, relativeRank, numChar, bwtInc->bwt->inverseSa0, bwtInc->cumulativeCountInCurrentBuild); + assert(relativeRank[numChar] == oldInverseSa0RelativeRank); + + // Sort suffix + QSufSortSuffixSort((qsint_t*)relativeRank, (qsint_t*)seq, (qsint_t)numChar, (qsint_t)numChar, 1, TRUE); + + newInverseSa0RelativeRank = relativeRank[0]; + newInverseSa0 = sortedRank[newInverseSa0RelativeRank] + newInverseSa0RelativeRank; + + sortedRank[newInverseSa0RelativeRank] = 0; // a special value so that this is skipped in the merged bwt + + // Build BWT; seq is overwritten by insertBwt + BWTIncBuildBwt(insertBwt, relativeRank, numChar, bwtInc->cumulativeCountInCurrentBuild); + + // Merge BWT; relativeRank may be overwritten by mergedBwt + mergedBwt = bwtInc->workingMemory + bwtInc->availableWord - mergedBwtSizeInWord + - bwtInc->numberOfIterationDone * OCC_INTERVAL / BIT_PER_CHAR * (sizeof(bgint_t) / 4); // minus numberOfIteration * occInterval to create a buffer for merging + assert(mergedBwt >= insertBwt + numChar); + BWTIncMergeBwt(sortedRank, bwtInc->bwt->bwtCode, insertBwt, mergedBwt, 
bwtInc->bwt->textLength, numChar); + } + + // Build auxiliary structure and update info and pointers in BWT + bwtInc->bwt->textLength += numChar; + bwtInc->bwt->bwtCode = mergedBwt; + bwtInc->bwt->bwtSizeInWord = mergedBwtSizeInWord; + bwtInc->bwt->occSizeInWord = mergedOccSizeInWord; + assert(mergedBwt >= bwtInc->workingMemory + mergedOccSizeInWord); + + bwtInc->bwt->occValue = mergedBwt - mergedOccSizeInWord; + + BWTClearTrailingBwtCode(bwtInc->bwt); + BWTGenerateOccValueFromBwt(bwtInc->bwt->bwtCode, bwtInc->bwt->occValue, bwtInc->bwt->occValueMajor, + bwtInc->bwt->textLength, bwtInc->bwt->decodeTable); + + bwtInc->bwt->inverseSa0 = newInverseSa0; + + bwtInc->bwt->cumulativeFreq[1] += bwtInc->cumulativeCountInCurrentBuild[1] - (bwtInc->firstCharInLastIteration <= 0); + bwtInc->bwt->cumulativeFreq[2] += bwtInc->cumulativeCountInCurrentBuild[2] - (bwtInc->firstCharInLastIteration <= 1); + bwtInc->bwt->cumulativeFreq[3] += bwtInc->cumulativeCountInCurrentBuild[3] - (bwtInc->firstCharInLastIteration <= 2); + bwtInc->bwt->cumulativeFreq[4] += bwtInc->cumulativeCountInCurrentBuild[4] - (bwtInc->firstCharInLastIteration <= 3); + + bwtInc->firstCharInLastIteration = firstCharInThisIteration; + + // Set build size and text address for the next build + BWTIncSetBuildSizeAndTextAddr(bwtInc); + bwtInc->numberOfIterationDone++; + +} + +BWTInc *BWTIncConstructFromPacked(const char *inputFileName, bgint_t initialMaxBuildSize, bgint_t incMaxBuildSize) +{ + + FILE *packedFile; + bgint_t packedFileLen; + bgint_t totalTextLength; + bgint_t textToLoad, textSizeInByte; + bgint_t processedTextLength; + unsigned char lastByteLength; + + BWTInc *bwtInc; + + packedFile = (FILE*)fopen(inputFileName, "rb"); + + if (packedFile == NULL) { + fprintf(stderr, "BWTIncConstructFromPacked() : Cannot open inputFileName!\n"); + exit(1); + } + + fseek(packedFile, -1, SEEK_END); + packedFileLen = ftell(packedFile); + fread(&lastByteLength, sizeof(unsigned char), 1, packedFile); + totalTextLength = 
TextLengthFromBytePacked(packedFileLen, BIT_PER_CHAR, lastByteLength); + + bwtInc = BWTIncCreate(totalTextLength, initialMaxBuildSize, incMaxBuildSize); + + BWTIncSetBuildSizeAndTextAddr(bwtInc); + + if (bwtInc->buildSize > totalTextLength) { + textToLoad = totalTextLength; + } else { + textToLoad = totalTextLength - ((totalTextLength - bwtInc->buildSize + CHAR_PER_WORD - 1) / CHAR_PER_WORD * CHAR_PER_WORD); + } + textSizeInByte = textToLoad / CHAR_PER_BYTE; // excluded the odd byte + + fseek(packedFile, -2, SEEK_CUR); + fseek(packedFile, -((long)textSizeInByte), SEEK_CUR); + fread(bwtInc->textBuffer, sizeof(unsigned char), textSizeInByte + 1, packedFile); + fseek(packedFile, -((long)textSizeInByte + 1), SEEK_CUR); + + ConvertBytePackedToWordPacked(bwtInc->textBuffer, bwtInc->packedText, ALPHABET_SIZE, textToLoad); + BWTIncConstruct(bwtInc, textToLoad); + + processedTextLength = textToLoad; + + while (processedTextLength < totalTextLength) { + textToLoad = bwtInc->buildSize / CHAR_PER_WORD * CHAR_PER_WORD; + if (textToLoad > totalTextLength - processedTextLength) { + textToLoad = totalTextLength - processedTextLength; + } + textSizeInByte = textToLoad / CHAR_PER_BYTE; + fseek(packedFile, -((long)textSizeInByte), SEEK_CUR); + fread(bwtInc->textBuffer, sizeof(unsigned char), textSizeInByte, packedFile); + fseek(packedFile, -((long)textSizeInByte), SEEK_CUR); + ConvertBytePackedToWordPacked(bwtInc->textBuffer, bwtInc->packedText, ALPHABET_SIZE, textToLoad); + BWTIncConstruct(bwtInc, textToLoad); + processedTextLength += textToLoad; + if (bwtInc->numberOfIterationDone % 10 == 0) { + fprintf(stderr, "[BWTIncConstructFromPacked] %lu iterations done. 
%lu characters processed.\n", + (long)bwtInc->numberOfIterationDone, (long)processedTextLength); + } + } + return bwtInc; +} + +void BWTFree(BWT *bwt) +{ + if (bwt == 0) return; + free(bwt->cumulativeFreq); + free(bwt->bwtCode); + free(bwt->occValue); + free(bwt->occValueMajor); + free(bwt->decodeTable); + free(bwt); +} + +void BWTIncFree(BWTInc *bwtInc) +{ + if (bwtInc == 0) return; + free(bwtInc->bwt); + free(bwtInc->workingMemory); + free(bwtInc); +} + +static bgint_t BWTFileSizeInWord(const bgint_t numChar) +{ + // The $ in BWT at the position of inverseSa0 is not encoded + return (numChar + CHAR_PER_WORD - 1) / CHAR_PER_WORD; +} + +void BWTSaveBwtCodeAndOcc(const BWT *bwt, const char *bwtFileName, const char *occValueFileName) +{ + FILE *bwtFile; +/* FILE *occValueFile; */ + bgint_t bwtLength; + + bwtFile = (FILE*)fopen(bwtFileName, "wb"); + if (bwtFile == NULL) { + fprintf(stderr, "BWTSaveBwtCodeAndOcc(): Cannot open BWT code file!\n"); + exit(1); + } + + fwrite(&bwt->inverseSa0, sizeof(bgint_t), 1, bwtFile); + fwrite(bwt->cumulativeFreq + 1, sizeof(bgint_t), ALPHABET_SIZE, bwtFile); + bwtLength = BWTFileSizeInWord(bwt->textLength); + fwrite(bwt->bwtCode, sizeof(unsigned int), bwtLength, bwtFile); + fclose(bwtFile); +} + +void bwt_bwtgen(const char *fn_pac, const char *fn_bwt) +{ + BWTInc *bwtInc; + bwtInc = BWTIncConstructFromPacked(fn_pac, 10000000, 10000000); + printf("[bwt_gen] Finished constructing BWT in %u iterations.\n", bwtInc->numberOfIterationDone); + BWTSaveBwtCodeAndOcc(bwtInc->bwt, fn_bwt, 0); + BWTIncFree(bwtInc); +} + +int bwt_bwtgen_main(int argc, char *argv[]) +{ + if (argc < 3) { + fprintf(stderr, "Usage: bwtgen \n"); + return 1; + } + bwt_bwtgen(argv[1], argv[2]); + return 0; +} + +#ifdef MAIN_BWT_GEN + +int main(int argc, char *argv[]) +{ + return bwt_bwtgen_main(argc, argv); +} + +#endif diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/bwt_lite.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/bwt_lite.c Fri Jul 18 07:55:59 
2014 -0400 @@ -0,0 +1,94 @@ +#include +#include +#include +#include "bwt_lite.h" + +int is_sa(const uint8_t *T, uint32_t *SA, int n); +int is_bwt(uint8_t *T, int n); + +bwtl_t *bwtl_seq2bwtl(int len, const uint8_t *seq) +{ + bwtl_t *b; + int i; + b = (bwtl_t*)calloc(1, sizeof(bwtl_t)); + b->seq_len = len; + + { // calculate b->bwt + uint8_t *s; + b->sa = (uint32_t*)calloc(len + 1, 4); + is_sa(seq, b->sa, len); + s = (uint8_t*)calloc(len + 1, 1); + for (i = 0; i <= len; ++i) { + if (b->sa[i] == 0) b->primary = i; + else s[i] = seq[b->sa[i] - 1]; + } + for (i = b->primary; i < len; ++i) s[i] = s[i + 1]; + b->bwt_size = (len + 15) / 16; + b->bwt = (uint32_t*)calloc(b->bwt_size, 4); + for (i = 0; i < len; ++i) + b->bwt[i>>4] |= s[i] << ((15 - (i&15)) << 1); + free(s); + } + { // calculate b->occ + uint32_t c[4]; + b->n_occ = (len + 15) / 16 * 4; + b->occ = (uint32_t*)calloc(b->n_occ, 4); + memset(c, 0, 16); + for (i = 0; i < len; ++i) { + if (i % 16 == 0) + memcpy(b->occ + (i/16) * 4, c, 16); + ++c[bwtl_B0(b, i)]; + } + memcpy(b->L2+1, c, 16); + for (i = 2; i < 5; ++i) b->L2[i] += b->L2[i-1]; + } + { // generate cnt_table + for (i = 0; i != 256; ++i) { + u_int32_t j, x = 0; + for (j = 0; j != 4; ++j) + x |= (((i&3) == j) + ((i>>2&3) == j) + ((i>>4&3) == j) + (i>>6 == j)) << (j<<3); + b->cnt_table[i] = x; + } + } + return b; +} +inline uint32_t bwtl_occ(const bwtl_t *bwt, uint32_t k, uint8_t c) +{ + uint32_t n, b; + if (k == bwt->seq_len) return bwt->L2[c+1] - bwt->L2[c]; + if (k == (uint32_t)(-1)) return 0; + if (k >= bwt->primary) --k; // because $ is not in bwt + n = bwt->occ[k/16<<2|c]; + b = bwt->bwt[k/16] & ~((1U<<((15-(k&15))<<1)) - 1); + n += (bwt->cnt_table[b&0xff] + bwt->cnt_table[b>>8&0xff] + + bwt->cnt_table[b>>16&0xff] + bwt->cnt_table[b>>24]) >> (c<<3) & 0xff; + if (c == 0) n -= 15 - (k&15); // corrected for the masked bits + return n; +} +inline void bwtl_occ4(const bwtl_t *bwt, uint32_t k, uint32_t cnt[4]) +{ + uint32_t x, b; + if (k == (uint32_t)(-1)) { 
+ memset(cnt, 0, 16); + return; + } + if (k >= bwt->primary) --k; // because $ is not in bwt + memcpy(cnt, bwt->occ + (k>>4<<2), 16); + b = bwt->bwt[k>>4] & ~((1U<<((~k&15)<<1)) - 1); + x = bwt->cnt_table[b&0xff] + bwt->cnt_table[b>>8&0xff] + + bwt->cnt_table[b>>16&0xff] + bwt->cnt_table[b>>24]; + x -= 15 - (k&15); + cnt[0] += x&0xff; cnt[1] += x>>8&0xff; cnt[2] += x>>16&0xff; cnt[3] += x>>24; +} +inline void bwtl_2occ4(const bwtl_t *bwt, uint32_t k, uint32_t l, uint32_t cntk[4], uint32_t cntl[4]) +{ + bwtl_occ4(bwt, k, cntk); + bwtl_occ4(bwt, l, cntl); +} +void bwtl_destroy(bwtl_t *bwt) +{ + if (bwt) { + free(bwt->occ); free(bwt->bwt); free(bwt->sa); + free(bwt); + } +} diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/bwt_lite.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/bwt_lite.h Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,29 @@ +#ifndef BWT_LITE_H_ +#define BWT_LITE_H_ + +#include + +typedef struct { + uint32_t seq_len, bwt_size, n_occ; + uint32_t primary; + uint32_t *bwt, *occ, *sa, L2[5]; + uint32_t cnt_table[256]; +} bwtl_t; + +#define bwtl_B0(b, k) ((b)->bwt[(k)>>4]>>((~(k)&0xf)<<1)&3) + +#ifdef __cplusplus +extern "C" { +#endif + + bwtl_t *bwtl_seq2bwtl(int len, const uint8_t *seq); + inline uint32_t bwtl_occ(const bwtl_t *bwt, uint32_t k, uint8_t c); + inline void bwtl_occ4(const bwtl_t *bwt, uint32_t k, uint32_t cnt[4]); + inline void bwtl_2occ4(const bwtl_t *bwt, uint32_t k, uint32_t l, uint32_t cntk[4], uint32_t cntl[4]); + void bwtl_destroy(bwtl_t *bwt); + +#ifdef __cplusplus +} +#endif + +#endif diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/bwtaln.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/bwtaln.c Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,356 @@ +#include +#include +#include +#include +#include +#include +#include +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif +#include "bwtaln.h" +#include "bwtgap.h" +#include "utils.h" + +#ifdef HAVE_PTHREAD +#include +#endif + +gap_opt_t *gap_init_opt() +{ + gap_opt_t *o; + 
o = (gap_opt_t*)calloc(1, sizeof(gap_opt_t)); + /* IMPORTANT: s_mm*10 should be about the average base error + rate. Voilating this requirement will break pairing! */ + o->s_mm = 3; o->s_gapo = 11; o->s_gape = 4; + o->max_diff = -1; o->max_gapo = 1; o->max_gape = 6; + o->indel_end_skip = 5; o->max_del_occ = 10; o->max_entries = 2000000; + o->mode = BWA_MODE_GAPE | BWA_MODE_COMPREAD; + o->seed_len = 32; o->max_seed_diff = 2; + o->fnr = 0.04; + o->n_threads = 1; + o->max_top2 = 30; + o->trim_qual = 0; + return o; +} + +int bwa_cal_maxdiff(int l, double err, double thres) +{ + double elambda = exp(-l * err); + double sum, y = 1.0; + int k, x = 1; + for (k = 1, sum = elambda; k < 1000; ++k) { + y *= l * err; + x *= k; + sum += elambda * y / x; + if (1.0 - sum < thres) return k; + } + return 2; +} + +// width must be filled as zero +int bwt_cal_width(const bwt_t *bwt, int len, const ubyte_t *str, bwt_width_t *width) +{ + bwtint_t k, l, ok, ol; + int i, bid; + bid = 0; + k = 0; l = bwt->seq_len; + for (i = 0; i < len; ++i) { + ubyte_t c = str[i]; + if (c < 4) { + bwt_2occ(bwt, k - 1, l, c, &ok, &ol); + k = bwt->L2[c] + ok + 1; + l = bwt->L2[c] + ol; + } + if (k > l || c > 3) { // then restart + k = 0; + l = bwt->seq_len; + ++bid; + } + width[i].w = l - k + 1; + width[i].bid = bid; + } + width[len].w = 0; + width[len].bid = ++bid; + return bid; +} + +void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt, int n_seqs, bwa_seq_t *seqs, const gap_opt_t *opt) +{ + int i, j, max_l = 0, max_len; + gap_stack_t *stack; + bwt_width_t *w, *seed_w; + gap_opt_t local_opt = *opt; + + // initiate priority stack + for (i = max_len = 0; i != n_seqs; ++i) + if (seqs[i].len > max_len) max_len = seqs[i].len; + if (opt->fnr > 0.0) local_opt.max_diff = bwa_cal_maxdiff(max_len, BWA_AVG_ERR, opt->fnr); + if (local_opt.max_diff < local_opt.max_gapo) local_opt.max_gapo = local_opt.max_diff; + stack = gap_init_stack(local_opt.max_diff, local_opt.max_gapo, local_opt.max_gape, &local_opt); + + seed_w = 
(bwt_width_t*)calloc(opt->seed_len+1, sizeof(bwt_width_t)); + w = 0; + for (i = 0; i != n_seqs; ++i) { + bwa_seq_t *p = seqs + i; +#ifdef HAVE_PTHREAD + if (i % opt->n_threads != tid) continue; +#endif + p->sa = 0; p->type = BWA_TYPE_NO_MATCH; p->c1 = p->c2 = 0; p->n_aln = 0; p->aln = 0; + if (max_l < p->len) { + max_l = p->len; + w = (bwt_width_t*)realloc(w, (max_l + 1) * sizeof(bwt_width_t)); + memset(w, 0, (max_l + 1) * sizeof(bwt_width_t)); + } + bwt_cal_width(bwt, p->len, p->seq, w); + if (opt->fnr > 0.0) local_opt.max_diff = bwa_cal_maxdiff(p->len, BWA_AVG_ERR, opt->fnr); + local_opt.seed_len = opt->seed_len < p->len? opt->seed_len : 0x7fffffff; + if (p->len > opt->seed_len) + bwt_cal_width(bwt, opt->seed_len, p->seq + (p->len - opt->seed_len), seed_w); + // core function + for (j = 0; j < p->len; ++j) // we need to complement + p->seq[j] = p->seq[j] > 3? 4 : 3 - p->seq[j]; + p->aln = bwt_match_gap(bwt, p->len, p->seq, w, p->len <= opt->seed_len? 0 : seed_w, &local_opt, &p->n_aln, stack); + // clean up the unused data in the record + free(p->name); free(p->seq); free(p->rseq); free(p->qual); + p->name = 0; p->seq = p->rseq = p->qual = 0; + } + free(seed_w); free(w); + gap_destroy_stack(stack); +} + +#ifdef HAVE_PTHREAD +typedef struct { + int tid; + bwt_t *bwt; + int n_seqs; + bwa_seq_t *seqs; + const gap_opt_t *opt; +} thread_aux_t; + +static void *worker(void *data) +{ + thread_aux_t *d = (thread_aux_t*)data; + bwa_cal_sa_reg_gap(d->tid, d->bwt, d->n_seqs, d->seqs, d->opt); + return 0; +} +#endif + +bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa) +{ + bwa_seqio_t *ks; + if (mode & BWA_MODE_BAM) { // open BAM + int which = 0; + if (mode & BWA_MODE_BAM_SE) which |= 4; + if (mode & BWA_MODE_BAM_READ1) which |= 1; + if (mode & BWA_MODE_BAM_READ2) which |= 2; + if (which == 0) which = 7; // then read all reads + ks = bwa_bam_open(fn_fa, which); + } else ks = bwa_seq_open(fn_fa); + return ks; +} + +void bwa_aln_core(const char *prefix, const char *fn_fa, 
const gap_opt_t *opt) +{ + int i, n_seqs, tot_seqs = 0; + bwa_seq_t *seqs; + bwa_seqio_t *ks; + clock_t t; + bwt_t *bwt; + + // initialization + ks = bwa_open_reads(opt->mode, fn_fa); + + { // load BWT + char *str = (char*)calloc(strlen(prefix) + 10, 1); + strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str); + free(str); + } + + // core loop + err_fwrite(opt, sizeof(gap_opt_t), 1, stdout); + while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt->mode, opt->trim_qual)) != 0) { + tot_seqs += n_seqs; + t = clock(); + + fprintf(stderr, "[bwa_aln_core] calculate SA coordinate... "); + +#ifdef HAVE_PTHREAD + if (opt->n_threads <= 1) { // no multi-threading at all + bwa_cal_sa_reg_gap(0, bwt, n_seqs, seqs, opt); + } else { + pthread_t *tid; + pthread_attr_t attr; + thread_aux_t *data; + int j; + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + data = (thread_aux_t*)calloc(opt->n_threads, sizeof(thread_aux_t)); + tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t)); + for (j = 0; j < opt->n_threads; ++j) { + data[j].tid = j; data[j].bwt = bwt; + data[j].n_seqs = n_seqs; data[j].seqs = seqs; data[j].opt = opt; + pthread_create(&tid[j], &attr, worker, data + j); + } + for (j = 0; j < opt->n_threads; ++j) pthread_join(tid[j], 0); + free(data); free(tid); + } +#else + bwa_cal_sa_reg_gap(0, bwt, n_seqs, seqs, opt); +#endif + + fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); + + t = clock(); + fprintf(stderr, "[bwa_aln_core] write to the disk... 
"); + for (i = 0; i < n_seqs; ++i) { + bwa_seq_t *p = seqs + i; + err_fwrite(&p->n_aln, 4, 1, stdout); + if (p->n_aln) err_fwrite(p->aln, sizeof(bwt_aln1_t), p->n_aln, stdout); + } + fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); + + bwa_free_read_seq(n_seqs, seqs); + fprintf(stderr, "[bwa_aln_core] %d sequences have been processed.\n", tot_seqs); + } + + // destroy + bwt_destroy(bwt); + bwa_seq_close(ks); +} + +char *bwa_infer_prefix(const char *hint) +{ + char *prefix; + int l_hint; + FILE *fp; + l_hint = strlen(hint); + prefix = malloc(l_hint + 3 + 4 + 1); + strcpy(prefix, hint); + strcpy(prefix + l_hint, ".64.bwt"); + if ((fp = fopen(prefix, "rb")) != 0) { + fclose(fp); + prefix[l_hint + 3] = 0; + return prefix; + } else { + strcpy(prefix + l_hint, ".bwt"); + if ((fp = fopen(prefix, "rb")) == 0) { + free(prefix); + return 0; + } else { + fclose(fp); + prefix[l_hint] = 0; + return prefix; + } + } +} + +int bwa_aln(int argc, char *argv[]) +{ + int c, opte = -1; + gap_opt_t *opt; + char *prefix; + + opt = gap_init_opt(); + while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:cLR:m:t:NM:O:E:q:f:b012IYB:")) >= 0) { + switch (c) { + case 'n': + if (strstr(optarg, ".")) opt->fnr = atof(optarg), opt->max_diff = -1; + else opt->max_diff = atoi(optarg), opt->fnr = -1.0; + break; + case 'o': opt->max_gapo = atoi(optarg); break; + case 'e': opte = atoi(optarg); break; + case 'M': opt->s_mm = atoi(optarg); break; + case 'O': opt->s_gapo = atoi(optarg); break; + case 'E': opt->s_gape = atoi(optarg); break; + case 'd': opt->max_del_occ = atoi(optarg); break; + case 'i': opt->indel_end_skip = atoi(optarg); break; + case 'l': opt->seed_len = atoi(optarg); break; + case 'k': opt->max_seed_diff = atoi(optarg); break; + case 'm': opt->max_entries = atoi(optarg); break; + case 't': opt->n_threads = atoi(optarg); break; + case 'L': opt->mode |= BWA_MODE_LOGGAP; break; + case 'R': opt->max_top2 = atoi(optarg); break; + case 'q': opt->trim_qual = 
atoi(optarg); break; + case 'c': opt->mode &= ~BWA_MODE_COMPREAD; break; + case 'N': opt->mode |= BWA_MODE_NONSTOP; opt->max_top2 = 0x7fffffff; break; + case 'f': xreopen(optarg, "wb", stdout); break; + case 'b': opt->mode |= BWA_MODE_BAM; break; + case '0': opt->mode |= BWA_MODE_BAM_SE; break; + case '1': opt->mode |= BWA_MODE_BAM_READ1; break; + case '2': opt->mode |= BWA_MODE_BAM_READ2; break; + case 'I': opt->mode |= BWA_MODE_IL13; break; + case 'Y': opt->mode |= BWA_MODE_CFY; break; + case 'B': opt->mode |= atoi(optarg) << 24; break; + default: return 1; + } + } + if (opte > 0) { + opt->max_gape = opte; + opt->mode &= ~BWA_MODE_GAPE; + } + + if (optind + 2 > argc) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: bwa aln [options] \n\n"); + fprintf(stderr, "Options: -n NUM max #diff (int) or missing prob under %.2f err rate (float) [%.2f]\n", + BWA_AVG_ERR, opt->fnr); + fprintf(stderr, " -o INT maximum number or fraction of gap opens [%d]\n", opt->max_gapo); + fprintf(stderr, " -e INT maximum number of gap extensions, -1 for disabling long gaps [-1]\n"); + fprintf(stderr, " -i INT do not put an indel within INT bp towards the ends [%d]\n", opt->indel_end_skip); + fprintf(stderr, " -d INT maximum occurrences for extending a long deletion [%d]\n", opt->max_del_occ); + fprintf(stderr, " -l INT seed length [%d]\n", opt->seed_len); + fprintf(stderr, " -k INT maximum differences in the seed [%d]\n", opt->max_seed_diff); + fprintf(stderr, " -m INT maximum entries in the queue [%d]\n", opt->max_entries); + fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); + fprintf(stderr, " -M INT mismatch penalty [%d]\n", opt->s_mm); + fprintf(stderr, " -O INT gap open penalty [%d]\n", opt->s_gapo); + fprintf(stderr, " -E INT gap extension penalty [%d]\n", opt->s_gape); + fprintf(stderr, " -R INT stop searching when there are >INT equally best hits [%d]\n", opt->max_top2); + fprintf(stderr, " -q INT quality threshold for read trimming down to %dbp [%d]\n", 
BWA_MIN_RDLEN, opt->trim_qual); + fprintf(stderr, " -f FILE file to write output to instead of stdout\n"); + fprintf(stderr, " -B INT length of barcode\n"); +// fprintf(stderr, " -c input sequences are in the color space\n"); + fprintf(stderr, " -L log-scaled gap penalty for long deletions\n"); + fprintf(stderr, " -N non-iterative mode: search for all n-difference hits (slooow)\n"); + fprintf(stderr, " -I the input is in the Illumina 1.3+ FASTQ-like format\n"); + fprintf(stderr, " -b the input read file is in the BAM format\n"); + fprintf(stderr, " -0 use single-end reads only (effective with -b)\n"); + fprintf(stderr, " -1 use the 1st read in a pair (effective with -b)\n"); + fprintf(stderr, " -2 use the 2nd read in a pair (effective with -b)\n"); + fprintf(stderr, " -Y filter Casava-filtered sequences\n"); + fprintf(stderr, "\n"); + return 1; + } + if (opt->fnr > 0.0) { + int i, k; + for (i = 17, k = 0; i <= 250; ++i) { + int l = bwa_cal_maxdiff(i, BWA_AVG_ERR, opt->fnr); + if (l != k) fprintf(stderr, "[bwa_aln] %dbp reads: max_diff = %d\n", i, l); + k = l; + } + } + if ((prefix = bwa_infer_prefix(argv[optind])) == 0) { + fprintf(stderr, "[%s] fail to locate the index\n", __func__); + free(opt); + return 0; + } + bwa_aln_core(prefix, argv[optind+1], opt); + free(opt); free(prefix); + return 0; +} + +/* rgoya: Temporary clone of aln_path2cigar to accomodate for bwa_cigar_t, +__cigar_op and __cigar_len while keeping stdaln stand alone */ +bwa_cigar_t *bwa_aln_path2cigar(const path_t *path, int path_len, int *n_cigar) +{ + uint32_t *cigar32; + bwa_cigar_t *cigar; + int i; + cigar32 = aln_path2cigar32((path_t*) path, path_len, n_cigar); + cigar = (bwa_cigar_t*)cigar32; + for (i = 0; i < *n_cigar; ++i) + cigar[i] = __cigar_create( (cigar32[i]&0xf), (cigar32[i]>>4) ); + return cigar; +} + diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/bwtaln.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/bwtaln.h Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,153 @@ +#ifndef 
BWTALN_H +#define BWTALN_H + +#include +#include "bwt.h" + +#define BWA_TYPE_NO_MATCH 0 +#define BWA_TYPE_UNIQUE 1 +#define BWA_TYPE_REPEAT 2 +#define BWA_TYPE_MATESW 3 + +#define SAM_FPD 1 // paired +#define SAM_FPP 2 // properly paired +#define SAM_FSU 4 // self-unmapped +#define SAM_FMU 8 // mate-unmapped +#define SAM_FSR 16 // self on the reverse strand +#define SAM_FMR 32 // mate on the reverse strand +#define SAM_FR1 64 // this is read one +#define SAM_FR2 128 // this is read two +#define SAM_FSC 256 // secondary alignment + +#define BWA_AVG_ERR 0.02 +#define BWA_MIN_RDLEN 35 // for read trimming + +#define BWA_MAX_BCLEN 63 // maximum barcode length; 127 is the maximum + +#ifndef bns_pac +#define bns_pac(pac, k) ((pac)[(k)>>2] >> ((~(k)&3)<<1) & 3) +#endif + +typedef struct { + bwtint_t w; + int bid; +} bwt_width_t; + +typedef struct { + uint32_t n_mm:16, n_gapo:8, n_gape:8; + int score; + bwtint_t k, l; +} bwt_aln1_t; + +typedef uint16_t bwa_cigar_t; +/* rgoya: If changing order of bytes, beware of operations like: + * s->cigar[0] += s->full_len - s->len; + */ +#define CIGAR_OP_SHIFT 14 +#define CIGAR_LN_MASK 0x3fff + +#define __cigar_op(__cigar) ((__cigar)>>CIGAR_OP_SHIFT) +#define __cigar_len(__cigar) ((__cigar)&CIGAR_LN_MASK) +#define __cigar_create(__op, __len) ((__op)< +#include +#include +#include "bwtgap.h" +#include "bwtaln.h" + +#define STATE_M 0 +#define STATE_I 1 +#define STATE_D 2 + +#define aln_score(m,o,e,p) ((m)*(p)->s_mm + (o)*(p)->s_gapo + (e)*(p)->s_gape) + +gap_stack_t *gap_init_stack2(int max_score) +{ + gap_stack_t *stack; + stack = (gap_stack_t*)calloc(1, sizeof(gap_stack_t)); + stack->n_stacks = max_score; + stack->stacks = (gap_stack1_t*)calloc(stack->n_stacks, sizeof(gap_stack1_t)); + return stack; +} + +gap_stack_t *gap_init_stack(int max_mm, int max_gapo, int max_gape, const gap_opt_t *opt) +{ + return gap_init_stack2(aln_score(max_mm+1, max_gapo+1, max_gape+1, opt)); +} + +void gap_destroy_stack(gap_stack_t *stack) +{ + int i; + 
for (i = 0; i != stack->n_stacks; ++i) free(stack->stacks[i].stack); + free(stack->stacks); + free(stack); +} + +static void gap_reset_stack(gap_stack_t *stack) +{ + int i; + for (i = 0; i != stack->n_stacks; ++i) + stack->stacks[i].n_entries = 0; + stack->best = stack->n_stacks; + stack->n_entries = 0; +} + +static inline void gap_push(gap_stack_t *stack, int i, bwtint_t k, bwtint_t l, int n_mm, int n_gapo, int n_gape, + int state, int is_diff, const gap_opt_t *opt) +{ + int score; + gap_entry_t *p; + gap_stack1_t *q; + score = aln_score(n_mm, n_gapo, n_gape, opt); + q = stack->stacks + score; + if (q->n_entries == q->m_entries) { + q->m_entries = q->m_entries? q->m_entries<<1 : 4; + q->stack = (gap_entry_t*)realloc(q->stack, sizeof(gap_entry_t) * q->m_entries); + } + p = q->stack + q->n_entries; + p->info = (u_int32_t)score<<21 | i; p->k = k; p->l = l; + p->n_mm = n_mm; p->n_gapo = n_gapo; p->n_gape = n_gape; p->state = state; + p->last_diff_pos = is_diff? i : 0; + ++(q->n_entries); + ++(stack->n_entries); + if (stack->best > score) stack->best = score; +} + +static inline void gap_pop(gap_stack_t *stack, gap_entry_t *e) +{ + gap_stack1_t *q; + q = stack->stacks + stack->best; + *e = q->stack[q->n_entries - 1]; + --(q->n_entries); + --(stack->n_entries); + if (q->n_entries == 0 && stack->n_entries) { // reset best + int i; + for (i = stack->best + 1; i < stack->n_stacks; ++i) + if (stack->stacks[i].n_entries != 0) break; + stack->best = i; + } else if (stack->n_entries == 0) stack->best = stack->n_stacks; +} + +static inline void gap_shadow(int x, int len, bwtint_t max, int last_diff_pos, bwt_width_t *w) +{ + int i, j; + for (i = j = 0; i < last_diff_pos; ++i) { + if (w[i].w > x) w[i].w -= x; + else if (w[i].w == x) { + w[i].bid = 1; + w[i].w = max - (++j); + } // else should not happen + } +} + +static inline int int_log2(uint32_t v) +{ + int c = 0; + if (v & 0xffff0000u) { v >>= 16; c |= 16; } + if (v & 0xff00) { v >>= 8; c |= 8; } + if (v & 0xf0) { v >>= 4; c 
|= 4; } + if (v & 0xc) { v >>= 2; c |= 2; } + if (v & 0x2) c |= 1; + return c; +} + +bwt_aln1_t *bwt_match_gap(bwt_t *const bwt, int len, const ubyte_t *seq, bwt_width_t *width, + bwt_width_t *seed_width, const gap_opt_t *opt, int *_n_aln, gap_stack_t *stack) +{ + int best_score = aln_score(opt->max_diff+1, opt->max_gapo+1, opt->max_gape+1, opt); + int best_diff = opt->max_diff + 1, max_diff = opt->max_diff; + int best_cnt = 0; + int max_entries = 0, j, _j, n_aln, m_aln; + bwt_aln1_t *aln; + + m_aln = 4; n_aln = 0; + aln = (bwt_aln1_t*)calloc(m_aln, sizeof(bwt_aln1_t)); + + // check whether there are too many N + for (j = _j = 0; j < len; ++j) + if (seq[j] > 3) ++_j; + if (_j > max_diff) { + *_n_aln = n_aln; + return aln; + } + + //for (j = 0; j != len; ++j) printf("#0 %d: [%d,%u]\t[%d,%u]\n", j, w[0][j].bid, w[0][j].w, w[1][j].bid, w[1][j].w); + gap_reset_stack(stack); // reset stack + gap_push(stack, len, 0, bwt->seq_len, 0, 0, 0, 0, 0, opt); + + while (stack->n_entries) { + gap_entry_t e; + int i, m, m_seed = 0, hit_found, allow_diff, allow_M, tmp; + bwtint_t k, l, cnt_k[4], cnt_l[4], occ; + + if (max_entries < stack->n_entries) max_entries = stack->n_entries; + if (stack->n_entries > opt->max_entries) break; + gap_pop(stack, &e); // get the best entry + k = e.k; l = e.l; // SA interval + i = e.info&0xffff; // length + if (!(opt->mode & BWA_MODE_NONSTOP) && e.info>>21 > best_score + opt->s_mm) break; // no need to proceed + + m = max_diff - (e.n_mm + e.n_gapo); + if (opt->mode & BWA_MODE_GAPE) m -= e.n_gape; + if (m < 0) continue; + if (seed_width) { // apply seeding + m_seed = opt->max_seed_diff - (e.n_mm + e.n_gapo); + if (opt->mode & BWA_MODE_GAPE) m_seed -= e.n_gape; + } + //printf("#1\t[%d,%d,%d,%c]\t[%d,%d,%d]\t[%u,%u]\t[%u,%u]\t%d\n", stack->n_entries, a, i, "MID"[e.state], e.n_mm, e.n_gapo, e.n_gape, width[i-1].bid, width[i-1].w, k, l, e.last_diff_pos); + if (i > 0 && m < width[i-1].bid) continue; + + // check whether a hit is found + hit_found = 0; + if 
(i == 0) hit_found = 1; + else if (m == 0 && (e.state == STATE_M || (opt->mode&BWA_MODE_GAPE) || e.n_gape == opt->max_gape)) { // no diff allowed + if (bwt_match_exact_alt(bwt, i, seq, &k, &l)) hit_found = 1; + else continue; // no hit, skip + } + + if (hit_found) { // action for found hits + int score = aln_score(e.n_mm, e.n_gapo, e.n_gape, opt); + int do_add = 1; + //printf("#2 hits found: %d:(%u,%u)\n", e.n_mm+e.n_gapo, k, l); + if (n_aln == 0) { + best_score = score; + best_diff = e.n_mm + e.n_gapo; + if (opt->mode & BWA_MODE_GAPE) best_diff += e.n_gape; + if (!(opt->mode & BWA_MODE_NONSTOP)) + max_diff = (best_diff + 1 > opt->max_diff)? opt->max_diff : best_diff + 1; // top2 behaviour + } + if (score == best_score) best_cnt += l - k + 1; + else if (best_cnt > opt->max_top2) break; // top2b behaviour + if (e.n_gapo) { // check whether the hit has been found. this may happen when a gap occurs in a tandem repeat + for (j = 0; j != n_aln; ++j) + if (aln[j].k == k && aln[j].l == l) break; + if (j < n_aln) do_add = 0; + } + if (do_add) { // append + bwt_aln1_t *p; + gap_shadow(l - k + 1, len, bwt->seq_len, e.last_diff_pos, width); + if (n_aln == m_aln) { + m_aln <<= 1; + aln = (bwt_aln1_t*)realloc(aln, m_aln * sizeof(bwt_aln1_t)); + memset(aln + m_aln/2, 0, m_aln/2*sizeof(bwt_aln1_t)); + } + p = aln + n_aln; + p->n_mm = e.n_mm; p->n_gapo = e.n_gapo; p->n_gape = e.n_gape; + p->k = k; p->l = l; + p->score = score; + ++n_aln; + } + continue; + } + + --i; + bwt_2occ4(bwt, k - 1, l, cnt_k, cnt_l); // retrieve Occ values + occ = l - k + 1; + // test whether diff is allowed + allow_diff = allow_M = 1; + if (i > 0) { + int ii = i - (len - opt->seed_len); + if (width[i-1].bid > m-1) allow_diff = 0; + else if (width[i-1].bid == m-1 && width[i].bid == m-1 && width[i-1].w == width[i].w) allow_M = 0; + if (seed_width && ii > 0) { + if (seed_width[ii-1].bid > m_seed-1) allow_diff = 0; + else if (seed_width[ii-1].bid == m_seed-1 && seed_width[ii].bid == m_seed-1 + && 
seed_width[ii-1].w == seed_width[ii].w) allow_M = 0; + } + } + // indels + tmp = (opt->mode & BWA_MODE_LOGGAP)? int_log2(e.n_gape + e.n_gapo)/2+1 : e.n_gapo + e.n_gape; + if (allow_diff && i >= opt->indel_end_skip + tmp && len - i >= opt->indel_end_skip + tmp) { + if (e.state == STATE_M) { // gap open + if (e.n_gapo < opt->max_gapo) { // gap open is allowed + // insertion + gap_push(stack, i, k, l, e.n_mm, e.n_gapo + 1, e.n_gape, STATE_I, 1, opt); + // deletion + for (j = 0; j != 4; ++j) { + k = bwt->L2[j] + cnt_k[j] + 1; + l = bwt->L2[j] + cnt_l[j]; + if (k <= l) gap_push(stack, i + 1, k, l, e.n_mm, e.n_gapo + 1, e.n_gape, STATE_D, 1, opt); + } + } + } else if (e.state == STATE_I) { // extention of an insertion + if (e.n_gape < opt->max_gape) // gap extention is allowed + gap_push(stack, i, k, l, e.n_mm, e.n_gapo, e.n_gape + 1, STATE_I, 1, opt); + } else if (e.state == STATE_D) { // extention of a deletion + if (e.n_gape < opt->max_gape) { // gap extention is allowed + if (e.n_gape + e.n_gapo < max_diff || occ < opt->max_del_occ) { + for (j = 0; j != 4; ++j) { + k = bwt->L2[j] + cnt_k[j] + 1; + l = bwt->L2[j] + cnt_l[j]; + if (k <= l) gap_push(stack, i + 1, k, l, e.n_mm, e.n_gapo, e.n_gape + 1, STATE_D, 1, opt); + } + } + } + } + } + // mismatches + if (allow_diff && allow_M) { // mismatch is allowed + for (j = 1; j <= 4; ++j) { + int c = (seq[i] + j) & 3; + int is_mm = (j != 4 || seq[i] > 3); + k = bwt->L2[c] + cnt_k[c] + 1; + l = bwt->L2[c] + cnt_l[c]; + if (k <= l) gap_push(stack, i, k, l, e.n_mm + is_mm, e.n_gapo, e.n_gape, STATE_M, is_mm, opt); + } + } else if (seq[i] < 4) { // try exact match only + int c = seq[i] & 3; + k = bwt->L2[c] + cnt_k[c] + 1; + l = bwt->L2[c] + cnt_l[c]; + if (k <= l) gap_push(stack, i, k, l, e.n_mm, e.n_gapo, e.n_gape, STATE_M, 0, opt); + } + } + + *_n_aln = n_aln; + //fprintf(stderr, "max_entries = %d\n", max_entries); + return aln; +} diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/bwtgap.h --- /dev/null Thu Jan 01 00:00:00 1970 
+0000 +++ b/bwa-0.6.2/bwtgap.h Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,39 @@ +#ifndef BWTGAP_H_ +#define BWTGAP_H_ + +#include "bwt.h" +#include "bwtaln.h" + +typedef struct { // recursion stack + u_int32_t info; // score<<21 | i + u_int32_t n_mm:8, n_gapo:8, n_gape:8, state:2, n_seed_mm:6; + bwtint_t k, l; // (k,l) is the SA region of [i,n-1] + int last_diff_pos; +} gap_entry_t; + +typedef struct { + int n_entries, m_entries; + gap_entry_t *stack; +} gap_stack1_t; + +typedef struct { + int n_stacks, best, n_entries; + gap_stack1_t *stacks; +} gap_stack_t; + +#ifdef __cplusplus +extern "C" { +#endif + + gap_stack_t *gap_init_stack2(int max_score); + gap_stack_t *gap_init_stack(int max_mm, int max_gapo, int max_gape, const gap_opt_t *opt); + void gap_destroy_stack(gap_stack_t *stack); + bwt_aln1_t *bwt_match_gap(bwt_t *const bwt, int len, const ubyte_t *seq, bwt_width_t *w, + bwt_width_t *seed_w, const gap_opt_t *opt, int *_n_aln, gap_stack_t *stack); + void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s); + +#ifdef __cplusplus +} +#endif + +#endif diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/bwtindex.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/bwtindex.c Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,158 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li */ + +#include +#include +#include +#include +#include +#include +#include "bntseq.h" +#include "bwt.h" +#include "main.h" +#include "utils.h" + +bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is); +void bwa_pac_rev_core(const char *fn, const char *fn_rev); + +int bwa_index(int argc, char *argv[]) +{ + char *prefix = 0, *str, *str2, *str3; + int c, algo_type = 0, is_color = 0, is_64 = 0; + clock_t t; + int64_t l_pac; + + while ((c = getopt(argc, argv, "6ca:p:")) >= 0) { + switch (c) { + case 'a': // if -a is not set, algo_type will be determined later + if (strcmp(optarg, "div") == 0) algo_type = 1; + else if (strcmp(optarg, "bwtsw") == 0) algo_type = 2; + else if (strcmp(optarg, "is") == 0) algo_type = 3; + else err_fatal(__func__, "unknown algorithm: '%s'.", optarg); + break; + case 'p': prefix = strdup(optarg); break; + case 'c': is_color = 1; break; + case '6': is_64 = 1; break; + default: return 1; + } + } + + if (optind + 1 > argc) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: bwa index [-a bwtsw|is] [-c] \n\n"); + fprintf(stderr, "Options: -a STR BWT construction algorithm: bwtsw or is [auto]\n"); + fprintf(stderr, " -p STR prefix of the index [same as fasta name]\n"); + fprintf(stderr, " -6 index files named as .64.* instead of .* \n"); +// fprintf(stderr, " -c build color-space index\n"); + fprintf(stderr, "\n"); + fprintf(stderr, "Warning: `-a bwtsw' does not work for short genomes, while `-a is' and\n"); + fprintf(stderr, " `-a div' do not work not 
for long genomes. Please choose `-a'\n"); + fprintf(stderr, " according to the length of the genome.\n\n"); + return 1; + } + if (prefix == 0) { + prefix = malloc(strlen(argv[optind]) + 4); + strcpy(prefix, argv[optind]); + if (is_64) strcat(prefix, ".64"); + } + str = (char*)calloc(strlen(prefix) + 10, 1); + str2 = (char*)calloc(strlen(prefix) + 10, 1); + str3 = (char*)calloc(strlen(prefix) + 10, 1); + + if (is_color == 0) { // nucleotide indexing + gzFile fp = xzopen(argv[optind], "r"); + t = clock(); + fprintf(stderr, "[bwa_index] Pack FASTA... "); + l_pac = bns_fasta2bntseq(fp, prefix, 0); + fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); + gzclose(fp); + } else { // color indexing + gzFile fp = xzopen(argv[optind], "r"); + strcat(strcpy(str, prefix), ".nt"); + t = clock(); + fprintf(stderr, "[bwa_index] Pack nucleotide FASTA... "); + l_pac = bns_fasta2bntseq(fp, str, 0); + fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); + gzclose(fp); + { + char *tmp_argv[3]; + tmp_argv[0] = argv[0]; tmp_argv[1] = str; tmp_argv[2] = prefix; + t = clock(); + fprintf(stderr, "[bwa_index] Convert nucleotide PAC to color PAC... "); + bwa_pac2cspac(3, tmp_argv); + fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); + } + } + if (algo_type == 0) algo_type = l_pac > 50000000? 2 : 3; // set the algorithm for generating BWT + { + strcpy(str, prefix); strcat(str, ".pac"); + strcpy(str2, prefix); strcat(str2, ".bwt"); + t = clock(); + fprintf(stderr, "[bwa_index] Construct BWT for the packed sequence...\n"); + if (algo_type == 2) bwt_bwtgen(str, str2); + else if (algo_type == 1 || algo_type == 3) { + bwt_t *bwt; + bwt = bwt_pac2bwt(str, algo_type == 3); + bwt_dump_bwt(str2, bwt); + bwt_destroy(bwt); + } + fprintf(stderr, "[bwa_index] %.2f seconds elapse.\n", (float)(clock() - t) / CLOCKS_PER_SEC); + } + { + bwt_t *bwt; + strcpy(str, prefix); strcat(str, ".bwt"); + t = clock(); + fprintf(stderr, "[bwa_index] Update BWT... 
"); + bwt = bwt_restore_bwt(str); + bwt_bwtupdate_core(bwt); + bwt_dump_bwt(str, bwt); + bwt_destroy(bwt); + fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); + } + { + gzFile fp = xzopen(argv[optind], "r"); + t = clock(); + fprintf(stderr, "[bwa_index] Pack forward-only FASTA... "); + l_pac = bns_fasta2bntseq(fp, prefix, 1); + fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); + gzclose(fp); + } + { + bwt_t *bwt; + strcpy(str, prefix); strcat(str, ".bwt"); + strcpy(str3, prefix); strcat(str3, ".sa"); + t = clock(); + fprintf(stderr, "[bwa_index] Construct SA from BWT and Occ... "); + bwt = bwt_restore_bwt(str); + bwt_cal_sa(bwt, 32); + bwt_dump_sa(str3, bwt); + bwt_destroy(bwt); + fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); + } + free(str3); free(str2); free(str); free(prefix); + return 0; +} diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/bwtio.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/bwtio.c Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,77 @@ +#include +#include +#include +#include "bwt.h" +#include "utils.h" + +void bwt_dump_bwt(const char *fn, const bwt_t *bwt) +{ + FILE *fp; + fp = xopen(fn, "wb"); + fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp); + fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp); + fwrite(bwt->bwt, 4, bwt->bwt_size, fp); + fclose(fp); +} + +void bwt_dump_sa(const char *fn, const bwt_t *bwt) +{ + FILE *fp; + fp = xopen(fn, "wb"); + fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp); + fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp); + fwrite(&bwt->sa_intv, sizeof(bwtint_t), 1, fp); + fwrite(&bwt->seq_len, sizeof(bwtint_t), 1, fp); + fwrite(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp); + fclose(fp); +} + +void bwt_restore_sa(const char *fn, bwt_t *bwt) +{ + char skipped[256]; + FILE *fp; + bwtint_t primary; + + fp = xopen(fn, "rb"); + fread(&primary, sizeof(bwtint_t), 1, fp); + xassert(primary == bwt->primary, "SA-BWT inconsistency: primary is not the same."); + 
fread(skipped, sizeof(bwtint_t), 4, fp); // skip + fread(&bwt->sa_intv, sizeof(bwtint_t), 1, fp); + fread(&primary, sizeof(bwtint_t), 1, fp); + xassert(primary == bwt->seq_len, "SA-BWT inconsistency: seq_len is not the same."); + + bwt->n_sa = (bwt->seq_len + bwt->sa_intv) / bwt->sa_intv; + bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t)); + bwt->sa[0] = -1; + + fread(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp); + fclose(fp); +} + +bwt_t *bwt_restore_bwt(const char *fn) +{ + bwt_t *bwt; + FILE *fp; + + bwt = (bwt_t*)calloc(1, sizeof(bwt_t)); + fp = xopen(fn, "rb"); + fseek(fp, 0, SEEK_END); + bwt->bwt_size = (ftell(fp) - sizeof(bwtint_t) * 5) >> 2; + bwt->bwt = (uint32_t*)calloc(bwt->bwt_size, 4); + fseek(fp, 0, SEEK_SET); + fread(&bwt->primary, sizeof(bwtint_t), 1, fp); + fread(bwt->L2+1, sizeof(bwtint_t), 4, fp); + fread(bwt->bwt, 4, bwt->bwt_size, fp); + bwt->seq_len = bwt->L2[4]; + fclose(fp); + bwt_gen_cnt_table(bwt); + + return bwt; +} + +void bwt_destroy(bwt_t *bwt) +{ + if (bwt == 0) return; + free(bwt->sa); free(bwt->bwt); + free(bwt); +} diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/bwtmisc.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/bwtmisc.c Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,230 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li */ + +#include +#include +#include +#include +#include "bntseq.h" +#include "utils.h" +#include "main.h" +#include "bwt.h" + +#ifdef _DIVBWT +#include "divsufsort.h" +#endif + +int is_bwt(ubyte_t *T, int n); + +int64_t bwa_seq_len(const char *fn_pac) +{ + FILE *fp; + int64_t pac_len; + ubyte_t c; + fp = xopen(fn_pac, "rb"); + fseek(fp, -1, SEEK_END); + pac_len = ftell(fp); + fread(&c, 1, 1, fp); + fclose(fp); + return (pac_len - 1) * 4 + (int)c; +} + +bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is) +{ + bwt_t *bwt; + ubyte_t *buf, *buf2; + int i, pac_size; + FILE *fp; + + // initialization + bwt = (bwt_t*)calloc(1, sizeof(bwt_t)); + bwt->seq_len = bwa_seq_len(fn_pac); + bwt->bwt_size = (bwt->seq_len + 15) >> 4; + fp = xopen(fn_pac, "rb"); + + // prepare sequence + pac_size = (bwt->seq_len>>2) + ((bwt->seq_len&3) == 0? 
0 : 1); + buf2 = (ubyte_t*)calloc(pac_size, 1); + fread(buf2, 1, pac_size, fp); + fclose(fp); + memset(bwt->L2, 0, 5 * 4); + buf = (ubyte_t*)calloc(bwt->seq_len + 1, 1); + for (i = 0; i < bwt->seq_len; ++i) { + buf[i] = buf2[i>>2] >> ((3 - (i&3)) << 1) & 3; + ++bwt->L2[1+buf[i]]; + } + for (i = 2; i <= 4; ++i) bwt->L2[i] += bwt->L2[i-1]; + free(buf2); + + // Burrows-Wheeler Transform + if (use_is) { + bwt->primary = is_bwt(buf, bwt->seq_len); + } else { +#ifdef _DIVBWT + bwt->primary = divbwt(buf, buf, 0, bwt->seq_len); +#else + err_fatal_simple("libdivsufsort is not compiled in."); +#endif + } + bwt->bwt = (u_int32_t*)calloc(bwt->bwt_size, 4); + for (i = 0; i < bwt->seq_len; ++i) + bwt->bwt[i>>4] |= buf[i] << ((15 - (i&15)) << 1); + free(buf); + return bwt; +} + +int bwa_pac2bwt(int argc, char *argv[]) +{ + bwt_t *bwt; + int c, use_is = 1; + while ((c = getopt(argc, argv, "d")) >= 0) { + switch (c) { + case 'd': use_is = 0; break; + default: return 1; + } + } + if (optind + 2 > argc) { + fprintf(stderr, "Usage: bwa pac2bwt [-d] \n"); + return 1; + } + bwt = bwt_pac2bwt(argv[optind], use_is); + bwt_dump_bwt(argv[optind+1], bwt); + bwt_destroy(bwt); + return 0; +} + +#define bwt_B00(b, k) ((b)->bwt[(k)>>4]>>((~(k)&0xf)<<1)&3) + +void bwt_bwtupdate_core(bwt_t *bwt) +{ + bwtint_t i, k, c[4], n_occ; + uint32_t *buf; + + n_occ = (bwt->seq_len + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; + bwt->bwt_size += n_occ * sizeof(bwtint_t); // the new size + buf = (uint32_t*)calloc(bwt->bwt_size, 4); // will be the new bwt + c[0] = c[1] = c[2] = c[3] = 0; + for (i = k = 0; i < bwt->seq_len; ++i) { + if (i % OCC_INTERVAL == 0) { + memcpy(buf + k, c, sizeof(bwtint_t) * 4); + k += sizeof(bwtint_t); // in fact: sizeof(bwtint_t)=4*(sizeof(bwtint_t)/4) + } + if (i % 16 == 0) buf[k++] = bwt->bwt[i/16]; // 16 == sizeof(uint32_t)/2 + ++c[bwt_B00(bwt, i)]; + } + // the last element + memcpy(buf + k, c, sizeof(bwtint_t) * 4); + xassert(k + sizeof(bwtint_t) == bwt->bwt_size, "inconsistent 
bwt_size"); + // update bwt + free(bwt->bwt); bwt->bwt = buf; +} + +int bwa_bwtupdate(int argc, char *argv[]) +{ + bwt_t *bwt; + if (argc < 2) { + fprintf(stderr, "Usage: bwa bwtupdate \n"); + return 1; + } + bwt = bwt_restore_bwt(argv[1]); + bwt_bwtupdate_core(bwt); + bwt_dump_bwt(argv[1], bwt); + bwt_destroy(bwt); + return 0; +} + +const int nst_color_space_table[] = { 4, 0, 0, 1, 0, 2, 3, 4, 0, 3, 2, 4, 1, 4, 4, 4}; + +/* this function is not memory efficient, but this will make life easier + Ideally we should also change .amb files as one 'N' in the nucleotide + sequence leads to two ambiguous colors. I may do this later... */ +uint8_t *bwa_pac2cspac_core(const bntseq_t *bns) +{ + uint8_t *pac, *cspac; + bwtint_t i; + int c1, c2; + pac = (uint8_t*)calloc(bns->l_pac/4 + 1, 1); + cspac = (uint8_t*)calloc(bns->l_pac/4 + 1, 1); + fread(pac, 1, bns->l_pac/4+1, bns->fp_pac); + rewind(bns->fp_pac); + c1 = pac[0]>>6; cspac[0] = c1<<6; + for (i = 1; i < bns->l_pac; ++i) { + c2 = pac[i>>2] >> (~i&3)*2 & 3; + cspac[i>>2] |= nst_color_space_table[(1< \n"); + return 1; + } + bns = bns_restore(argv[1]); + cspac = bwa_pac2cspac_core(bns); + bns_dump(bns, argv[2]); + // now write cspac + str = (char*)calloc(strlen(argv[2]) + 5, 1); + strcat(strcpy(str, argv[2]), ".pac"); + fp = xopen(str, "wb"); + fwrite(cspac, 1, bns->l_pac/4 + 1, fp); + ct = bns->l_pac % 4; + fwrite(&ct, 1, 1, fp); + fclose(fp); + bns_destroy(bns); + free(cspac); + return 0; +} + +int bwa_bwt2sa(int argc, char *argv[]) +{ + bwt_t *bwt; + int c, sa_intv = 32; + while ((c = getopt(argc, argv, "i:")) >= 0) { + switch (c) { + case 'i': sa_intv = atoi(optarg); break; + default: return 1; + } + } + if (optind + 2 > argc) { + fprintf(stderr, "Usage: bwa bwt2sa [-i %d] \n", sa_intv); + return 1; + } + bwt = bwt_restore_bwt(argv[optind]); + bwt_cal_sa(bwt, sa_intv); + bwt_dump_sa(argv[optind+1], bwt); + bwt_destroy(bwt); + return 0; +} diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/bwtsw2.h --- /dev/null Thu Jan 01 
00:00:00 1970 +0000 +++ b/bwa-0.6.2/bwtsw2.h Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,69 @@ +#ifndef LH3_BWTSW2_H +#define LH3_BWTSW2_H + +#include +#include "bntseq.h" +#include "bwt_lite.h" +#include "bwt.h" + +#define BSW2_FLAG_MATESW 0x100 +#define BSW2_FLAG_TANDEM 0x200 +#define BSW2_FLAG_MOVED 0x400 +#define BSW2_FLAG_RESCUED 0x800 + +typedef struct { + int skip_sw:16, hard_clip:16; + int a, b, q, r, t, qr, bw, max_ins; + int z, is, t_seeds, multi_2nd; + float mask_level, coef; + int n_threads, chunk_size; +} bsw2opt_t; + +typedef struct { + bwtint_t k, l; + uint32_t flag:18, n_seeds:13, is_rev:1; + int len, G, G2; + int beg, end; +} bsw2hit_t; + +typedef struct { + int flag, nn, n_cigar, chr, pos, qual, mchr, mpos, pqual, isize, nm; + uint32_t *cigar; +} bsw2aux_t; + +typedef struct { + int n, max; + bsw2hit_t *hits; + bsw2aux_t *aux; +} bwtsw2_t; + +typedef struct { + void *stack; + int max_l; + uint8_t *aln_mem; +} bsw2global_t; + +typedef struct { + int l, tid; + char *name, *seq, *qual, *sam; +} bsw2seq1_t; + +#ifdef __cplusplus +extern "C" { +#endif + + bsw2opt_t *bsw2_init_opt(); + bwtsw2_t **bsw2_core(const bntseq_t *bns, const bsw2opt_t *opt, const bwtl_t *target, const bwt_t *query, bsw2global_t *pool); + void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, const char *fn, const char *fn2); + void bsw2_destroy(bwtsw2_t *b); + + bsw2global_t *bsw2_global_init(); + void bsw2_global_destroy(bsw2global_t *_pool); + + void bsw2_pair(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, int n, bsw2seq1_t *seq, bwtsw2_t **hit); + +#ifdef __cplusplus +} +#endif + +#endif diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/bwtsw2_aux.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/bwtsw2_aux.c Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,821 @@ +#include +#include +#include +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif +#ifdef HAVE_PTHREAD +#include +#endif +#include "bntseq.h" +#include "bwt_lite.h" +#include 
"utils.h" +#include "bwtsw2.h" +#include "stdaln.h" +#include "kstring.h" + +#include "kseq.h" +KSEQ_INIT(gzFile, gzread) + +#include "ksort.h" +#define __left_lt(a, b) ((a).end > (b).end) +KSORT_INIT(hit, bsw2hit_t, __left_lt) + +extern unsigned char nst_nt4_table[256]; + +unsigned char nt_comp_table[256] = { + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', + 'N','T','V','G', 'H','N','N','C', 'D','N','N','M', 'N','K','N','N', + 'N','N','Y','S', 'A','N','B','W', 'X','R','N','N', 'N','N','N','N', + 'n','t','v','g', 'h','n','n','c', 'd','n','n','m', 'n','k','n','n', + 'n','n','y','s', 'a','n','b','w', 'x','r','n','N', 'N','N','N','N', + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N' +}; + +extern int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int IS); +extern int bsw2_resolve_query_overlaps(bwtsw2_t *b, float mask_level); + +bsw2opt_t *bsw2_init_opt() +{ + bsw2opt_t *o = (bsw2opt_t*)calloc(1, sizeof(bsw2opt_t)); + o->a = 1; o->b = 3; o->q = 5; o->r = 2; o->t = 30; + o->bw = 50; + o->max_ins = 20000; + o->z = 1; o->is = 3; o->t_seeds = 5; o->hard_clip = 0; o->skip_sw = 0; + o->mask_level = 0.50f; o->coef = 5.5f; + o->qr = o->q + o->r; o->n_threads = 1; o->chunk_size = 10000000; + return o; +} + +void 
bsw2_destroy(bwtsw2_t *b) +{ + int i; + if (b == 0) return; + if (b->aux) + for (i = 0; i < b->n; ++i) free(b->aux[i].cigar); + free(b->aux); free(b->hits); + free(b); +} + +bwtsw2_t *bsw2_dup_no_cigar(const bwtsw2_t *b) +{ + bwtsw2_t *p; + p = calloc(1, sizeof(bwtsw2_t)); + p->max = p->n = b->n; + if (b->n) { + kroundup32(p->max); + p->hits = calloc(p->max, sizeof(bsw2hit_t)); + memcpy(p->hits, b->hits, p->n * sizeof(bsw2hit_t)); + } + return p; +} + +#define __gen_ap(par, opt) do { \ + int i; \ + for (i = 0; i < 25; ++i) (par).matrix[i] = -(opt)->b; \ + for (i = 0; i < 4; ++i) (par).matrix[i*5+i] = (opt)->a; \ + (par).gap_open = (opt)->q; (par).gap_ext = (opt)->r; \ + (par).gap_end = (opt)->r; \ + (par).row = 5; (par).band_width = opt->bw; \ + } while (0) + +void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq, uint8_t *pac, bwtint_t l_pac, uint8_t *_mem) +{ + int i, matrix[25]; + bwtint_t k; + uint8_t *target = 0, *query; + AlnParam par; + + par.matrix = matrix; + __gen_ap(par, opt); + query = calloc(lq, 1); + // sort according to the descending order of query end + ks_introsort(hit, b->n, b->hits); + target = calloc(((lq + 1) / 2 * opt->a + opt->r) / opt->r + lq, 1); + // reverse _query + for (i = 0; i < lq; ++i) query[lq - i - 1] = _query[i]; + // core loop + for (i = 0; i < b->n; ++i) { + bsw2hit_t *p = b->hits + i; + int lt = ((p->beg + 1) / 2 * opt->a + opt->r) / opt->r + lq; + int score, j; + path_t path; + p->n_seeds = 1; + if (p->l || p->k == 0) continue; + for (j = score = 0; j < i; ++j) { + bsw2hit_t *q = b->hits + j; + if (q->beg <= p->beg && q->k <= p->k && q->k + q->len >= p->k + p->len) { + if (q->n_seeds < (1<<13) - 2) ++q->n_seeds; + ++score; + } + } + if (score) continue; + if (lt > p->k) lt = p->k; + for (k = p->k - 1, j = 0; k > 0 && j < lt; --k) // FIXME: k=0 not considered! 
+ target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3; + lt = j; + score = aln_extend_core(target, lt, query + lq - p->beg, p->beg, &par, &path, 0, p->G, _mem); + if (score > p->G) { // extensible + p->G = score; + p->len += path.i; + p->beg -= path.j; + p->k -= path.i; + } + } + free(query); free(target); +} + +void bsw2_extend_rght(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *query, int lq, uint8_t *pac, bwtint_t l_pac, uint8_t *_mem) +{ + int i, matrix[25]; + bwtint_t k; + uint8_t *target; + AlnParam par; + + par.matrix = matrix; + __gen_ap(par, opt); + target = calloc(((lq + 1) / 2 * opt->a + opt->r) / opt->r + lq, 1); + for (i = 0; i < b->n; ++i) { + bsw2hit_t *p = b->hits + i; + int lt = ((lq - p->beg + 1) / 2 * opt->a + opt->r) / opt->r + lq; + int j, score; + path_t path; + if (p->l) continue; + for (k = p->k, j = 0; k < p->k + lt && k < l_pac; ++k) + target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3; + lt = j; + score = aln_extend_core(target, lt, query + p->beg, lq - p->beg, &par, &path, 0, 1, _mem); +// if (score < p->G) fprintf(stderr, "[bsw2_extend_hits] %d < %d\n", score, p->G); + if (score >= p->G) { + p->G = score; + p->len = path.i; + p->end = path.j + p->beg; + } + } + free(target); +} + +/* generate CIGAR array(s) in b->cigar[] */ +static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], const uint8_t *pac, bwtsw2_t *b, const char *name) +{ + uint8_t *target; + int i, matrix[25]; + AlnParam par; + path_t *path; + + par.matrix = matrix; + __gen_ap(par, opt); + i = ((lq + 1) / 2 * opt->a + opt->r) / opt->r + lq; // maximum possible target length + target = calloc(i, 1); + path = calloc(i + lq, sizeof(path_t)); + // generate CIGAR + for (i = 0; i < b->n; ++i) { + bsw2hit_t *p = b->hits + i; + bsw2aux_t *q = b->aux + i; + uint8_t *query; + bwtint_t k; + int score, path_len, beg, end; + if (p->l) continue; + beg = (p->flag & 0x10)? lq - p->end : p->beg; + end = (p->flag & 0x10)? lq - p->beg : p->end; + query = seq[(p->flag & 0x10)? 
1 : 0] + beg; + for (k = p->k; k < p->k + p->len; ++k) // in principle, no out-of-boundary here + target[k - p->k] = pac[k>>2] >> (~k&3)*2 & 0x3; + score = aln_global_core(target, p->len, query, end - beg, &par, path, &path_len); + q->cigar = aln_path2cigar32(path, path_len, &q->n_cigar); +#if 0 + if (name && score != p->G) { // debugging only + int j, glen = 0; + for (j = 0; j < q->n_cigar; ++j) + if ((q->cigar[j]&0xf) == 1 || (q->cigar[j]&0xf) == 2) + glen += q->cigar[j]>>4; + fprintf(stderr, "[E::%s] %s - unequal score: %d != %d; (qlen, aqlen, arlen, glen, bw) = (%d, %d, %d, %d, %d)\n", + __func__, name, score, p->G, lq, end - beg, p->len, glen, opt->bw); + } +#endif + if (beg != 0 || end < lq) { // write soft clipping + q->cigar = realloc(q->cigar, 4 * (q->n_cigar + 2)); + if (beg != 0) { + memmove(q->cigar + 1, q->cigar, q->n_cigar * 4); + q->cigar[0] = beg<<4 | 4; + ++q->n_cigar; + } + if (end < lq) { + q->cigar[q->n_cigar] = (lq - end)<<4 | 4; + ++q->n_cigar; + } + } + } + free(target); free(path); +} + +/* this is for the debugging purpose only */ +void bsw2_debug_hits(const bwtsw2_t *b) +{ + int i; + printf("# raw hits: %d\n", b->n); + for (i = 0; i < b->n; ++i) { + bsw2hit_t *p = b->hits + i; + if (p->G > 0) + printf("G=%d, len=%d, [%d,%d), k=%lu, l=%lu, #seeds=%d, is_rev=%d\n", p->G, p->len, p->beg, p->end, (long)p->k, (long)p->l, p->n_seeds, p->is_rev); + } +} + +static void merge_hits(bwtsw2_t *b[2], int l, int is_reverse) +{ + int i; + if (b[0]->n + b[1]->n > b[0]->max) { + b[0]->max = b[0]->n + b[1]->n; + b[0]->hits = realloc(b[0]->hits, b[0]->max * sizeof(bsw2hit_t)); + } + for (i = 0; i < b[1]->n; ++i) { + bsw2hit_t *p = b[0]->hits + b[0]->n + i; + *p = b[1]->hits[i]; + if (is_reverse) { + int x = p->beg; + p->beg = l - p->end; + p->end = l - x; + p->flag |= 0x10; + } + } + b[0]->n += b[1]->n; + bsw2_destroy(b[1]); + b[1] = 0; +} +/* seq[0] is the forward sequence and seq[1] is the reverse complement. 
*/ +static bwtsw2_t *bsw2_aln1_core(const bsw2opt_t *opt, const bntseq_t *bns, uint8_t *pac, const bwt_t *target, + int l, uint8_t *seq[2], bsw2global_t *pool) +{ + extern void bsw2_chain_filter(const bsw2opt_t *opt, int len, bwtsw2_t *b[2]); + bwtsw2_t *b[2], **bb[2], **_b, *p; + int k, j; + bwtl_t *query; + query = bwtl_seq2bwtl(l, seq[0]); + _b = bsw2_core(bns, opt, query, target, pool); + bwtl_destroy(query); + for (k = 0; k < 2; ++k) { + bb[k] = calloc(2, sizeof(void*)); + bb[k][0] = calloc(1, sizeof(bwtsw2_t)); + bb[k][1] = calloc(1, sizeof(bwtsw2_t)); + } + for (k = 0; k < 2; ++k) { // separate _b into bb[2] based on the strand + for (j = 0; j < _b[k]->n; ++j) { + bsw2hit_t *q; + p = bb[_b[k]->hits[j].is_rev][k]; + if (p->n == p->max) { + p->max = p->max? p->max<<1 : 8; + p->hits = realloc(p->hits, p->max * sizeof(bsw2hit_t)); + } + q = &p->hits[p->n++]; + *q = _b[k]->hits[j]; + if (_b[k]->hits[j].is_rev) { + int x = q->beg; + q->beg = l - q->end; + q->end = l - x; + } + } + } + b[0] = bb[0][1]; b[1] = bb[1][1]; // bb[*][1] are "narrow SA hits" + bsw2_chain_filter(opt, l, b); + for (k = 0; k < 2; ++k) { + bsw2_extend_left(opt, bb[k][1], seq[k], l, pac, bns->l_pac, pool->aln_mem); + merge_hits(bb[k], l, 0); // bb[k][1] is merged to bb[k][0] here + bsw2_resolve_duphits(0, 0, bb[k][0], 0); + bsw2_extend_rght(opt, bb[k][0], seq[k], l, pac, bns->l_pac, pool->aln_mem); + b[k] = bb[k][0]; + free(bb[k]); + } + merge_hits(b, l, 1); // again, b[1] is merged to b[0] + bsw2_resolve_query_overlaps(b[0], opt->mask_level); + bsw2_destroy(_b[0]); bsw2_destroy(_b[1]); free(_b); + return b[0]; +} + +/* set ->flag to records the origin of the hit (to forward bwt or reverse bwt) */ +static void flag_fr(bwtsw2_t *b[2]) +{ + int i, j; + for (i = 0; i < b[0]->n; ++i) { + bsw2hit_t *p = b[0]->hits + i; + p->flag |= 0x10000; + } + for (i = 0; i < b[1]->n; ++i) { + bsw2hit_t *p = b[1]->hits + i; + p->flag |= 0x20000; + } + for (i = 0; i < b[0]->n; ++i) { + bsw2hit_t *p = b[0]->hits + 
i; + for (j = 0; j < b[1]->n; ++j) { + bsw2hit_t *q = b[1]->hits + j; + if (q->beg == p->beg && q->end == p->end && q->k == p->k && q->len == p->len && q->G == p->G) { + q->flag |= 0x30000; p->flag |= 0x30000; + break; + } + } + } +} + +typedef struct { + int n, max; + bsw2seq1_t *seq; +} bsw2seq_t; + +static int fix_cigar(const bntseq_t *bns, bsw2hit_t *p, int n_cigar, uint32_t *cigar) +{ + // FIXME: this routine does not work if the query bridge three reference sequences + int32_t coor, refl, lq; + int x, y, i, seqid; + bns_cnt_ambi(bns, p->k, p->len, &seqid); + coor = p->k - bns->anns[seqid].offset; + refl = bns->anns[seqid].len; + x = coor; y = 0; + // test if the alignment goes beyond the boundary + for (i = 0; i < n_cigar; ++i) { + int op = cigar[i]&0xf, ln = cigar[i]>>4; + if (op == 1 || op == 4 || op == 5) y += ln; + else if (op == 2) x += ln; + else x += ln, y += ln; + } + lq = y; // length of the query sequence + if (x > refl) { // then fix it + int j, nc, mq[2], nlen[2]; + uint32_t *cn; + bwtint_t kk = 0; + nc = mq[0] = mq[1] = nlen[0] = nlen[1] = 0; + cn = calloc(n_cigar + 3, 4); + x = coor; y = 0; + for (i = j = 0; i < n_cigar; ++i) { + int op = cigar[i]&0xf, ln = cigar[i]>>4; + if (op == 4 || op == 5 || op == 1) { // ins or clipping + y += ln; + cn[j++] = cigar[i]; + } else if (op == 2) { // del + if (x + ln >= refl && nc == 0) { + cn[j++] = (uint32_t)(lq - y)<<4 | 4; + nc = j; + cn[j++] = (uint32_t)y<<4 | 4; + kk = p->k + (x + ln - refl); + nlen[0] = x - coor; + nlen[1] = p->len - nlen[0] - ln; + } else cn[j++] = cigar[i]; + x += ln; + } else if (op == 0) { // match + if (x + ln >= refl && nc == 0) { + // FIXME: not consider a special case where a split right between M and I + cn[j++] = (uint32_t)(refl - x)<<4 | 0; // write M + cn[j++] = (uint32_t)(lq - y - (refl - x))<<4 | 4; // write S + nc = j; + mq[0] += refl - x; + cn[j++] = (uint32_t)(y + (refl - x))<<4 | 4; + if (x + ln - refl) cn[j++] = (uint32_t)(x + ln - refl)<<4 | 0; + mq[1] += x + ln - 
refl; + kk = bns->anns[seqid].offset + refl; + nlen[0] = refl - coor; + nlen[1] = p->len - nlen[0]; + } else { + cn[j++] = cigar[i]; + mq[nc?1:0] += ln; + } + x += ln; y += ln; + } + } + if (mq[0] > mq[1]) { // then take the first alignment + n_cigar = nc; + memcpy(cigar, cn, 4 * nc); + p->len = nlen[0]; + } else { + p->k = kk; p->len = nlen[1]; + n_cigar = j - nc; + memcpy(cigar, cn + nc, 4 * (j - nc)); + } + free(cn); + } + return n_cigar; +} + +static int compute_nm(bsw2hit_t *p, int n_cigar, const uint32_t *cigar, const uint8_t *pac, const uint8_t *seq) +{ + int k, x, n_mm = 0, i, n_gap = 0; + bwtint_t y; + x = 0; y = p->k; + for (k = 0; k < n_cigar; ++k) { + int op = cigar[k]&0xf; + int len = cigar[k]>>4; + if (op == 0) { // match + for (i = 0; i < len; ++i) { + int ref = pac[(y+i)>>2] >> (~(y+i)&3)*2 & 0x3; + if (seq[x + i] != ref) ++n_mm; + } + x += len; y += len; + } else if (op == 1) x += len, n_gap += len; + else if (op == 2) y += len, n_gap += len; + else if (op == 4) x += len; + } + return n_mm + n_gap; +} + +static void write_aux(const bsw2opt_t *opt, const bntseq_t *bns, int qlen, uint8_t *seq[2], const uint8_t *pac, bwtsw2_t *b, const char *name) +{ + int i; + // allocate for b->aux + if (b->n<<1 < b->max) { + b->max = b->n; + kroundup32(b->max); + b->hits = realloc(b->hits, b->max * sizeof(bsw2hit_t)); + } + b->aux = calloc(b->n, sizeof(bsw2aux_t)); + // generate CIGAR + gen_cigar(opt, qlen, seq, pac, b, name); + // fix CIGAR, generate mapQ, and write chromosomal position + for (i = 0; i < b->n; ++i) { + bsw2hit_t *p = &b->hits[i]; + bsw2aux_t *q = &b->aux[i]; + q->flag = p->flag & 0xfe; + q->isize = 0; + if (p->l == 0) { // unique hit + float c = 1.0; + int subo; + // fix out-of-boundary CIGAR + q->n_cigar = fix_cigar(bns, p, q->n_cigar, q->cigar); + // compute the NM tag + q->nm = compute_nm(p, q->n_cigar, q->cigar, pac, seq[p->is_rev]); + // compute mapQ + subo = p->G2 > opt->t? 
p->G2 : opt->t; + if (p->flag>>16 == 1 || p->flag>>16 == 2) c *= .5; + if (p->n_seeds < 2) c *= .2; + q->qual = (int)(c * (p->G - subo) * (250.0 / p->G + 0.03 / opt->a) + .499); + if (q->qual > 250) q->qual = 250; + if (q->qual < 0) q->qual = 0; + if (p->flag&1) q->qual = 0; // this is a random hit + q->pqual = q->qual; // set the paired qual as qual + // get the chromosomal position + q->nn = bns_cnt_ambi(bns, p->k, p->len, &q->chr); + q->pos = p->k - bns->anns[q->chr].offset; + } else q->qual = 0, q->n_cigar = 0, q->chr = q->pos = -1, q->nn = 0; + } +} + +static void update_mate_aux(bwtsw2_t *b, const bwtsw2_t *m) +{ + int i; + if (m == 0) return; + // update flag, mchr and mpos + for (i = 0; i < b->n; ++i) { + bsw2aux_t *q = &b->aux[i]; + q->flag |= 1; // paired + if (m->n == 0) q->flag |= 8; // mate unmapped + if (m->n == 1) { + q->mchr = m->aux[0].chr; + q->mpos = m->aux[0].pos; + if (m->aux[0].flag&0x10) q->flag |= 0x20; // mate reverse strand + if (q->chr == q->mchr) { // set insert size + if (q->mpos + m->hits[0].len > q->pos) + q->isize = q->mpos + m->hits[0].len - q->pos; + else q->isize = q->mpos - q->pos - b->hits[0].len; + } else q->isize = 0; + } else q->mchr = q->mpos = -1; + } + // update mapping quality + if (b->n == 1 && m->n == 1) { + bsw2hit_t *p = &b->hits[0]; + if (p->flag & BSW2_FLAG_MATESW) { // this alignment is found by Smith-Waterman + if (!(p->flag & BSW2_FLAG_TANDEM) && b->aux[0].pqual < 20) + b->aux[0].pqual = 20; + if (b->aux[0].pqual >= m->aux[0].qual) b->aux[0].pqual = m->aux[0].qual; + } else if ((p->flag & 2) && !(m->hits[0].flag & BSW2_FLAG_MATESW)) { // properly paired + if (!(p->flag & BSW2_FLAG_TANDEM)) { // pqual is bounded by [b->aux[0].qual,m->aux[0].qual] + b->aux[0].pqual += 20; + if (b->aux[0].pqual > m->aux[0].qual) b->aux[0].pqual = m->aux[0].qual; + if (b->aux[0].pqual < b->aux[0].qual) b->aux[0].pqual = b->aux[0].qual; + } + } + } +} + +/* generate SAM lines for a sequence in ks with alignment stored in + * b. 
ks->name and ks->seq will be freed and set to NULL in the end. */ +static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks, bwtsw2_t *b, int is_pe, bwtsw2_t *bmate) +{ + int i, k; + kstring_t str; + memset(&str, 0, sizeof(kstring_t)); + if (b == 0 || b->n == 0) { // no hits + ksprintf(&str, "%s\t4\t*\t0\t0\t*\t*\t0\t0\t", ks->name); + for (i = 0; i < ks->l; ++i) kputc(ks->seq[i], &str); + if (ks->qual) { + kputc('\t', &str); + for (i = 0; i < ks->l; ++i) kputc(ks->qual[i], &str); + } else kputs("\t*", &str); + kputc('\n', &str); + } + for (i = 0; b && i < b->n; ++i) { + bsw2hit_t *p = b->hits + i; + bsw2aux_t *q = b->aux + i; + int j, beg, end, type = 0; + // print mandatory fields before SEQ + ksprintf(&str, "%s\t%d", ks->name, q->flag | (opt->multi_2nd && i? 0x100 : 0)); + ksprintf(&str, "\t%s\t%ld", q->chr>=0? bns->anns[q->chr].name : "*", (long)q->pos + 1); + if (p->l == 0) { // not a repetitive hit + ksprintf(&str, "\t%d\t", q->pqual); + for (k = 0; k < q->n_cigar; ++k) + ksprintf(&str, "%d%c", q->cigar[k]>>4, (opt->hard_clip? "MIDNHHP" : "MIDNSHP")[q->cigar[k]&0xf]); + } else ksprintf(&str, "\t0\t*"); + if (!is_pe) kputs("\t*\t0\t0\t", &str); + else ksprintf(&str, "\t%s\t%d\t%d\t", q->mchr==q->chr? "=" : (q->mchr<0? 
"*" : bns->anns[q->mchr].name), q->mpos+1, q->isize); + // get the sequence begin and end + beg = 0; end = ks->l; + if (opt->hard_clip) { + if ((q->cigar[0]&0xf) == 4) beg += q->cigar[0]>>4; + if ((q->cigar[q->n_cigar-1]&0xf) == 4) end -= q->cigar[q->n_cigar-1]>>4; + } + for (j = beg; j < end; ++j) { + if (p->flag&0x10) kputc(nt_comp_table[(int)ks->seq[ks->l - 1 - j]], &str); + else kputc(ks->seq[j], &str); + } + // print base quality if present + if (ks->qual) { + kputc('\t', &str); + for (j = beg; j < end; ++j) { + if (p->flag&0x10) kputc(ks->qual[ks->l - 1 - j], &str); + else kputc(ks->qual[j], &str); + } + } else ksprintf(&str, "\t*"); + // print optional tags + ksprintf(&str, "\tAS:i:%d\tXS:i:%d\tXF:i:%d\tXE:i:%d\tNM:i:%d", p->G, p->G2, p->flag>>16, p->n_seeds, q->nm); + if (q->nn) ksprintf(&str, "\tXN:i:%d", q->nn); + if (p->l) ksprintf(&str, "\tXI:i:%d", p->l - p->k + 1); + if (p->flag&BSW2_FLAG_MATESW) type |= 1; + if (p->flag&BSW2_FLAG_TANDEM) type |= 2; + if (type) ksprintf(&str, "\tXT:i:%d", type); + kputc('\n', &str); + } + ks->sam = str.s; + free(ks->seq); ks->seq = 0; + free(ks->qual); ks->qual = 0; + free(ks->name); ks->name = 0; +} + +static void update_opt(bsw2opt_t *dst, const bsw2opt_t *src, int qlen) +{ + double ll = log(qlen); + int i, k; + *dst = *src; + if (dst->t < ll * dst->coef) dst->t = (int)(ll * dst->coef + .499); + // set band width: the query length sets a boundary on the maximum band width + k = (qlen * dst->a - 2 * dst->q) / (2 * dst->r + dst->a); + i = (qlen * dst->a - dst->a - dst->t) / dst->r; + if (k > i) k = i; + if (k < 1) k = 1; // I do not know if k==0 causes troubles + dst->bw = src->bw < k? src->bw : k; +} + +/* Core routine to align reads in _seq. 
It is separated from + * process_seqs() to realize multi-threading */ +static void bsw2_aln_core(bsw2seq_t *_seq, const bsw2opt_t *_opt, const bntseq_t *bns, uint8_t *pac, const bwt_t *target, int is_pe) +{ + int x; + bsw2opt_t opt; + bsw2global_t *pool = bsw2_global_init(); + bwtsw2_t **buf; + buf = calloc(_seq->n, sizeof(void*)); + for (x = 0; x < _seq->n; ++x) { + bsw2seq1_t *p = _seq->seq + x; + uint8_t *seq[2], *rseq[2]; + int i, l, k; + bwtsw2_t *b[2]; + l = p->l; + update_opt(&opt, _opt, p->l); + if (pool->max_l < l) { // then enlarge working space for aln_extend_core() + int tmp = ((l + 1) / 2 * opt.a + opt.r) / opt.r + l; + pool->max_l = l; + pool->aln_mem = realloc(pool->aln_mem, (tmp + 2) * 24); + } + // set seq[2] and rseq[2] + seq[0] = calloc(l * 4, 1); + seq[1] = seq[0] + l; + rseq[0] = seq[1] + l; rseq[1] = rseq[0] + l; + // convert sequences to 2-bit representation + for (i = k = 0; i < l; ++i) { + int c = nst_nt4_table[(int)p->seq[i]]; + if (c >= 4) { c = (int)(drand48() * 4); ++k; } // FIXME: ambiguous bases are not properly handled + seq[0][i] = c; + seq[1][l-1-i] = 3 - c; + rseq[0][l-1-i] = 3 - c; + rseq[1][i] = c; + } + if (l - k < opt.t) { // too few unambiguous bases + buf[x] = calloc(1, sizeof(bwtsw2_t)); + free(seq[0]); continue; + } + // alignment + b[0] = bsw2_aln1_core(&opt, bns, pac, target, l, seq, pool); + for (k = 0; k < b[0]->n; ++k) + if (b[0]->hits[k].n_seeds < opt.t_seeds) break; + if (k < b[0]->n) { + b[1] = bsw2_aln1_core(&opt, bns, pac, target, l, rseq, pool); + for (i = 0; i < b[1]->n; ++i) { + bsw2hit_t *p = &b[1]->hits[i]; + int x = p->beg; + p->flag ^= 0x10, p->is_rev ^= 1; // flip the strand + p->beg = l - p->end; + p->end = l - x; + } + flag_fr(b); + merge_hits(b, l, 0); + bsw2_resolve_duphits(0, 0, b[0], 0); + bsw2_resolve_query_overlaps(b[0], opt.mask_level); + } else b[1] = 0; + // generate CIGAR and print SAM + buf[x] = bsw2_dup_no_cigar(b[0]); + // free + free(seq[0]); + bsw2_destroy(b[0]); + } + if (is_pe) 
bsw2_pair(&opt, bns->l_pac, pac, _seq->n, _seq->seq, buf); + for (x = 0; x < _seq->n; ++x) { + bsw2seq1_t *p = _seq->seq + x; + uint8_t *seq[2]; + int i; + seq[0] = malloc(p->l * 2); seq[1] = seq[0] + p->l; + for (i = 0; i < p->l; ++i) { + int c = nst_nt4_table[(int)p->seq[i]]; + if (c >= 4) c = (int)(drand48() * 4); + seq[0][i] = c; + seq[1][p->l-1-i] = 3 - c; + } + update_opt(&opt, _opt, p->l); + write_aux(&opt, bns, p->l, seq, pac, buf[x], _seq->seq[x].name); + free(seq[0]); + } + for (x = 0; x < _seq->n; ++x) { + if (is_pe) update_mate_aux(buf[x], buf[x^1]); + print_hits(bns, &opt, &_seq->seq[x], buf[x], is_pe, buf[x^1]); + } + for (x = 0; x < _seq->n; ++x) bsw2_destroy(buf[x]); + free(buf); + bsw2_global_destroy(pool); +} + +#ifdef HAVE_PTHREAD +typedef struct { + int tid, is_pe; + bsw2seq_t *_seq; + const bsw2opt_t *_opt; + const bntseq_t *bns; + uint8_t *pac; + const bwt_t *target; +} thread_aux_t; + +/* another interface to bsw2_aln_core() to facilitate pthread_create() */ +static void *worker(void *data) +{ + thread_aux_t *p = (thread_aux_t*)data; + bsw2_aln_core(p->_seq, p->_opt, p->bns, p->pac, p->target, p->is_pe); + return 0; +} +#endif + +/* process sequences stored in _seq, generate SAM lines for these + * sequences and reset _seq afterwards. */ +static void process_seqs(bsw2seq_t *_seq, const bsw2opt_t *opt, const bntseq_t *bns, uint8_t *pac, const bwt_t *target, int is_pe) +{ + int i; + is_pe = is_pe? 
1 : 0; + +#ifdef HAVE_PTHREAD + if (opt->n_threads <= 1) { + bsw2_aln_core(_seq, opt, bns, pac, target, is_pe); + } else { + pthread_t *tid; + pthread_attr_t attr; + thread_aux_t *data; + int j; + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + data = (thread_aux_t*)calloc(opt->n_threads, sizeof(thread_aux_t)); + tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t)); + for (j = 0; j < opt->n_threads; ++j) { + thread_aux_t *p = data + j; + p->tid = j; p->_opt = opt; p->bns = bns; p->is_pe = is_pe; + p->pac = pac; p->target = target; + p->_seq = calloc(1, sizeof(bsw2seq_t)); + p->_seq->max = (_seq->n + opt->n_threads - 1) / opt->n_threads + 1; + p->_seq->n = 0; + p->_seq->seq = calloc(p->_seq->max, sizeof(bsw2seq1_t)); + } + for (i = 0; i < _seq->n; ++i) { // assign sequences to each thread + bsw2seq_t *p = data[(i>>is_pe)%opt->n_threads]._seq; + p->seq[p->n++] = _seq->seq[i]; + } + for (j = 0; j < opt->n_threads; ++j) pthread_create(&tid[j], &attr, worker, &data[j]); + for (j = 0; j < opt->n_threads; ++j) pthread_join(tid[j], 0); + for (j = 0; j < opt->n_threads; ++j) data[j]._seq->n = 0; + for (i = 0; i < _seq->n; ++i) { // copy the result from each thread back + bsw2seq_t *p = data[(i>>is_pe)%opt->n_threads]._seq; + _seq->seq[i] = p->seq[p->n++]; + } + for (j = 0; j < opt->n_threads; ++j) { + thread_aux_t *p = data + j; + free(p->_seq->seq); + free(p->_seq); + } + free(data); free(tid); + } +#else + bsw2_aln_core(_seq, opt, bns, pac, target, is_pe); +#endif + + // print and reset + for (i = 0; i < _seq->n; ++i) { + bsw2seq1_t *p = _seq->seq + i; + if (p->sam) printf("%s", p->sam); + free(p->name); free(p->seq); free(p->qual); free(p->sam); + p->tid = -1; p->l = 0; + p->name = p->seq = p->qual = p->sam = 0; + } + fflush(stdout); + _seq->n = 0; +} + +static void kseq_to_bsw2seq(const kseq_t *ks, bsw2seq1_t *p) +{ + p->tid = -1; + p->l = ks->seq.l; + p->name = strdup(ks->name.s); + p->seq = strdup(ks->seq.s); + p->qual 
= ks->qual.l? strdup(ks->qual.s) : 0; + p->sam = 0; +} + +void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, const char *fn, const char *fn2) +{ + gzFile fp, fp2; + kseq_t *ks, *ks2; + int l, size = 0, is_pe = 0; + uint8_t *pac; + bsw2seq_t *_seq; + + pac = calloc(bns->l_pac/4+1, 1); + if (pac == 0) { + fprintf(stderr, "[bsw2_aln] insufficient memory!\n"); + return; + } + for (l = 0; l < bns->n_seqs; ++l) + printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[l].name, bns->anns[l].len); + fread(pac, 1, bns->l_pac/4+1, bns->fp_pac); + fp = xzopen(fn, "r"); + ks = kseq_init(fp); + _seq = calloc(1, sizeof(bsw2seq_t)); + if (fn2) { + fp2 = xzopen(fn2, "r"); + ks2 = kseq_init(fp2); + is_pe = 1; + } else fp2 = 0, ks2 = 0, is_pe = 0; + while (kseq_read(ks) >= 0) { + if (ks->name.l > 2 && ks->name.s[ks->name.l-2] == '/') + ks->name.l -= 2, ks->name.s[ks->name.l] = 0; + if (_seq->n == _seq->max) { + _seq->max = _seq->max? _seq->max<<1 : 1024; + _seq->seq = realloc(_seq->seq, _seq->max * sizeof(bsw2seq1_t)); + } + kseq_to_bsw2seq(ks, &_seq->seq[_seq->n++]); + size += ks->seq.l; + if (ks2) { + if (kseq_read(ks2) >= 0) { + if (ks2->name.l > 2 && ks2->name.s[ks2->name.l-2] == '/') + ks2->name.l -= 2, ks2->name.s[ks2->name.l] = 0; + kseq_to_bsw2seq(ks2, &_seq->seq[_seq->n++]); // for PE, _seq->n here must be odd and we do not need to enlarge + size += ks->seq.l; + } else { + fprintf(stderr, "[%s] The second query file has fewer reads. 
Switched to the single-end mode for the following batches.\n", __func__); + is_pe = 0; + } + } + if (size > opt->chunk_size * opt->n_threads) { + fprintf(stderr, "[bsw2_aln] read %d sequences/pairs (%d bp)...\n", _seq->n, size); + process_seqs(_seq, opt, bns, pac, target, is_pe); + size = 0; + } + } + fprintf(stderr, "[bsw2_aln] read %d sequences/pairs (%d bp)...\n", _seq->n, size); + process_seqs(_seq, opt, bns, pac, target, is_pe); + // free + free(pac); + free(_seq->seq); free(_seq); + kseq_destroy(ks); + gzclose(fp); + if (fn2) { + kseq_destroy(ks2); + gzclose(fp2); + } +} diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/bwtsw2_chain.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/bwtsw2_chain.c Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,107 @@ +#include +#include "bwtsw2.h" + +typedef struct { + uint32_t tbeg, tend; + int qbeg, qend; + uint32_t flag:1, idx:31; + int chain; // also reuse as a counter +} hsaip_t; + +#define _hsaip_lt(a, b) ((a).qbeg < (b).qbeg) + +#include "ksort.h" +KSORT_INIT(hsaip, hsaip_t, _hsaip_lt) + +static int chaining(const bsw2opt_t *opt, int shift, int n, hsaip_t *z, hsaip_t *chain) +{ + int j, k, m = 0; + ks_introsort(hsaip, n, z); + for (j = 0; j < n; ++j) { + hsaip_t *p = z + j; + for (k = m - 1; k >= 0; --k) { + hsaip_t *q = chain + k; + int x = p->qbeg - q->qbeg; // always positive + int y = p->tbeg - q->tbeg; + if (y > 0 && x - y <= opt->bw && y - x <= opt->bw) { + if (p->qend > q->qend) q->qend = p->qend; + if (p->tend > q->tend) q->tend = p->tend; + ++q->chain; + p->chain = shift + k; + break; + } + } + if (k < 0) { + chain[m] = *p; + chain[m].chain = 1; + chain[m].idx = p->chain = shift + m; + ++m; + } + } + return m; +} + +void bsw2_chain_filter(const bsw2opt_t *opt, int len, bwtsw2_t *b[2]) +{ + hsaip_t *z[2], *chain[2]; + int i, j, k, n[2], m[2]; + char *flag; + // initialization + n[0] = b[0]->n; n[1] = b[1]->n; + z[0] = calloc(n[0] + n[1], sizeof(hsaip_t)); + z[1] = z[0] + n[0]; + chain[0] = calloc(n[0] + 
n[1], sizeof(hsaip_t)); + for (k = j = 0; k < 2; ++k) { + for (i = 0; i < b[k]->n; ++i) { + bsw2hit_t *p = b[k]->hits + i; + hsaip_t *q = z[k] + i; + q->flag = k; q->idx = i; + q->tbeg = p->k; q->tend = p->k + p->len; + q->chain = -1; + q->qbeg = p->beg; q->qend = p->end; + } + } + // chaining + m[0] = chaining(opt, 0, n[0], z[0], chain[0]); + chain[1] = chain[0] + m[0]; + m[1] = chaining(opt, m[0], n[1], z[1], chain[1]); + // change query coordinate on the reverse strand + for (k = 0; k < m[1]; ++k) { + hsaip_t *p = chain[1] + k; + int tmp = p->qbeg; + p->qbeg = len - p->qend; p->qend = len - tmp; + } + // filtering + flag = calloc(m[0] + m[1], 1); + ks_introsort(hsaip, m[0] + m[1], chain[0]); + for (k = 1; k < m[0] + m[1]; ++k) { + hsaip_t *p = chain[0] + k; + for (j = 0; j < k; ++j) { + hsaip_t *q = chain[0] + j; + if (flag[q->idx]) continue; + if (q->qend >= p->qend && q->chain > p->chain * opt->t_seeds * 2) { + flag[p->idx] = 1; + break; + } + } + } + for (k = 0; k < n[0] + n[1]; ++k) { + hsaip_t *p = z[0] + k; + if (flag[p->chain]) + b[p->flag]->hits[p->idx].G = 0; + } + free(flag); + // squeeze out filtered elements in b[2] + for (k = 0; k < 2; ++k) { + for (j = i = 0; j < n[k]; ++j) { + bsw2hit_t *p = b[k]->hits + j; + if (p->G) { + if (i != j) b[k]->hits[i++] = *p; + else ++i; + } + } + b[k]->n = i; + } + // free + free(z[0]); free(chain[0]); +} diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/bwtsw2_core.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/bwtsw2_core.c Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,615 @@ +#include +#include +#include +#include +#include +#include "bwt_lite.h" +#include "bwtsw2.h" +#include "bwt.h" +#include "kvec.h" + +typedef struct { + bwtint_t k, l; +} qintv_t; + +#define qintv_eq(a, b) ((a).k == (b).k && (a).l == (b).l) +#define qintv_hash(a) ((a).k>>7^(a).l<<17) + +#include "khash.h" +KHASH_INIT(qintv, qintv_t, uint64_t, 1, qintv_hash, qintv_eq) +KHASH_MAP_INIT_INT64(64, uint64_t) + +#define MINUS_INF 
-0x3fffffff +#define MASK_LEVEL 0.90f + +struct __mempool_t; +static void mp_destroy(struct __mempool_t*); +typedef struct { + bwtint_t qk, ql; + int I, D, G; + uint32_t pj:2, qlen:30; + int tlen; + int ppos, upos; + int cpos[4]; +} bsw2cell_t; + +#include "ksort.h" +KSORT_INIT_GENERIC(int) +#define __hitG_lt(a, b) (((a).G + ((int)(a).n_seeds<<2)) > (b).G + ((int)(b).n_seeds<<2)) +KSORT_INIT(hitG, bsw2hit_t, __hitG_lt) + +static const bsw2cell_t g_default_cell = { 0, 0, MINUS_INF, MINUS_INF, MINUS_INF, 0, 0, 0, -1, -1, {-1, -1, -1, -1} }; + +typedef struct { + int n, max; + uint32_t tk, tl; // this is fine + bsw2cell_t *array; +} bsw2entry_t, *bsw2entry_p; + +/* --- BEGIN: Stack operations --- */ +typedef struct { + int n_pending; + kvec_t(bsw2entry_p) stack0, pending; + struct __mempool_t *pool; +} bsw2stack_t; + +#define stack_isempty(s) (kv_size(s->stack0) == 0 && s->n_pending == 0) +static void stack_destroy(bsw2stack_t *s) { mp_destroy(s->pool); kv_destroy(s->stack0); kv_destroy(s->pending); free(s); } +inline static void stack_push0(bsw2stack_t *s, bsw2entry_p e) { kv_push(bsw2entry_p, s->stack0, e); } +inline static bsw2entry_p stack_pop(bsw2stack_t *s) +{ + assert(!(kv_size(s->stack0) == 0 && s->n_pending != 0)); + return kv_pop(s->stack0); +} +/* --- END: Stack operations --- */ + +/* --- BEGIN: memory pool --- */ +typedef struct __mempool_t { + int cnt; // if cnt!=0, then there must be memory leak + kvec_t(bsw2entry_p) pool; +} mempool_t; +inline static bsw2entry_p mp_alloc(mempool_t *mp) +{ + ++mp->cnt; + if (kv_size(mp->pool) == 0) return (bsw2entry_t*)calloc(1, sizeof(bsw2entry_t)); + else return kv_pop(mp->pool); +} +inline static void mp_free(mempool_t *mp, bsw2entry_p e) +{ + --mp->cnt; e->n = 0; + kv_push(bsw2entry_p, mp->pool, e); +} +static void mp_destroy(struct __mempool_t *mp) +{ + int i; + for (i = 0; i != kv_size(mp->pool); ++i) { + free(kv_A(mp->pool, i)->array); + free(kv_A(mp->pool, i)); + } + kv_destroy(mp->pool); + free(mp); +} +/* --- 
END: memory pool --- */ + +/* --- BEGIN: utilities --- */ +static khash_t(64) *bsw2_connectivity(const bwtl_t *b) +{ + khash_t(64) *h; + uint32_t k, l, cntk[4], cntl[4]; // this is fine + uint64_t x; + khiter_t iter; + int j, ret; + kvec_t(uint64_t) stack; + + kv_init(stack); + h = kh_init(64); + kh_resize(64, h, b->seq_len * 4); + x = b->seq_len; + kv_push(uint64_t, stack, x); + while (kv_size(stack)) { + x = kv_pop(stack); + k = x>>32; l = (uint32_t)x; + bwtl_2occ4(b, k-1, l, cntk, cntl); + for (j = 0; j != 4; ++j) { + k = b->L2[j] + cntk[j] + 1; + l = b->L2[j] + cntl[j]; + if (k > l) continue; + x = (uint64_t)k << 32 | l; + iter = kh_put(64, h, x, &ret); + if (ret) { // if not present + kh_value(h, iter) = 1; + kv_push(uint64_t, stack, x); + } else ++kh_value(h, iter); + } + } + kv_destroy(stack); + //fprintf(stderr, "[bsw2_connectivity] %u nodes in the DAG\n", kh_size(h)); + return h; +} +// pick up top T matches at a node +static void cut_tail(bsw2entry_t *u, int T, bsw2entry_t *aux) +{ + int i, *a, n, x; + if (u->n <= T) return; + if (aux->max < u->n) { + aux->max = u->n; + aux->array = (bsw2cell_t*)realloc(aux->array, aux->max * sizeof(bsw2cell_t)); + } + a = (int*)aux->array; + for (i = n = 0; i != u->n; ++i) + if (u->array[i].ql && u->array[i].G > 0) + a[n++] = -u->array[i].G; + if (n <= T) return; + x = -ks_ksmall(int, n, a, T); + n = 0; + for (i = 0; i < u->n; ++i) { + bsw2cell_t *p = u->array + i; + if (p->G == x) ++n; + if (p->G < x || (p->G == x && n >= T)) { + p->qk = p->ql = 0; p->G = 0; + if (p->ppos >= 0) u->array[p->ppos].cpos[p->pj] = -1; + } + } +} +// remove duplicated cells +static inline void remove_duplicate(bsw2entry_t *u, khash_t(qintv) *hash) +{ + int i, ret, j; + khiter_t k; + qintv_t key; + kh_clear(qintv, hash); + for (i = 0; i != u->n; ++i) { + bsw2cell_t *p = u->array + i; + if (p->ql == 0) continue; + key.k = p->qk; key.l = p->ql; + k = kh_put(qintv, hash, key, &ret); + j = -1; + if (ret == 0) { + if ((uint32_t)kh_value(hash, k) >= 
p->G) j = i; + else { + j = kh_value(hash, k)>>32; + kh_value(hash, k) = (uint64_t)i<<32 | p->G; + } + } else kh_value(hash, k) = (uint64_t)i<<32 | p->G; + if (j >= 0) { + p = u->array + j; + p->qk = p->ql = 0; p->G = 0; + if (p->ppos >= 0) u->array[p->ppos].cpos[p->pj] = -3; + } + } +} +// merge two entries +static void merge_entry(const bsw2opt_t * __restrict opt, bsw2entry_t *u, bsw2entry_t *v, bwtsw2_t *b) +{ + int i; + if (u->n + v->n >= u->max) { + u->max = u->n + v->n; + u->array = (bsw2cell_t*)realloc(u->array, u->max * sizeof(bsw2cell_t)); + } + for (i = 0; i != v->n; ++i) { + bsw2cell_t *p = v->array + i; + if (p->ppos >= 0) p->ppos += u->n; + if (p->cpos[0] >= 0) p->cpos[0] += u->n; + if (p->cpos[1] >= 0) p->cpos[1] += u->n; + if (p->cpos[2] >= 0) p->cpos[2] += u->n; + if (p->cpos[3] >= 0) p->cpos[3] += u->n; + } + memcpy(u->array + u->n, v->array, v->n * sizeof(bsw2cell_t)); + u->n += v->n; +} + +static inline bsw2cell_t *push_array_p(bsw2entry_t *e) +{ + if (e->n == e->max) { + e->max = e->max? 
e->max<<1 : 256; + e->array = (bsw2cell_t*)realloc(e->array, sizeof(bsw2cell_t) * e->max); + } + return e->array + e->n; +} + +static inline double time_elapse(const struct rusage *curr, const struct rusage *last) +{ + long t1 = (curr->ru_utime.tv_sec - last->ru_utime.tv_sec) + (curr->ru_stime.tv_sec - last->ru_stime.tv_sec); + long t2 = (curr->ru_utime.tv_usec - last->ru_utime.tv_usec) + (curr->ru_stime.tv_usec - last->ru_stime.tv_usec); + return (double)t1 + t2 * 1e-6; +} +/* --- END: utilities --- */ + +/* --- BEGIN: processing partial hits --- */ +static void save_hits(const bwtl_t *bwt, int thres, bsw2hit_t *hits, bsw2entry_t *u) +{ + int i; + uint32_t k; // this is fine + for (i = 0; i < u->n; ++i) { + bsw2cell_t *p = u->array + i; + if (p->G < thres) continue; + for (k = u->tk; k <= u->tl; ++k) { + int beg, end; + bsw2hit_t *q = 0; + beg = bwt->sa[k]; end = beg + p->tlen; + if (p->G > hits[beg*2].G) { + hits[beg*2+1] = hits[beg*2]; + q = hits + beg * 2; + } else if (p->G > hits[beg*2+1].G) q = hits + beg * 2 + 1; + if (q) { + q->k = p->qk; q->l = p->ql; q->len = p->qlen; q->G = p->G; + q->beg = beg; q->end = end; q->G2 = q->k == q->l? 0 : q->G; + q->flag = q->n_seeds = 0; + } + } + } +} +/* "narrow hits" are node-to-node hits that have a high score and + * are not so repetitive (|SA interval|<=IS). */ +static void save_narrow_hits(const bwtl_t *bwtl, bsw2entry_t *u, bwtsw2_t *b1, int t, int IS) +{ + int i; + for (i = 0; i < u->n; ++i) { + bsw2hit_t *q; + bsw2cell_t *p = u->array + i; + if (p->G >= t && p->ql - p->qk + 1 <= IS) { // good narrow hit + if (b1->max == b1->n) { + b1->max = b1->max? 
b1->max<<1 : 4; + b1->hits = realloc(b1->hits, b1->max * sizeof(bsw2hit_t)); + } + q = &b1->hits[b1->n++]; + q->k = p->qk; q->l = p->ql; + q->len = p->qlen; + q->G = p->G; q->G2 = 0; + q->beg = bwtl->sa[u->tk]; q->end = q->beg + p->tlen; + q->flag = 0; + // delete p + p->qk = p->ql = 0; p->G = 0; + if (p->ppos >= 0) u->array[p->ppos].cpos[p->pj] = -3; + } + } +} +/* after this, "narrow SA hits" will be expanded and the coordinates + * will be obtained and stored in b->hits[*].k. */ +int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int IS) +{ + int i, j, n, is_rev; + if (b->n == 0) return 0; + if (bwt && bns) { // convert to chromosomal coordinates if requested + int old_n = b->n; + bsw2hit_t *old_hits = b->hits; + for (i = n = 0; i < b->n; ++i) { // compute the memory to allocated + bsw2hit_t *p = old_hits + i; + if (p->l - p->k + 1 <= IS) n += p->l - p->k + 1; + else if (p->G > 0) ++n; + } + b->n = b->max = n; + b->hits = calloc(b->max, sizeof(bsw2hit_t)); + for (i = j = 0; i < old_n; ++i) { + bsw2hit_t *p = old_hits + i; + if (p->l - p->k + 1 <= IS) { // the hit is no so repetitive + bwtint_t k; + if (p->G == 0 && p->k == 0 && p->l == 0 && p->len == 0) continue; + for (k = p->k; k <= p->l; ++k) { + b->hits[j] = *p; + b->hits[j].k = bns_depos(bns, bwt_sa(bwt, k), &is_rev); + b->hits[j].l = 0; + b->hits[j].is_rev = is_rev; + if (is_rev) b->hits[j].k -= p->len - 1; + ++j; + } + } else if (p->G > 0) { + b->hits[j] = *p; + b->hits[j].k = bns_depos(bns, bwt_sa(bwt, p->k), &is_rev); + b->hits[j].l = 0; + b->hits[j].flag |= 1; + b->hits[j].is_rev = is_rev; + if (is_rev) b->hits[j].k -= p->len - 1; + ++j; + } + } + free(old_hits); + } + for (i = j = 0; i < b->n; ++i) // squeeze out empty elements + if (b->hits[i].G) b->hits[j++] = b->hits[i]; + b->n = j; + ks_introsort(hitG, b->n, b->hits); + for (i = 1; i < b->n; ++i) { + bsw2hit_t *p = b->hits + i; + for (j = 0; j < i; ++j) { + bsw2hit_t *q = b->hits + j; + int compatible = 1; + if (p->is_rev 
!= q->is_rev) continue; // hits from opposite strands are not duplicates + if (p->l == 0 && q->l == 0) { + int qol = (p->end < q->end? p->end : q->end) - (p->beg > q->beg? p->beg : q->beg); // length of query overlap + if (qol < 0) qol = 0; + if ((float)qol / (p->end - p->beg) > MASK_LEVEL || (float)qol / (q->end - q->beg) > MASK_LEVEL) { + int64_t tol = (int64_t)(p->k + p->len < q->k + q->len? p->k + p->len : q->k + q->len) + - (int64_t)(p->k > q->k? p->k : q->k); // length of target overlap + if ((double)tol / p->len > MASK_LEVEL || (double)tol / q->len > MASK_LEVEL) + compatible = 0; + } + } + if (!compatible) { + p->G = 0; + if (q->G2 < p->G2) q->G2 = p->G2; + break; + } + } + } + n = i; + for (i = j = 0; i < n; ++i) { + if (b->hits[i].G == 0) continue; + if (i != j) b->hits[j++] = b->hits[i]; + else ++j; + } + b->n = j; + return b->n; +} + +int bsw2_resolve_query_overlaps(bwtsw2_t *b, float mask_level) +{ + int i, j, n; + if (b->n == 0) return 0; + ks_introsort(hitG, b->n, b->hits); + { // choose a random one + int G0 = b->hits[0].G; + for (i = 1; i < b->n; ++i) + if (b->hits[i].G != G0) break; + j = (int)(i * drand48()); + if (j) { + bsw2hit_t tmp; + tmp = b->hits[0]; b->hits[0] = b->hits[j]; b->hits[j] = tmp; + } + } + for (i = 1; i < b->n; ++i) { + bsw2hit_t *p = b->hits + i; + int all_compatible = 1; + if (p->G == 0) break; + for (j = 0; j < i; ++j) { + bsw2hit_t *q = b->hits + j; + int64_t tol = 0; + int qol, compatible = 0; + float fol; + if (q->G == 0) continue; + qol = (p->end < q->end? p->end : q->end) - (p->beg > q->beg? p->beg : q->beg); + if (qol < 0) qol = 0; + if (p->l == 0 && q->l == 0) { + tol = (int64_t)(p->k + p->len < q->k + q->len? p->k + p->len : q->k + q->len) + - (p->k > q->k? p->k : q->k); + if (tol < 0) tol = 0; + } + fol = (float)qol / (p->end - p->beg < q->end - q->beg? 
p->end - p->beg : q->end - q->beg); + if (fol < mask_level || (tol > 0 && qol < p->end - p->beg && qol < q->end - q->beg)) compatible = 1; + if (!compatible) { + if (q->G2 < p->G) q->G2 = p->G; + all_compatible = 0; + } + } + if (!all_compatible) p->G = 0; + } + n = i; + for (i = j = 0; i < n; ++i) { + if (b->hits[i].G == 0) continue; + if (i != j) b->hits[j++] = b->hits[i]; + else ++j; + } + b->n = j; + return j; +} +/* --- END: processing partial hits --- */ + +/* --- BEGIN: global mem pool --- */ +bsw2global_t *bsw2_global_init() +{ + bsw2global_t *pool; + bsw2stack_t *stack; + pool = calloc(1, sizeof(bsw2global_t)); + stack = calloc(1, sizeof(bsw2stack_t)); + stack->pool = (mempool_t*)calloc(1, sizeof(mempool_t)); + pool->stack = (void*)stack; + return pool; +} + +void bsw2_global_destroy(bsw2global_t *pool) +{ + stack_destroy((bsw2stack_t*)pool->stack); + free(pool->aln_mem); + free(pool); +} +/* --- END: global mem pool --- */ + +static inline int fill_cell(const bsw2opt_t *o, int match_score, bsw2cell_t *c[4]) +{ + int G = c[3]? c[3]->G + match_score : MINUS_INF; + if (c[1]) { + c[0]->I = c[1]->I > c[1]->G - o->q? c[1]->I - o->r : c[1]->G - o->qr; + if (c[0]->I > G) G = c[0]->I; + } else c[0]->I = MINUS_INF; + if (c[2]) { + c[0]->D = c[2]->D > c[2]->G - o->q? c[2]->D - o->r : c[2]->G - o->qr; + if (c[0]->D > G) G = c[0]->D; + } else c[0]->D = MINUS_INF; + return(c[0]->G = G); +} + +static void init_bwtsw2(const bwtl_t *target, const bwt_t *query, bsw2stack_t *s) +{ + bsw2entry_t *u; + bsw2cell_t *x; + + u = mp_alloc(s->pool); + u->tk = 0; u->tl = target->seq_len; + x = push_array_p(u); + *x = g_default_cell; + x->G = 0; x->qk = 0; x->ql = query->seq_len; + u->n++; + stack_push0(s, u); +} +/* On return, ret[1] keeps not-so-repetitive hits (narrow SA hits); ret[0] keeps all hits (right?) 
*/ +bwtsw2_t **bsw2_core(const bntseq_t *bns, const bsw2opt_t *opt, const bwtl_t *target, const bwt_t *query, bsw2global_t *pool) +{ + bsw2stack_t *stack = (bsw2stack_t*)pool->stack; + bwtsw2_t *b, *b1, **b_ret; + int i, j, score_mat[16], *heap, heap_size, n_tot = 0; + struct rusage curr, last; + khash_t(qintv) *rhash; + khash_t(64) *chash; + + // initialize connectivity hash (chash) + chash = bsw2_connectivity(target); + // calculate score matrix + for (i = 0; i != 4; ++i) + for (j = 0; j != 4; ++j) + score_mat[i<<2|j] = (i == j)? opt->a : -opt->b; + // initialize other variables + rhash = kh_init(qintv); + init_bwtsw2(target, query, stack); + heap_size = opt->z; + heap = calloc(heap_size, sizeof(int)); + // initialize the return struct + b = (bwtsw2_t*)calloc(1, sizeof(bwtsw2_t)); + b->n = b->max = target->seq_len * 2; + b->hits = calloc(b->max, sizeof(bsw2hit_t)); + b1 = (bwtsw2_t*)calloc(1, sizeof(bwtsw2_t)); + b_ret = calloc(2, sizeof(void*)); + b_ret[0] = b; b_ret[1] = b1; + // initialize timer + getrusage(0, &last); + // the main loop: traversal of the DAG + while (!stack_isempty(stack)) { + int old_n, tj; + bsw2entry_t *v; + uint32_t tcntk[4], tcntl[4]; + bwtint_t k, l; + + v = stack_pop(stack); old_n = v->n; + n_tot += v->n; + + for (i = 0; i < v->n; ++i) { // test max depth and band width + bsw2cell_t *p = v->array + i; + if (p->ql == 0) continue; + if (p->tlen - (int)p->qlen > opt->bw || (int)p->qlen - p->tlen > opt->bw) { + p->qk = p->ql = 0; + if (p->ppos >= 0) v->array[p->ppos].cpos[p->pj] = -5; + } + } + + // get Occ for the DAG + bwtl_2occ4(target, v->tk - 1, v->tl, tcntk, tcntl); + for (tj = 0; tj != 4; ++tj) { // descend to the children + bwtint_t qcntk[4], qcntl[4]; + int qj, *curr_score_mat = score_mat + tj * 4; + khiter_t iter; + bsw2entry_t *u; + + k = target->L2[tj] + tcntk[tj] + 1; + l = target->L2[tj] + tcntl[tj]; + if (k > l) continue; + // update counter + iter = kh_get(64, chash, (uint64_t)k<<32 | l); + --kh_value(chash, iter); + // 
initialization + u = mp_alloc(stack->pool); + u->tk = k; u->tl = l; + memset(heap, 0, sizeof(int) * opt->z); + // loop through all the nodes in v + for (i = 0; i < v->n; ++i) { + bsw2cell_t *p = v->array + i, *x, *c[4]; // c[0]=>current, c[1]=>I, c[2]=>D, c[3]=>G + int is_added = 0; + if (p->ql == 0) continue; // deleted node + c[0] = x = push_array_p(u); + x->G = MINUS_INF; + p->upos = x->upos = -1; + if (p->ppos >= 0) { // parent has been visited + c[1] = (v->array[p->ppos].upos >= 0)? u->array + v->array[p->ppos].upos : 0; + c[3] = v->array + p->ppos; c[2] = p; + if (fill_cell(opt, curr_score_mat[p->pj], c) > 0) { // then update topology at p and x + x->ppos = v->array[p->ppos].upos; // the parent pos in u + p->upos = u->n++; // the current pos in u + if (x->ppos >= 0) u->array[x->ppos].cpos[p->pj] = p->upos; // the child pos of its parent in u + is_added = 1; + } + } else { + x->D = p->D > p->G - opt->q? p->D - opt->r : p->G - opt->qr; + if (x->D > 0) { + x->G = x->D; + x->I = MINUS_INF; x->ppos = -1; + p->upos = u->n++; + is_added = 1; + } + } + if (is_added) { // x has been added to u->array. 
fill the remaining variables + x->cpos[0] = x->cpos[1] = x->cpos[2] = x->cpos[3] = -1; + x->pj = p->pj; x->qk = p->qk; x->ql = p->ql; x->qlen = p->qlen; x->tlen = p->tlen + 1; + if (x->G > -heap[0]) { + heap[0] = -x->G; + ks_heapadjust(int, 0, heap_size, heap); + } + } + if ((x->G > opt->qr && x->G >= -heap[0]) || i < old_n) { // good node in u, or in v + if (p->cpos[0] == -1 || p->cpos[1] == -1 || p->cpos[2] == -1 || p->cpos[3] == -1) { + bwt_2occ4(query, p->qk - 1, p->ql, qcntk, qcntl); + for (qj = 0; qj != 4; ++qj) { // descend to the prefix trie + if (p->cpos[qj] != -1) continue; // this node will be visited later + k = query->L2[qj] + qcntk[qj] + 1; + l = query->L2[qj] + qcntl[qj]; + if (k > l) { p->cpos[qj] = -2; continue; } + x = push_array_p(v); + p = v->array + i; // p may not point to the correct position after realloc + x->G = x->I = x->D = MINUS_INF; + x->qk = k; x->ql = l; x->pj = qj; x->qlen = p->qlen + 1; x->ppos = i; x->tlen = p->tlen; + x->cpos[0] = x->cpos[1] = x->cpos[2] = x->cpos[3] = -1; + p->cpos[qj] = v->n++; + } // ~for(qj) + } // ~if(p->cpos[]) + } // ~if + } // ~for(i) + if (u->n) save_hits(target, opt->t, b->hits, u); + { // push u to the stack (or to the pending array) + uint32_t cnt, pos; + cnt = (uint32_t)kh_value(chash, iter); + pos = kh_value(chash, iter)>>32; + if (pos) { // something in the pending array, then merge + bsw2entry_t *w = kv_A(stack->pending, pos-1); + if (u->n) { + if (w->n < u->n) { // swap + w = u; u = kv_A(stack->pending, pos-1); kv_A(stack->pending, pos-1) = w; + } + merge_entry(opt, w, u, b); + } + if (cnt == 0) { // move from pending to stack0 + remove_duplicate(w, rhash); + save_narrow_hits(target, w, b1, opt->t, opt->is); + cut_tail(w, opt->z, u); + stack_push0(stack, w); + kv_A(stack->pending, pos-1) = 0; + --stack->n_pending; + } + mp_free(stack->pool, u); + } else if (cnt) { // the first time + if (u->n) { // push to the pending queue + ++stack->n_pending; + kv_push(bsw2entry_p, stack->pending, u); + 
kh_value(chash, iter) = (uint64_t)kv_size(stack->pending)<<32 | cnt; + } else mp_free(stack->pool, u); + } else { // cnt == 0, then push to the stack + bsw2entry_t *w = mp_alloc(stack->pool); + save_narrow_hits(target, u, b1, opt->t, opt->is); + cut_tail(u, opt->z, w); + mp_free(stack->pool, w); + stack_push0(stack, u); + } + } + } // ~for(tj) + mp_free(stack->pool, v); + } // while(top) + getrusage(0, &curr); + for (i = 0; i < 2; ++i) + for (j = 0; j < b_ret[i]->n; ++j) + b_ret[i]->hits[j].n_seeds = 0; + bsw2_resolve_duphits(bns, query, b, opt->is); + bsw2_resolve_duphits(bns, query, b1, opt->is); + //fprintf(stderr, "stats: %.3lf sec; %d elems\n", time_elapse(&curr, &last), n_tot); + // free + free(heap); + kh_destroy(qintv, rhash); + kh_destroy(64, chash); + stack->pending.n = stack->stack0.n = 0; + return b_ret; +} diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/bwtsw2_main.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/bwtsw2_main.c Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,95 @@ +#include +#include +#include +#include +#include +#include "bwt.h" +#include "bwtsw2.h" +#include "utils.h" + +int bwa_bwtsw2(int argc, char *argv[]) +{ + extern char *bwa_infer_prefix(const char *hint); + bsw2opt_t *opt; + bwt_t *target; + char buf[1024], *prefix; + bntseq_t *bns; + int c; + + opt = bsw2_init_opt(); + srand48(11); + while ((c = getopt(argc, argv, "q:r:a:b:t:T:w:d:z:m:s:c:N:Hf:MI:S")) >= 0) { + switch (c) { + case 'q': opt->q = atoi(optarg); break; + case 'r': opt->r = atoi(optarg); break; + case 'a': opt->a = atoi(optarg); break; + case 'b': opt->b = atoi(optarg); break; + case 'w': opt->bw = atoi(optarg); break; + case 'T': opt->t = atoi(optarg); break; + case 't': opt->n_threads = atoi(optarg); break; + case 'z': opt->z = atoi(optarg); break; + case 's': opt->is = atoi(optarg); break; + case 'm': opt->mask_level = atof(optarg); break; + case 'c': opt->coef = atof(optarg); break; + case 'N': opt->t_seeds = atoi(optarg); break; + case 'M': 
opt->multi_2nd = 1; break; + case 'H': opt->hard_clip = 1; break; + case 'f': xreopen(optarg, "w", stdout); break; + case 'I': opt->max_ins = atoi(optarg); break; + case 'S': opt->skip_sw = 1; break; + } + } + opt->qr = opt->q + opt->r; + + if (optind + 2 > argc) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: bwa bwasw [options] [query2.fa]\n\n"); + fprintf(stderr, "Options: -a INT score for a match [%d]\n", opt->a); + fprintf(stderr, " -b INT mismatch penalty [%d]\n", opt->b); + fprintf(stderr, " -q INT gap open penalty [%d]\n", opt->q); + fprintf(stderr, " -r INT gap extension penalty [%d]\n", opt->r); + fprintf(stderr, " -w INT band width [%d]\n", opt->bw); + fprintf(stderr, " -m FLOAT mask level [%.2f]\n", opt->mask_level); + fprintf(stderr, "\n"); + fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); + fprintf(stderr, " -f FILE file to output results to instead of stdout\n"); + fprintf(stderr, " -H in SAM output, use hard clipping instead of soft clipping\n"); + fprintf(stderr, " -M mark multi-part alignments as secondary\n"); + fprintf(stderr, " -S skip Smith-Waterman read pairing\n"); + fprintf(stderr, " -I INT ignore pairs with insert >=INT for inferring the size distr [%d]\n", opt->max_ins); + fprintf(stderr, "\n"); + fprintf(stderr, " -T INT score threshold divided by a [%d]\n", opt->t); + fprintf(stderr, " -c FLOAT coefficient of length-threshold adjustment [%.1f]\n", opt->coef); + fprintf(stderr, " -z INT Z-best [%d]\n", opt->z); + fprintf(stderr, " -s INT maximum seeding interval size [%d]\n", opt->is); + fprintf(stderr, " -N INT # seeds to trigger reverse alignment [%d]\n", opt->t_seeds); + fprintf(stderr, "\n"); + fprintf(stderr, "Note: For long Illumina, 454 and Sanger reads, assembly contigs, fosmids and\n"); + fprintf(stderr, " BACs, the default setting usually works well. For the current PacBio\n"); + fprintf(stderr, " reads (end of 2010), '-b5 -q2 -r1 -z10' is recommended. 
One may also\n"); + fprintf(stderr, " increase '-z' for better sensitivity.\n"); + fprintf(stderr, "\n"); + + return 1; + } + + // adjust opt for opt->a + opt->t *= opt->a; + opt->coef *= opt->a; + + if ((prefix = bwa_infer_prefix(argv[optind])) == 0) { + fprintf(stderr, "[%s] fail to locate the index\n", __func__); + return 0; + } + strcpy(buf, prefix); target = bwt_restore_bwt(strcat(buf, ".bwt")); + strcpy(buf, prefix); bwt_restore_sa(strcat(buf, ".sa"), target); + bns = bns_restore(prefix); + + bsw2_aln(opt, bns, target, argv[optind+1], optind+2 < argc? argv[optind+2] : 0); + + bns_destroy(bns); + bwt_destroy(target); + free(opt); free(prefix); + + return 0; +} diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/bwtsw2_pair.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/bwtsw2_pair.c Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,291 @@ +#include +#include +#include +#include +#include "bwt.h" +#include "bntseq.h" +#include "bwtsw2.h" +#include "kstring.h" +#ifndef _NO_SSE2 +#include "ksw.h" +#else +#include "stdaln.h" +#endif + +#define MIN_RATIO 0.8 +#define OUTLIER_BOUND 2.0 +#define MAX_STDDEV 4.0 +#define EXT_STDDEV 4.0 + +typedef struct { + int low, high, failed; + double avg, std; +} bsw2pestat_t; + +bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf, kstring_t *msg, int max_ins) +{ + extern void ks_introsort_uint64_t(size_t n, uint64_t *a); + int i, k, x, p25, p50, p75, tmp, max_len = 0; + uint64_t *isize; + bsw2pestat_t r; + + memset(&r, 0, sizeof(bsw2pestat_t)); + isize = calloc(n, 8); + for (i = k = 0; i < n; i += 2) { + bsw2hit_t *t[2]; + int l; + if (buf[i] == 0 || buf[i]->n != 1 || buf[i+1]->n != 1) continue; // more than 1 hits + t[0] = &buf[i]->hits[0]; t[1] = &buf[i+1]->hits[0]; + if (t[0]->G2 > 0.8 * t[0]->G) continue; // the best hit is not good enough + if (t[1]->G2 > 0.8 * t[1]->G) continue; // the best hit is not good enough + l = t[0]->k > t[1]->k? 
t[0]->k - t[1]->k + t[1]->len : t[1]->k - t[0]->k + t[0]->len; + if (l >= max_ins) continue; // skip pairs with excessively large insert + max_len = max_len > t[0]->end - t[0]->beg? max_len : t[0]->end - t[0]->beg; + max_len = max_len > t[1]->end - t[1]->beg? max_len : t[1]->end - t[1]->beg; + isize[k++] = l; + } + ks_introsort_uint64_t(k, isize); + p25 = isize[(int)(.25 * k + .499)]; + p50 = isize[(int)(.50 * k + .499)]; + p75 = isize[(int)(.75 * k + .499)]; + ksprintf(msg, "[%s] infer the insert size distribution from %d high-quality pairs.\n", __func__, k); + if (k < 8) { + ksprintf(msg, "[%s] fail to infer the insert size distribution.\n", __func__); + free(isize); + r.failed = 1; + return r; + } + tmp = (int)(p25 - OUTLIER_BOUND * (p75 - p25) + .499); + r.low = tmp > max_len? tmp : max_len; + if (r.low < 1) r.low = 1; + r.high = (int)(p75 + OUTLIER_BOUND * (p75 - p25) + .499); + ksprintf(msg, "[%s] (25, 50, 75) percentile: (%d, %d, %d)\n", __func__, p25, p50, p75); + ksprintf(msg, "[%s] low and high boundaries for computing mean and std.dev: (%d, %d)\n", __func__, r.low, r.high); + for (i = x = 0, r.avg = 0; i < k; ++i) + if (isize[i] >= r.low && isize[i] <= r.high) + r.avg += isize[i], ++x; + r.avg /= x; + for (i = 0, r.std = 0; i < k; ++i) + if (isize[i] >= r.low && isize[i] <= r.high) + r.std += (isize[i] - r.avg) * (isize[i] - r.avg); + r.std = sqrt(r.std / x); + ksprintf(msg, "[%s] mean and std.dev: (%.2f, %.2f)\n", __func__, r.avg, r.std); + tmp = (int)(p25 - 3. * (p75 - p25) + .499); + r.low = tmp > max_len? tmp : max_len; + if (r.low < 1) r.low = 1; + r.high = (int)(p75 + 3. * (p75 - p25) + .499); + if (r.low > r.avg - MAX_STDDEV * 4.) r.low = (int)(r.avg - MAX_STDDEV * 4. + .499); + r.low = tmp > max_len? tmp : max_len; + if (r.high < r.avg - MAX_STDDEV * 4.) r.high = (int)(r.avg + MAX_STDDEV * 4. 
+ .499); + ksprintf(msg, "[%s] low and high boundaries for proper pairs: (%d, %d)\n", __func__, r.low, r.high); + free(isize); + return r; +} + +typedef struct { + int n_cigar, beg, end, len; + int64_t pos; + uint32_t *cigar; +} pairaux_t; + +extern unsigned char nst_nt4_table[256]; + +void bsw2_pair1(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, const bsw2pestat_t *st, const bsw2hit_t *h, int l_mseq, const char *mseq, bsw2hit_t *a, int8_t g_mat[25]) +{ + extern void seq_reverse(int len, ubyte_t *seq, int is_comp); + int64_t k, beg, end; + uint8_t *seq, *ref; + int i; + // compute the region start and end + a->n_seeds = 1; a->flag |= BSW2_FLAG_MATESW; // before calling this routine, *a has been cleared with memset(0); the flag is set with 1<<6/7 + if (h->is_rev == 0) { + beg = (int64_t)(h->k + st->avg - EXT_STDDEV * st->std - l_mseq + .499); + if (beg < h->k) beg = h->k; + end = (int64_t)(h->k + st->avg + EXT_STDDEV * st->std + .499); + a->is_rev = 1; a->flag |= 16; + } else { + beg = (int64_t)(h->k + h->end - h->beg - st->avg - EXT_STDDEV * st->std + .499); + end = (int64_t)(h->k + h->end - h->beg - st->avg + EXT_STDDEV * st->std + l_mseq + .499); + if (end > h->k + (h->end - h->beg)) end = h->k + (h->end - h->beg); + a->is_rev = 0; + } + if (beg < 1) beg = 1; + if (end > l_pac) end = l_pac; + if (end - beg < l_mseq) return; + // generate the sequence + seq = malloc(l_mseq + (end - beg)); + ref = seq + l_mseq; + for (k = beg; k < end; ++k) + ref[k - beg] = pac[k>>2] >> ((~k&3)<<1) & 0x3; + if (h->is_rev == 0) { + for (i = 0; i < l_mseq; ++i) { // on the reverse strand + int c = nst_nt4_table[(int)mseq[i]]; + seq[l_mseq - 1 - i] = c > 3? 
4 : 3 - c; + } + } else { + for (i = 0; i < l_mseq; ++i) // on the forward strand + seq[i] = nst_nt4_table[(int)mseq[i]]; + } +#ifndef _NO_SSE2 + { + ksw_query_t *q; + ksw_aux_t aux[2]; + // forward Smith-Waterman + aux[0].T = opt->t; aux[0].gapo = opt->q; aux[0].gape = opt->r; aux[1] = aux[0]; + q = ksw_qinit(l_mseq * g_mat[0] < 250? 1 : 2, l_mseq, seq, 5, g_mat); + ksw_sse2(q, end - beg, ref, &aux[0]); + free(q); + if (aux[0].score < opt->t) { + free(seq); + return; + } + ++aux[0].qe; ++aux[0].te; + // reverse Smith-Waterman + seq_reverse(aux[0].qe, seq, 0); + seq_reverse(aux[0].te, ref, 0); + q = ksw_qinit(aux[0].qe * g_mat[0] < 250? 1 : 2, aux[0].qe, seq, 5, g_mat); + ksw_sse2(q, aux[0].te, ref, &aux[1]); + free(q); + ++aux[1].qe; ++aux[1].te; + // write output + a->G = aux[0].score; + a->G2 = aux[0].score2 > aux[1].score2? aux[0].score2 : aux[1].score2; + if (a->G2 < opt->t) a->G2 = 0; + if (a->G2) a->flag |= BSW2_FLAG_TANDEM; + a->k = beg + (aux[0].te - aux[1].te); + a->len = aux[1].te; + a->beg = aux[0].qe - aux[1].qe; + a->end = aux[0].qe; + } +#else + { + AlnParam ap; + path_t path[2]; + int matrix[25]; + for (i = 0; i < 25; ++i) matrix[i] = g_mat[i]; + ap.gap_open = opt->q; ap.gap_ext = opt->r; ap.gap_end = opt->r; + ap.matrix = matrix; ap.row = 5; ap.band_width = 50; + a->G = aln_local_core(ref, end - beg, seq, l_mseq, &ap, path, 0, opt->t, &a->G2); + if (a->G < opt->t) a->G = 0; + if (a->G2 < opt->t) a->G2 = 0; + a->k = beg + path[0].i - 1; + a->len = path[1].i - path[0].i + 1; + a->beg = path[0].j - 1; + a->end = path[1].j; + } +#endif + if (a->is_rev) i = a->beg, a->beg = l_mseq - a->end, a->end = l_mseq - i; + free(seq); +} + +void bsw2_pair(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, int n, bsw2seq1_t *seq, bwtsw2_t **hits) +{ + extern int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int IS); + bsw2pestat_t pes; + int i, j, k, n_rescued = 0, n_moved = 0, n_fixed = 0; + int8_t g_mat[25]; + kstring_t msg; + 
memset(&msg, 0, sizeof(kstring_t)); + pes = bsw2_stat(n, hits, &msg, opt->max_ins); + for (i = k = 0; i < 5; ++i) { + for (j = 0; j < 4; ++j) + g_mat[k++] = i == j? opt->a : -opt->b; + g_mat[k++] = 0; + } + for (i = 0; i < n; i += 2) { + bsw2hit_t a[2]; + memset(&a, 0, sizeof(bsw2hit_t) * 2); + a[0].flag = 1<<6; a[1].flag = 1<<7; + for (j = 0; j < 2; ++j) { // set the read1/2 flag + if (hits[i+j] == 0) continue; + for (k = 0; k < hits[i+j]->n; ++k) { + bsw2hit_t *p = &hits[i+j]->hits[k]; + p->flag |= 1<<(6+j); + } + } + if (pes.failed) continue; + if (hits[i] == 0 || hits[i+1] == 0) continue; // one end has excessive N + if (hits[i]->n != 1 && hits[i+1]->n != 1) continue; // no end has exactly one hit + if (hits[i]->n > 1 || hits[i+1]->n > 1) continue; // one read has more than one hit + if (!opt->skip_sw) { + if (hits[i+0]->n == 1) bsw2_pair1(opt, l_pac, pac, &pes, &hits[i+0]->hits[0], seq[i+1].l, seq[i+1].seq, &a[1], g_mat); + if (hits[i+1]->n == 1) bsw2_pair1(opt, l_pac, pac, &pes, &hits[i+1]->hits[0], seq[i+0].l, seq[i+0].seq, &a[0], g_mat); + } // else a[0].G == a[1].G == a[0].G2 == a[1].G2 == 0 + // the following enumerate all possibilities. It is tedious but necessary... 
+ if (hits[i]->n + hits[i+1]->n == 1) { // one end mapped; the other not; + bwtsw2_t *p[2]; + int which; + if (hits[i]->n == 1) p[0] = hits[i], p[1] = hits[i+1], which = 1; + else p[0] = hits[i+1], p[1] = hits[i], which = 0; + if (a[which].G == 0) continue; + a[which].flag |= BSW2_FLAG_RESCUED; + if (p[1]->max == 0) { + p[1]->max = 1; + p[1]->hits = malloc(sizeof(bsw2hit_t)); + } + p[1]->hits[0] = a[which]; + p[1]->n = 1; + p[0]->hits[0].flag |= 2; + p[1]->hits[0].flag |= 2; + ++n_rescued; + } else { // then both ends mapped + int is_fixed = 0; + //fprintf(stderr, "%d; %lld,%lld; %d,%d\n", a[0].is_rev, hits[i]->hits[0].k, a[0].k, hits[i]->hits[0].end, a[0].end); + for (j = 0; j < 2; ++j) { // fix wrong mappings and wrong suboptimal alignment score + bsw2hit_t *p = &hits[i+j]->hits[0]; + if (p->G < a[j].G) { // the orginal mapping is suboptimal + a[j].G2 = a[j].G2 > p->G? a[j].G2 : p->G; // FIXME: reset BSW2_FLAG_TANDEM? + *p = a[j]; + ++n_fixed; + is_fixed = 1; + } else if (p->k != a[j].k && p->G2 < a[j].G) { + p->G2 = a[j].G; + } else if (p->k == a[j].k && p->G2 < a[j].G2) { + p->G2 = a[j].G2; + } + } + if (hits[i]->hits[0].k == a[0].k && hits[i+1]->hits[0].k == a[1].k) { // properly paired and no ends need to be moved + for (j = 0; j < 2; ++j) + hits[i+j]->hits[0].flag |= 2 | (a[j].flag & BSW2_FLAG_TANDEM); + } else if (hits[i]->hits[0].k == a[0].k || hits[i+1]->hits[0].k == a[1].k) { // a tandem match + for (j = 0; j < 2; ++j) { + hits[i+j]->hits[0].flag |= 2; + if (hits[i+j]->hits[0].k != a[j].k) + hits[i+j]->hits[0].flag |= BSW2_FLAG_TANDEM; + } + } else if (!is_fixed && (a[0].G || a[1].G)) { // it is possible to move one end + if (a[0].G && a[1].G) { // now we have two "proper pairs" + int G[2]; + double diff; + G[0] = hits[i]->hits[0].G + a[1].G; + G[1] = hits[i+1]->hits[0].G + a[0].G; + diff = fabs(G[0] - G[1]) / (opt->a + opt->b) / ((hits[i]->hits[0].len + a[1].len + hits[i+1]->hits[0].len + a[0].len) / 2.); + if (diff > 0.05) a[G[0] > G[1]? 
0 : 1].G = 0; + } + if (a[0].G == 0 || a[1].G == 0) { // one proper pair only + bsw2hit_t *p[2]; // p[0] points the unchanged hit; p[1] to the hit to be moved + int which, isize; + double dev, diff; + if (a[0].G) p[0] = &hits[i+1]->hits[0], p[1] = &hits[i]->hits[0], which = 0; + else p[0] = &hits[i]->hits[0], p[1] = &hits[i+1]->hits[0], which = 1; + isize = p[0]->is_rev? p[0]->k + p[0]->len - a[which].k : a[which].k + a[which].len - p[0]->k; + dev = fabs(isize - pes.avg) / pes.std; + diff = (double)(p[1]->G - a[which].G) / (opt->a + opt->b) / (p[1]->end - p[1]->beg) * 100.0; + if (diff < dev * 2.) { // then move (heuristic) + a[which].G2 = a[which].G; + p[1][0] = a[which]; + p[1]->flag |= BSW2_FLAG_MOVED | 2; + p[0]->flag |= 2; + ++n_moved; + } + } + } else if (is_fixed) { + hits[i+0]->hits[0].flag |= 2; + hits[i+1]->hits[0].flag |= 2; + } + } + } + ksprintf(&msg, "[%s] #fixed=%d, #rescued=%d, #moved=%d\n", __func__, n_fixed, n_rescued, n_moved); + fputs(msg.s, stderr); + free(msg.s); +} diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/cs2nt.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/cs2nt.c Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,191 @@ +#include +#include +#include +#include "bwtaln.h" +#include "stdaln.h" + +/* + Here is a delicate example. ref_nt=ATTAAC(RBRBG), read_cs=RBBOG. If we + decode as ATTGAC(RBGOG), there are one color change and one nt change; + if we decode as ATTAAC(RBRBG), there are two color changes. + + In DP, if color quality is smaller than COLOR_MM, we will use COLOR_MM + as the penalty; otherwise, we will use color quality as the + penalty. This means we always prefer two consistent color changes over + a nt change, but if a color has high quality, we may prefer one nt + change. + + In the above example, the penalties of the two types of decoding are + q(B)+25 and q(B)+q(O), respectively. If q(O)>25, we prefer the first; + otherwise the second. 
Note that no matter what we choose, the fourth + base will get a low nt quality. + */ + +#define COLOR_MM 19 +#define NUCL_MM 25 + +static const int nst_ntnt2cs_table[] = { 4, 0, 0, 1, 0, 2, 3, 4, 0, 3, 2, 4, 1, 4, 4, 4 }; + +/* + {A,C,G,T,N} -> {0,1,2,3,4} + nt_ref[0..size]: nucleotide reference: 0/1/2/3/4 + cs_read[0..size-1]: color read+qual sequence: base<<6|qual; qual==63 for N + nt_read[0..size]: nucleotide read sequence: 0/1/2/3 (returned) + btarray[0..4*size]: backtrack array (working space) + */ +void cs2nt_DP(int size, const uint8_t *nt_ref, const uint8_t *cs_read, uint8_t *nt_read, uint8_t *btarray) +{ + int h[8], curr, last; + int x, y, xmin, hmin, k; + + // h[0..3] and h[4..7] are the current and last best score array, depending on curr and last + + // recursion: initial value + if (nt_ref[0] >= 4) memset(h, 0, sizeof(int) << 2); + else { + for (x = 0; x != 4; ++x) h[x] = NUCL_MM; + h[nt_ref[0]] = 0; + } + // recursion: main loop + curr = 1; last = 0; + for (k = 1; k <= size; ++k) { + for (x = 0; x != 4; ++x) { + int min = 0x7fffffff, ymin = 0; + for (y = 0; y != 4; ++y) { + int s = h[last<<2|y]; + if ((cs_read[k-1]&0x3f) != 63 && cs_read[k-1]>>6 != nst_ntnt2cs_table[1<= 0; --k) + nt_read[k] = btarray[(k+1)<<2 | nt_read[k+1]]; +} +/* + nt_read[0..size]: nucleotide read sequence: 0/1/2/3 + cs_read[0..size-1]: color read+qual sequence: base<<6|qual; qual==63 for N + tarray[0..size*2-1]: temporary array + */ +uint8_t *cs2nt_nt_qual(int size, const uint8_t *nt_read, const uint8_t *cs_read, uint8_t *tarray) +{ + int k, c1, c2; + uint8_t *t2array = tarray + size; + // get the color sequence of nt_read + c1 = nt_read[0]; + for (k = 1; k <= size; ++k) { + c2 = nt_read[k]; // in principle, there is no 'N' in nt_read[]; just in case + tarray[k-1] = (c1 >= 4 || c2 >= 4)? 
4 : nst_ntnt2cs_table[1<>6 && tarray[k] == cs_read[k]>>6) { + q = (int)(cs_read[k-1]&0x3f) + (int)(cs_read[k]&0x3f) + 10; + } else if (tarray[k-1] == cs_read[k-1]>>6) { + q = (int)(cs_read[k-1]&0x3f) - (int)(cs_read[k]&0x3f); + } else if (tarray[k] == cs_read[k]>>6) { + q = (int)(cs_read[k]&0x3f) - (int)(cs_read[k-1]&0x3f); + } // else, q = 0 + if (q < 0) q = 0; + if (q > 60) q = 60; + t2array[k] = nt_read[k]<<6 | q; + if ((cs_read[k-1]&0x3f) == 63 || (cs_read[k]&0x3f) == 63) t2array[k] = 0; + } + return t2array + 1; // of size-2 +} + +// this function will be called when p->seq has been reversed by refine_gapped() +void bwa_cs2nt_core(bwa_seq_t *p, bwtint_t l_pac, ubyte_t *pac) +{ + uint8_t *ta, *nt_read, *btarray, *tarray, *nt_ref, *cs_read, *new_nt_read; + int i, len; + uint8_t *seq; + + // set temporary arrays + if (p->type == BWA_TYPE_NO_MATCH) return; + len = p->len + p->n_gapo + p->n_gape + 100; // leave enough space + ta = (uint8_t*)malloc(len * 7); + nt_ref = ta; + cs_read = nt_ref + len; + nt_read = cs_read + len; + btarray = nt_read + len; + tarray = nt_read + len; + +#define __gen_csbase(_cs, _i, _seq) do { \ + int q = p->qual[p->strand? p->len - 1 - (_i) : (_i)] - 33; \ + if (q > 60) q = 60; \ + if (_seq[_i] > 3) q = 63; \ + (_cs) = _seq[_i]<<6 | q; \ + } while (0) + + // generate len, nt_ref[] and cs_read + seq = p->strand? p->rseq : p->seq; + nt_ref[0] = p->pos? 
bns_pac(pac, p->pos-1) : 4; + if (p->cigar == 0) { // no gap or clipping + len = p->len; + for (i = 0; i < p->len; ++i) { + __gen_csbase(cs_read[i], i, seq); + nt_ref[i+1] = bns_pac(pac, p->pos + i); + } + } else { + int k, z; + bwtint_t x, y; + x = p->pos; y = 0; + for (k = z = 0; k < p->n_cigar; ++k) { + int l = __cigar_len(p->cigar[k]); + if (__cigar_op(p->cigar[k]) == FROM_M) { + for (i = 0; i < l; ++i, ++x, ++y) { + __gen_csbase(cs_read[z], y, seq); + nt_ref[z+1] = bns_pac(pac, x); + ++z; + } + } else if (__cigar_op(p->cigar[k]) == FROM_I) { + for (i = 0; i < l; ++i, ++y) { + __gen_csbase(cs_read[z], y, seq); + nt_ref[z+1] = 4; + ++z; + } + } else if (__cigar_op(p->cigar[k]) == FROM_S) y += l; + else x += l; + } + len = z; + } + + cs2nt_DP(len, nt_ref, cs_read, nt_read, btarray); + new_nt_read = cs2nt_nt_qual(len, nt_read, cs_read, tarray); + + // update p + p->len = p->full_len = len - 1; + for (i = 0; i < p->len; ++i) { + if ((new_nt_read[i]&0x3f) == 63) { + p->qual[i] = 33; seq[i] = 4; + } else { + p->qual[i] = (new_nt_read[i]&0x3f) + 33; + seq[i] = new_nt_read[i]>>6; + } + } + p->qual[p->len] = seq[p->len] = 0; + if (p->strand) { + memcpy(p->seq, seq, p->len); + seq_reverse(p->len, p->seq, 1); + seq_reverse(p->len, p->qual, 0); + } else { + memcpy(p->rseq, seq, p->len); + seq_reverse(p->len, p->rseq, 1); + } + free(ta); +} diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/fastmap.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/fastmap.c Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,127 @@ +#include +#include +#include +#include +#include "bntseq.h" +#include "bwt.h" +#include "kvec.h" +#include "kseq.h" +KSEQ_INIT(gzFile, gzread) + +extern unsigned char nst_nt4_table[256]; + +typedef struct { + const bwt_t *bwt; + const uint8_t *query; + int start, len; + bwtintv_v *tmpvec[2], *matches; +} smem_i; + +smem_i *smem_iter_init(const bwt_t *bwt) +{ + smem_i *iter; + iter = calloc(1, sizeof(smem_i)); + iter->bwt = bwt; + iter->tmpvec[0] = calloc(1, 
sizeof(bwtintv_v)); + iter->tmpvec[1] = calloc(1, sizeof(bwtintv_v)); + iter->matches = calloc(1, sizeof(bwtintv_v)); + return iter; +} + +void smem_iter_destroy(smem_i *iter) +{ + free(iter->tmpvec[0]->a); + free(iter->tmpvec[1]->a); + free(iter->matches->a); + free(iter); +} + +void smem_set_query(smem_i *iter, int len, const uint8_t *query) +{ + iter->query = query; + iter->start = 0; + iter->len = len; +} + +int smem_next(smem_i *iter) +{ + iter->tmpvec[0]->n = iter->tmpvec[1]->n = iter->matches->n = 0; + if (iter->start >= iter->len || iter->start < 0) return -1; + while (iter->start < iter->len && iter->query[iter->start] > 3) ++iter->start; // skip ambiguous bases + if (iter->start == iter->len) return -1; + iter->start = bwt_smem1(iter->bwt, iter->len, iter->query, iter->start, iter->matches, iter->tmpvec); + return iter->start; +} + +int main_fastmap(int argc, char *argv[]) +{ + int c, i, min_iwidth = 20, min_len = 17, print_seq = 0; + kseq_t *seq; + bwtint_t k; + gzFile fp; + bwt_t *bwt; + bntseq_t *bns; + smem_i *iter; + + while ((c = getopt(argc, argv, "w:l:s")) >= 0) { + switch (c) { + case 's': print_seq = 1; break; + case 'w': min_iwidth = atoi(optarg); break; + case 'l': min_len = atoi(optarg); break; + } + } + if (optind + 1 >= argc) { + fprintf(stderr, "Usage: bwa fastmap [-s] [-l minLen=%d] [-w maxSaSize=%d] \n", min_len, min_iwidth); + return 1; + } + + fp = gzopen(argv[optind + 1], "r"); + seq = kseq_init(fp); + { // load the packed sequences, BWT and SA + char *tmp = calloc(strlen(argv[optind]) + 5, 1); + strcat(strcpy(tmp, argv[optind]), ".bwt"); + bwt = bwt_restore_bwt(tmp); + strcat(strcpy(tmp, argv[optind]), ".sa"); + bwt_restore_sa(tmp, bwt); + free(tmp); + bns = bns_restore(argv[optind]); + } + iter = smem_iter_init(bwt); + while (kseq_read(seq) >= 0) { + printf("SQ\t%s\t%ld", seq->name.s, seq->seq.l); + if (print_seq) { + putchar('\t'); + puts(seq->seq.s); + } else putchar('\n'); + for (i = 0; i < seq->seq.l; ++i) + seq->seq.s[i] = 
nst_nt4_table[(int)seq->seq.s[i]]; + smem_set_query(iter, seq->seq.l, (uint8_t*)seq->seq.s); + while (smem_next(iter) > 0) { + for (i = 0; i < iter->matches->n; ++i) { + bwtintv_t *p = &iter->matches->a[i]; + if ((uint32_t)p->info - (p->info>>32) < min_len) continue; + printf("EM\t%d\t%d\t%ld", (uint32_t)(p->info>>32), (uint32_t)p->info, (long)p->x[2]); + if (p->x[2] <= min_iwidth) { + for (k = 0; k < p->x[2]; ++k) { + bwtint_t pos; + int len, is_rev, ref_id; + len = (uint32_t)p->info - (p->info>>32); + pos = bns_depos(bns, bwt_sa(bwt, p->x[0] + k), &is_rev); + if (is_rev) pos -= len - 1; + bns_cnt_ambi(bns, pos, len, &ref_id); + printf("\t%s:%c%ld", bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - bns->anns[ref_id].offset) + 1); + } + } else fputs("\t*", stdout); + putchar('\n'); + } + } + puts("//"); + } + + smem_iter_destroy(iter); + bns_destroy(bns); + bwt_destroy(bwt); + kseq_destroy(seq); + gzclose(fp); + return 0; +} diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/is.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/is.c Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,218 @@ +/* + * sais.c for sais-lite + * Copyright (c) 2008 Yuta Mori All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include + +typedef unsigned char ubyte_t; +#define chr(i) (cs == sizeof(int) ? ((const int *)T)[i]:((const unsigned char *)T)[i]) + +/* find the start or end of each bucket */ +static void getCounts(const unsigned char *T, int *C, int n, int k, int cs) +{ + int i; + for (i = 0; i < k; ++i) C[i] = 0; + for (i = 0; i < n; ++i) ++C[chr(i)]; +} +static void getBuckets(const int *C, int *B, int k, int end) +{ + int i, sum = 0; + if (end) { + for (i = 0; i < k; ++i) { + sum += C[i]; + B[i] = sum; + } + } else { + for (i = 0; i < k; ++i) { + sum += C[i]; + B[i] = sum - C[i]; + } + } +} + +/* compute SA */ +static void induceSA(const unsigned char *T, int *SA, int *C, int *B, int n, int k, int cs) +{ + int *b, i, j; + int c0, c1; + /* compute SAl */ + if (C == B) getCounts(T, C, n, k, cs); + getBuckets(C, B, k, 0); /* find starts of buckets */ + j = n - 1; + b = SA + B[c1 = chr(j)]; + *b++ = ((0 < j) && (chr(j - 1) < c1)) ? ~j : j; + for (i = 0; i < n; ++i) { + j = SA[i], SA[i] = ~j; + if (0 < j) { + --j; + if ((c0 = chr(j)) != c1) { + B[c1] = b - SA; + b = SA + B[c1 = c0]; + } + *b++ = ((0 < j) && (chr(j - 1) < c1)) ? ~j : j; + } + } + /* compute SAs */ + if (C == B) getCounts(T, C, n, k, cs); + getBuckets(C, B, k, 1); /* find ends of buckets */ + for (i = n - 1, b = SA + B[c1 = 0]; 0 <= i; --i) { + if (0 < (j = SA[i])) { + --j; + if ((c0 = chr(j)) != c1) { + B[c1] = b - SA; + b = SA + B[c1 = c0]; + } + *--b = ((j == 0) || (chr(j - 1) > c1)) ? 
~j : j; + } else SA[i] = ~j; + } +} + +/* + * find the suffix array SA of T[0..n-1] in {0..k-1}^n use a working + * space (excluding T and SA) of at most 2n+O(1) for a constant alphabet + */ +static int sais_main(const unsigned char *T, int *SA, int fs, int n, int k, int cs) +{ + int *C, *B, *RA; + int i, j, c, m, p, q, plen, qlen, name; + int c0, c1; + int diff; + + /* stage 1: reduce the problem by at least 1/2 sort all the + * S-substrings */ + if (k <= fs) { + C = SA + n; + B = (k <= (fs - k)) ? C + k : C; + } else if ((C = B = (int *) malloc(k * sizeof(int))) == NULL) return -2; + getCounts(T, C, n, k, cs); + getBuckets(C, B, k, 1); /* find ends of buckets */ + for (i = 0; i < n; ++i) SA[i] = 0; + for (i = n - 2, c = 0, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) { + if ((c0 = chr(i)) < (c1 + c)) c = 1; + else if (c != 0) SA[--B[c1]] = i + 1, c = 0; + } + induceSA(T, SA, C, B, n, k, cs); + if (fs < k) free(C); + /* compact all the sorted substrings into the first m items of SA + * 2*m must be not larger than n (proveable) */ + for (i = 0, m = 0; i < n; ++i) { + p = SA[i]; + if ((0 < p) && (chr(p - 1) > (c0 = chr(p)))) { + for (j = p + 1; (j < n) && (c0 == (c1 = chr(j))); ++j); + if ((j < n) && (c0 < c1)) SA[m++] = p; + } + } + for (i = m; i < n; ++i) SA[i] = 0; /* init the name array buffer */ + /* store the length of all substrings */ + for (i = n - 2, j = n, c = 0, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) { + if ((c0 = chr(i)) < (c1 + c)) c = 1; + else if (c != 0) { + SA[m + ((i + 1) >> 1)] = j - i - 1; + j = i + 1; + c = 0; + } + } + /* find the lexicographic names of all substrings */ + for (i = 0, name = 0, q = n, qlen = 0; i < m; ++i) { + p = SA[i], plen = SA[m + (p >> 1)], diff = 1; + if (plen == qlen) { + for (j = 0; (j < plen) && (chr(p + j) == chr(q + j)); j++); + if (j == plen) diff = 0; + } + if (diff != 0) ++name, q = p, qlen = plen; + SA[m + (p >> 1)] = name; + } + + /* stage 2: solve the reduced problem recurse if names are not yet + * unique */ + if 
(name < m) { + RA = SA + n + fs - m; + for (i = n - 1, j = m - 1; m <= i; --i) { + if (SA[i] != 0) RA[j--] = SA[i] - 1; + } + if (sais_main((unsigned char *) RA, SA, fs + n - m * 2, m, name, sizeof(int)) != 0) return -2; + for (i = n - 2, j = m - 1, c = 0, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) { + if ((c0 = chr(i)) < (c1 + c)) c = 1; + else if (c != 0) RA[j--] = i + 1, c = 0; /* get p1 */ + } + for (i = 0; i < m; ++i) SA[i] = RA[SA[i]]; /* get index */ + } + /* stage 3: induce the result for the original problem */ + if (k <= fs) { + C = SA + n; + B = (k <= (fs - k)) ? C + k : C; + } else if ((C = B = (int *) malloc(k * sizeof(int))) == NULL) return -2; + /* put all left-most S characters into their buckets */ + getCounts(T, C, n, k, cs); + getBuckets(C, B, k, 1); /* find ends of buckets */ + for (i = m; i < n; ++i) SA[i] = 0; /* init SA[m..n-1] */ + for (i = m - 1; 0 <= i; --i) { + j = SA[i], SA[i] = 0; + SA[--B[chr(j)]] = j; + } + induceSA(T, SA, C, B, n, k, cs); + if (fs < k) free(C); + return 0; +} + +/** + * Constructs the suffix array of a given string. + * @param T[0..n-1] The input string. + * @param SA[0..n] The output array of suffixes. + * @param n The length of the given string. + * @return 0 if no error occurred + */ +int is_sa(const ubyte_t *T, int *SA, int n) +{ + if ((T == NULL) || (SA == NULL) || (n < 0)) return -1; + SA[0] = n; + if (n <= 1) { + if (n == 1) SA[1] = 0; + return 0; + } + return sais_main(T, SA+1, 0, n, 256, 1); +} + +/** + * Constructs the burrows-wheeler transformed string of a given string. + * @param T[0..n-1] The input string. + * @param n The length of the given string. + * @return The primary index if no error occurred, -1 or -2 otherwise. 
+ */ +int is_bwt(ubyte_t *T, int n) +{ + int *SA, i, primary = 0; + SA = (int*)calloc(n+1, sizeof(int)); + is_sa(T, SA, n); + + for (i = 0; i <= n; ++i) { + if (SA[i] == 0) primary = i; + else SA[i] = T[SA[i] - 1]; + } + for (i = 0; i < primary; ++i) T[i] = SA[i]; + for (; i < n; ++i) T[i] = SA[i + 1]; + free(SA); + return primary; +} diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/khash.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/khash.h Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,506 @@ +/* The MIT License + + Copyright (c) 2008, 2009 by attractor + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
+*/ + +/* + An example: + +#include "khash.h" +KHASH_MAP_INIT_INT(32, char) +int main() { + int ret, is_missing; + khiter_t k; + khash_t(32) *h = kh_init(32); + k = kh_put(32, h, 5, &ret); + if (!ret) kh_del(32, h, k); + kh_value(h, k) = 10; + k = kh_get(32, h, 10); + is_missing = (k == kh_end(h)); + k = kh_get(32, h, 5); + kh_del(32, h, k); + for (k = kh_begin(h); k != kh_end(h); ++k) + if (kh_exist(h, k)) kh_value(h, k) = 1; + kh_destroy(32, h); + return 0; +} +*/ + +/* + 2009-09-26 (0.2.4): + + * Improve portability + + 2008-09-19 (0.2.3): + + * Corrected the example + * Improved interfaces + + 2008-09-11 (0.2.2): + + * Improved speed a little in kh_put() + + 2008-09-10 (0.2.1): + + * Added kh_clear() + * Fixed a compiling error + + 2008-09-02 (0.2.0): + + * Changed to token concatenation which increases flexibility. + + 2008-08-31 (0.1.2): + + * Fixed a bug in kh_get(), which has not been tested previously. + + 2008-08-31 (0.1.1): + + * Added destructor +*/ + + +#ifndef __AC_KHASH_H +#define __AC_KHASH_H + +/*! + @header + + Generic hash table library. 
+ + @copyright Heng Li + */ + +#define AC_VERSION_KHASH_H "0.2.4" + +#include +#include +#include + +/* compipler specific configuration */ + +#if UINT_MAX == 0xffffffffu +typedef unsigned int khint32_t; +#elif ULONG_MAX == 0xffffffffu +typedef unsigned long khint32_t; +#endif + +#if ULONG_MAX == ULLONG_MAX +typedef unsigned long khint64_t; +#else +typedef unsigned long long khint64_t; +#endif + +#ifdef _MSC_VER +#define inline __inline +#endif + +typedef khint32_t khint_t; +typedef khint_t khiter_t; + +#define __ac_HASH_PRIME_SIZE 32 +static const khint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] = +{ + 0ul, 3ul, 11ul, 23ul, 53ul, + 97ul, 193ul, 389ul, 769ul, 1543ul, + 3079ul, 6151ul, 12289ul, 24593ul, 49157ul, + 98317ul, 196613ul, 393241ul, 786433ul, 1572869ul, + 3145739ul, 6291469ul, 12582917ul, 25165843ul, 50331653ul, + 100663319ul, 201326611ul, 402653189ul, 805306457ul, 1610612741ul, + 3221225473ul, 4294967291ul +}; + +#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2) +#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1) +#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3) +#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1))) +#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1))) +#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1))) +#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1)) + +static const double __ac_HASH_UPPER = 0.77; + +#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + typedef struct { \ + khint_t n_buckets, size, n_occupied, upper_bound; \ + khint32_t *flags; \ + khkey_t *keys; \ + khval_t *vals; \ + } kh_##name##_t; \ + static inline kh_##name##_t *kh_init_##name() { \ + return (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t)); \ + } \ + static inline void kh_destroy_##name(kh_##name##_t *h) \ + { \ + if (h) { \ + free(h->keys); free(h->flags); \ + free(h->vals); \ + free(h); \ + } \ + } \ + static 
inline void kh_clear_##name(kh_##name##_t *h) \ + { \ + if (h && h->flags) { \ + memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(khint32_t)); \ + h->size = h->n_occupied = 0; \ + } \ + } \ + static inline khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ + { \ + if (h->n_buckets) { \ + khint_t inc, k, i, last; \ + k = __hash_func(key); i = k % h->n_buckets; \ + inc = 1 + k % (h->n_buckets - 1); last = i; \ + while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ + if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \ + else i += inc; \ + if (i == last) return h->n_buckets; \ + } \ + return __ac_iseither(h->flags, i)? h->n_buckets : i; \ + } else return 0; \ + } \ + static inline void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ + { \ + khint32_t *new_flags = 0; \ + khint_t j = 1; \ + { \ + khint_t t = __ac_HASH_PRIME_SIZE - 1; \ + while (__ac_prime_list[t] > new_n_buckets) --t; \ + new_n_buckets = __ac_prime_list[t+1]; \ + if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; \ + else { \ + new_flags = (khint32_t*)malloc(((new_n_buckets>>4) + 1) * sizeof(khint32_t)); \ + memset(new_flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(khint32_t)); \ + if (h->n_buckets < new_n_buckets) { \ + h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (kh_is_map) \ + h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ + } \ + } \ + } \ + if (j) { \ + for (j = 0; j != h->n_buckets; ++j) { \ + if (__ac_iseither(h->flags, j) == 0) { \ + khkey_t key = h->keys[j]; \ + khval_t val; \ + if (kh_is_map) val = h->vals[j]; \ + __ac_set_isdel_true(h->flags, j); \ + while (1) { \ + khint_t inc, k, i; \ + k = __hash_func(key); \ + i = k % new_n_buckets; \ + inc = 1 + k % (new_n_buckets - 1); \ + while (!__ac_isempty(new_flags, i)) { \ + if (i + inc >= new_n_buckets) i = i + inc - new_n_buckets; \ + else i += inc; \ + } \ + 
__ac_set_isempty_false(new_flags, i); \ + if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { \ + { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ + if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \ + __ac_set_isdel_true(h->flags, i); \ + } else { \ + h->keys[i] = key; \ + if (kh_is_map) h->vals[i] = val; \ + break; \ + } \ + } \ + } \ + } \ + if (h->n_buckets > new_n_buckets) { \ + h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (kh_is_map) \ + h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ + } \ + free(h->flags); \ + h->flags = new_flags; \ + h->n_buckets = new_n_buckets; \ + h->n_occupied = h->size; \ + h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ + } \ + } \ + static inline khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ + { \ + khint_t x; \ + if (h->n_occupied >= h->upper_bound) { \ + if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); \ + else kh_resize_##name(h, h->n_buckets + 1); \ + } \ + { \ + khint_t inc, k, i, site, last; \ + x = site = h->n_buckets; k = __hash_func(key); i = k % h->n_buckets; \ + if (__ac_isempty(h->flags, i)) x = i; \ + else { \ + inc = 1 + k % (h->n_buckets - 1); last = i; \ + while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ + if (__ac_isdel(h->flags, i)) site = i; \ + if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \ + else i += inc; \ + if (i == last) { x = site; break; } \ + } \ + if (x == h->n_buckets) { \ + if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \ + else x = i; \ + } \ + } \ + } \ + if (__ac_isempty(h->flags, x)) { \ + h->keys[x] = key; \ + __ac_set_isboth_false(h->flags, x); \ + ++h->size; ++h->n_occupied; \ + *ret = 1; \ + } else if (__ac_isdel(h->flags, x)) { \ + h->keys[x] = key; \ + __ac_set_isboth_false(h->flags, x); \ + ++h->size; \ + *ret = 2; \ + } else *ret = 0; \ + return 
x; \ + } \ + static inline void kh_del_##name(kh_##name##_t *h, khint_t x) \ + { \ + if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ + __ac_set_isdel_true(h->flags, x); \ + --h->size; \ + } \ + } + +/* --- BEGIN OF HASH FUNCTIONS --- */ + +/*! @function + @abstract Integer hash function + @param key The integer [khint32_t] + @return The hash value [khint_t] + */ +#define kh_int_hash_func(key) (khint32_t)(key) +/*! @function + @abstract Integer comparison function + */ +#define kh_int_hash_equal(a, b) ((a) == (b)) +/*! @function + @abstract 64-bit integer hash function + @param key The integer [khint64_t] + @return The hash value [khint_t] + */ +#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11) +/*! @function + @abstract 64-bit integer comparison function + */ +#define kh_int64_hash_equal(a, b) ((a) == (b)) +/*! @function + @abstract const char* hash function + @param s Pointer to a null terminated string + @return The hash value + */ +static inline khint_t __ac_X31_hash_string(const char *s) +{ + khint_t h = *s; + if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s; + return h; +} +/*! @function + @abstract Another interface to const char* hash function + @param key Pointer to a null terminated string [const char*] + @return The hash value [khint_t] + */ +#define kh_str_hash_func(key) __ac_X31_hash_string(key) +/*! @function + @abstract Const char* comparison function + */ +#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0) + +/* --- END OF HASH FUNCTIONS --- */ + +/* Other necessary macros... */ + +/*! + @abstract Type of the hash table. + @param name Name of the hash table [symbol] + */ +#define khash_t(name) kh_##name##_t + +/*! @function + @abstract Initiate a hash table. + @param name Name of the hash table [symbol] + @return Pointer to the hash table [khash_t(name)*] + */ +#define kh_init(name) kh_init_##name() + +/*! @function + @abstract Destroy a hash table. 
+ @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + */ +#define kh_destroy(name, h) kh_destroy_##name(h) + +/*! @function + @abstract Reset a hash table without deallocating memory. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + */ +#define kh_clear(name, h) kh_clear_##name(h) + +/*! @function + @abstract Resize a hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param s New size [khint_t] + */ +#define kh_resize(name, h, s) kh_resize_##name(h, s) + +/*! @function + @abstract Insert a key to the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Key [type of keys] + @param r Extra return code: 0 if the key is present in the hash table; + 1 if the bucket is empty (never used); 2 if the element in + the bucket has been deleted [int*] + @return Iterator to the inserted element [khint_t] + */ +#define kh_put(name, h, k, r) kh_put_##name(h, k, r) + +/*! @function + @abstract Retrieve a key from the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Key [type of keys] + @return Iterator to the found element, or kh_end(h) is the element is absent [khint_t] + */ +#define kh_get(name, h, k) kh_get_##name(h, k) + +/*! @function + @abstract Remove a key from the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Iterator to the element to be deleted [khint_t] + */ +#define kh_del(name, h, k) kh_del_##name(h, k) + + +/*! @function + @abstract Test whether a bucket contains data. + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return 1 if containing data; 0 otherwise [int] + */ +#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x))) + +/*! 
@function + @abstract Get key given an iterator + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return Key [type of keys] + */ +#define kh_key(h, x) ((h)->keys[x]) + +/*! @function + @abstract Get value given an iterator + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return Value [type of values] + @discussion For hash sets, calling this results in segfault. + */ +#define kh_val(h, x) ((h)->vals[x]) + +/*! @function + @abstract Alias of kh_val() + */ +#define kh_value(h, x) ((h)->vals[x]) + +/*! @function + @abstract Get the start iterator + @param h Pointer to the hash table [khash_t(name)*] + @return The start iterator [khint_t] + */ +#define kh_begin(h) (khint_t)(0) + +/*! @function + @abstract Get the end iterator + @param h Pointer to the hash table [khash_t(name)*] + @return The end iterator [khint_t] + */ +#define kh_end(h) ((h)->n_buckets) + +/*! @function + @abstract Get the number of elements in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @return Number of elements in the hash table [khint_t] + */ +#define kh_size(h) ((h)->size) + +/*! @function + @abstract Get the number of buckets in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @return Number of buckets in the hash table [khint_t] + */ +#define kh_n_buckets(h) ((h)->n_buckets) + +/* More conenient interfaces */ + +/*! @function + @abstract Instantiate a hash set containing integer keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_INT(name) \ + KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing integer keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_INT(name, khval_t) \ + KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) + +/*! 
@function + @abstract Instantiate a hash map containing 64-bit integer keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_INT64(name) \ + KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing 64-bit integer keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_INT64(name, khval_t) \ + KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) + +typedef const char *kh_cstr_t; +/*! @function + @abstract Instantiate a hash map containing const char* keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_STR(name) \ + KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing const char* keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_STR(name, khval_t) \ + KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal) + +#endif /* __AC_KHASH_H */ diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/kseq.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/kseq.h Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,208 @@ +/* The MIT License + + Copyright (c) 2008, by Heng Li + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +#ifndef AC_KSEQ_H +#define AC_KSEQ_H + +#include +#include +#include + +#define __KS_TYPE(type_t) \ + typedef struct __kstream_t { \ + char *buf; \ + int begin, end, is_eof; \ + type_t f; \ + } kstream_t; + +#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) +#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) + +#define __KS_BASIC(type_t, __bufsize) \ + static inline kstream_t *ks_init(type_t f) \ + { \ + kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ + ks->f = f; \ + ks->buf = (char*)malloc(__bufsize); \ + return ks; \ + } \ + static inline void ks_destroy(kstream_t *ks) \ + { \ + if (ks) { \ + free(ks->buf); \ + free(ks); \ + } \ + } + +#define __KS_GETC(__read, __bufsize) \ + static inline int ks_getc(kstream_t *ks) \ + { \ + if (ks->is_eof && ks->begin >= ks->end) return -1; \ + if (ks->begin >= ks->end) { \ + ks->begin = 0; \ + ks->end = __read(ks->f, ks->buf, __bufsize); \ + if (ks->end < __bufsize) ks->is_eof = 1; \ + if (ks->end == 0) return -1; \ + } \ + return (int)ks->buf[ks->begin++]; \ + } + +#ifndef KSTRING_T +#define KSTRING_T kstring_t +typedef struct __kstring_t { + size_t l, m; + char *s; +} kstring_t; +#endif + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +#define __KS_GETUNTIL(__read, __bufsize) \ + static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ + { \ + if (dret) *dret = 0; \ + str->l = 0; \ + if (ks->begin 
>= ks->end && ks->is_eof) return -1; \ + for (;;) { \ + int i; \ + if (ks->begin >= ks->end) { \ + if (!ks->is_eof) { \ + ks->begin = 0; \ + ks->end = __read(ks->f, ks->buf, __bufsize); \ + if (ks->end < __bufsize) ks->is_eof = 1; \ + if (ks->end == 0) break; \ + } else break; \ + } \ + if (delimiter) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (ks->buf[i] == delimiter) break; \ + } else { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (isspace(ks->buf[i])) break; \ + } \ + if (str->m - str->l < i - ks->begin + 1) { \ + str->m = str->l + (i - ks->begin) + 1; \ + kroundup32(str->m); \ + str->s = (char*)realloc(str->s, str->m); \ + } \ + memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ + str->l = str->l + (i - ks->begin); \ + ks->begin = i + 1; \ + if (i < ks->end) { \ + if (dret) *dret = ks->buf[i]; \ + break; \ + } \ + } \ + str->s[str->l] = '\0'; \ + return str->l; \ + } + +#define KSTREAM_INIT(type_t, __read, __bufsize) \ + __KS_TYPE(type_t) \ + __KS_BASIC(type_t, __bufsize) \ + __KS_GETC(__read, __bufsize) \ + __KS_GETUNTIL(__read, __bufsize) + +#define __KSEQ_BASIC(type_t) \ + static inline kseq_t *kseq_init(type_t fd) \ + { \ + kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \ + s->f = ks_init(fd); \ + return s; \ + } \ + static inline void kseq_rewind(kseq_t *ks) \ + { \ + ks->last_char = 0; \ + ks->f->is_eof = ks->f->begin = ks->f->end = 0; \ + } \ + static inline void kseq_destroy(kseq_t *ks) \ + { \ + if (!ks) return; \ + free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \ + ks_destroy(ks->f); \ + free(ks); \ + } + +/* Return value: + >=0 length of the sequence (normal) + -1 end-of-file + -2 truncated quality string + */ +#define __KSEQ_READ \ + static int kseq_read(kseq_t *seq) \ + { \ + int c; \ + kstream_t *ks = seq->f; \ + if (seq->last_char == 0) { /* then jump to the next header line */ \ + while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ + if (c == -1) return -1; /* end of file */ \ + 
seq->last_char = c; \ + } /* the first header char has been read */ \ + seq->comment.l = seq->seq.l = seq->qual.l = 0; \ + if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; \ + if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); \ + while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ + if (isgraph(c)) { /* printable non-space character */ \ + if (seq->seq.l + 1 >= seq->seq.m) { /* double the memory */ \ + seq->seq.m = seq->seq.l + 2; \ + kroundup32(seq->seq.m); /* rounded to next closest 2^k */ \ + seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ + } \ + seq->seq.s[seq->seq.l++] = (char)c; \ + } \ + } \ + if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ + seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ + if (c != '+') return seq->seq.l; /* FASTA */ \ + if (seq->qual.m < seq->seq.m) { /* allocate enough memory */ \ + seq->qual.m = seq->seq.m; \ + seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ + } \ + while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ + if (c == -1) return -2; /* we should not stop here */ \ + while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l) \ + if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (unsigned char)c; \ + seq->qual.s[seq->qual.l] = 0; /* null terminated string */ \ + seq->last_char = 0; /* we have not come to the next header line */ \ + if (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq string */ \ + return seq->seq.l; \ + } + +#define __KSEQ_TYPE(type_t) \ + typedef struct { \ + kstring_t name, comment, seq, qual; \ + int last_char; \ + kstream_t *f; \ + } kseq_t; + +#define KSEQ_INIT(type_t, __read) \ + KSTREAM_INIT(type_t, __read, 4096) \ + __KSEQ_TYPE(type_t) \ + __KSEQ_BASIC(type_t) \ + __KSEQ_READ + +#endif diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/ksort.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/ksort.h Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 
+1,269 @@ +/* The MIT License + + Copyright (c) 2008, by Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* + 2008-11-16 (0.1.4): + + * Fixed a bug in introsort() that happens in rare cases. + + 2008-11-05 (0.1.3): + + * Fixed a bug in introsort() for complex comparisons. + + * Fixed a bug in mergesort(). The previous version is not stable. + + 2008-09-15 (0.1.2): + + * Accelerated introsort. On my Mac (not on another Linux machine), + my implementation is as fast as std::sort on random input. + + * Added combsort and in introsort, switch to combsort if the + recursion is too deep. 
+ + 2008-09-13 (0.1.1): + + * Added k-small algorithm + + 2008-09-05 (0.1.0): + + * Initial version + +*/ + +#ifndef AC_KSORT_H +#define AC_KSORT_H + +#include +#include + +typedef struct { + void *left, *right; + int depth; +} ks_isort_stack_t; + +#define KSORT_SWAP(type_t, a, b) { register type_t t=(a); (a)=(b); (b)=t; } + +#define KSORT_INIT(name, type_t, __sort_lt) \ + void ks_mergesort_##name(size_t n, type_t array[], type_t temp[]) \ + { \ + type_t *a2[2], *a, *b; \ + int curr, shift; \ + \ + a2[0] = array; \ + a2[1] = temp? temp : (type_t*)malloc(sizeof(type_t) * n); \ + for (curr = 0, shift = 0; (1ul<> 1) - 1; i != (size_t)(-1); --i) \ + ks_heapadjust_##name(i, lsize, l); \ + } \ + void ks_heapsort_##name(size_t lsize, type_t l[]) \ + { \ + size_t i; \ + for (i = lsize - 1; i > 0; --i) { \ + type_t tmp; \ + tmp = *l; *l = l[i]; l[i] = tmp; ks_heapadjust_##name(0, i, l); \ + } \ + } \ + inline void __ks_insertsort_##name(type_t *s, type_t *t) \ + { \ + type_t *i, *j, swap_tmp; \ + for (i = s + 1; i < t; ++i) \ + for (j = i; j > s && __sort_lt(*j, *(j-1)); --j) { \ + swap_tmp = *j; *j = *(j-1); *(j-1) = swap_tmp; \ + } \ + } \ + void ks_combsort_##name(size_t n, type_t a[]) \ + { \ + const double shrink_factor = 1.2473309501039786540366528676643; \ + int do_swap; \ + size_t gap = n; \ + type_t tmp, *i, *j; \ + do { \ + if (gap > 2) { \ + gap = (size_t)(gap / shrink_factor); \ + if (gap == 9 || gap == 10) gap = 11; \ + } \ + do_swap = 0; \ + for (i = a; i < a + n - gap; ++i) { \ + j = i + gap; \ + if (__sort_lt(*j, *i)) { \ + tmp = *i; *i = *j; *j = tmp; \ + do_swap = 1; \ + } \ + } \ + } while (do_swap || gap > 2); \ + if (gap != 1) __ks_insertsort_##name(a, a + n); \ + } \ + void ks_introsort_##name(size_t n, type_t a[]) \ + { \ + int d; \ + ks_isort_stack_t *top, *stack; \ + type_t rp, swap_tmp; \ + type_t *s, *t, *i, *j, *k; \ + \ + if (n < 1) return; \ + else if (n == 2) { \ + if (__sort_lt(a[1], a[0])) { swap_tmp = a[0]; a[0] = a[1]; a[1] = swap_tmp; } \ 
+ return; \ + } \ + for (d = 2; 1ul<>1) + 1; \ + if (__sort_lt(*k, *i)) { \ + if (__sort_lt(*k, *j)) k = j; \ + } else k = __sort_lt(*j, *i)? i : j; \ + rp = *k; \ + if (k != t) { swap_tmp = *k; *k = *t; *t = swap_tmp; } \ + for (;;) { \ + do ++i; while (__sort_lt(*i, rp)); \ + do --j; while (i <= j && __sort_lt(rp, *j)); \ + if (j <= i) break; \ + swap_tmp = *i; *i = *j; *j = swap_tmp; \ + } \ + swap_tmp = *i; *i = *t; *t = swap_tmp; \ + if (i-s > t-i) { \ + if (i-s > 16) { top->left = s; top->right = i-1; top->depth = d; ++top; } \ + s = t-i > 16? i+1 : t; \ + } else { \ + if (t-i > 16) { top->left = i+1; top->right = t; top->depth = d; ++top; } \ + t = i-s > 16? i-1 : s; \ + } \ + } else { \ + if (top == stack) { \ + free(stack); \ + __ks_insertsort_##name(a, a+n); \ + return; \ + } else { --top; s = (type_t*)top->left; t = (type_t*)top->right; d = top->depth; } \ + } \ + } \ + } \ + /* This function is adapted from: http://ndevilla.free.fr/median/ */ \ + /* 0 <= kk < n */ \ + type_t ks_ksmall_##name(size_t n, type_t arr[], size_t kk) \ + { \ + type_t *low, *high, *k, *ll, *hh, *mid; \ + low = arr; high = arr + n - 1; k = arr + kk; \ + for (;;) { \ + if (high <= low) return *k; \ + if (high == low + 1) { \ + if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \ + return *k; \ + } \ + mid = low + (high - low) / 2; \ + if (__sort_lt(*high, *mid)) KSORT_SWAP(type_t, *mid, *high); \ + if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \ + if (__sort_lt(*low, *mid)) KSORT_SWAP(type_t, *mid, *low); \ + KSORT_SWAP(type_t, *mid, *(low+1)); \ + ll = low + 1; hh = high; \ + for (;;) { \ + do ++ll; while (__sort_lt(*ll, *low)); \ + do --hh; while (__sort_lt(*low, *hh)); \ + if (hh < ll) break; \ + KSORT_SWAP(type_t, *ll, *hh); \ + } \ + KSORT_SWAP(type_t, *low, *hh); \ + if (hh <= k) low = ll; \ + if (hh >= k) high = hh - 1; \ + } \ + } + +#define ks_mergesort(name, n, a, t) ks_mergesort_##name(n, a, t) +#define ks_introsort(name, n, a) 
ks_introsort_##name(n, a) +#define ks_combsort(name, n, a) ks_combsort_##name(n, a) +#define ks_heapsort(name, n, a) ks_heapsort_##name(n, a) +#define ks_heapmake(name, n, a) ks_heapmake_##name(n, a) +#define ks_heapadjust(name, i, n, a) ks_heapadjust_##name(i, n, a) +#define ks_ksmall(name, n, a, k) ks_ksmall_##name(n, a, k) + +#define ks_lt_generic(a, b) ((a) < (b)) +#define ks_lt_str(a, b) (strcmp((a), (b)) < 0) + +typedef const char *ksstr_t; + +#define KSORT_INIT_GENERIC(type_t) KSORT_INIT(type_t, type_t, ks_lt_generic) +#define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str) + +#endif diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/kstring.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/kstring.c Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,35 @@ +#include +#include +#include "kstring.h" + +int ksprintf(kstring_t *s, const char *fmt, ...) +{ + va_list ap; + int l; + va_start(ap, fmt); + l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); + va_end(ap); + if (l + 1 > s->m - s->l) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + va_start(ap, fmt); + l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); + } + va_end(ap); + s->l += l; + return l; +} + +#ifdef KSTRING_MAIN +#include +int main() +{ + kstring_t *s; + s = (kstring_t*)calloc(1, sizeof(kstring_t)); + ksprintf(s, "abcdefg: %d", 100); + printf("%s\n", s->s); + free(s); + return 0; +} +#endif diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/kstring.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/kstring.h Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,46 @@ +#ifndef KSTRING_H +#define KSTRING_H + +#include +#include + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +#ifndef KSTRING_T +#define KSTRING_T kstring_t +typedef struct __kstring_t { + size_t l, m; + char *s; +} kstring_t; +#endif + +static inline int kputs(const char *p, kstring_t *s) +{ + int l = strlen(p); + if 
(s->l + l + 1 >= s->m) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } + strcpy(s->s + s->l, p); + s->l += l; + return l; +} + +static inline int kputc(int c, kstring_t *s) +{ + if (s->l + 1 >= s->m) { + s->m = s->l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } + s->s[s->l++] = c; + s->s[s->l] = 0; + return c; +} + +int ksprintf(kstring_t *s, const char *fmt, ...); + +#endif diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/ksw.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/ksw.c Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,401 @@ +/* The MIT License + + Copyright (c) 2011 by Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
+*/ + +#ifndef _NO_SSE2 +#include +#include +#include +#include "ksw.h" + +#ifdef __GNUC__ +#define LIKELY(x) __builtin_expect((x),1) +#define UNLIKELY(x) __builtin_expect((x),0) +#else +#define LIKELY(x) (x) +#define UNLIKELY(x) (x) +#endif + +struct _ksw_query_t { + int qlen, slen; + uint8_t shift, mdiff, max, size; + __m128i *qp, *H0, *H1, *E, *Hmax; +}; + +ksw_query_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const int8_t *mat) +{ + ksw_query_t *q; + int slen, a, tmp, p; + + size = size > 1? 2 : 1; + p = 8 * (3 - size); // # values per __m128i + slen = (qlen + p - 1) / p; // segmented length + q = malloc(sizeof(ksw_query_t) + 256 + 16 * slen * (m + 4)); // a single block of memory + q->qp = (__m128i*)(((size_t)q + sizeof(ksw_query_t) + 15) >> 4 << 4); // align memory + q->H0 = q->qp + slen * m; + q->H1 = q->H0 + slen; + q->E = q->H1 + slen; + q->Hmax = q->E + slen; + q->slen = slen; q->qlen = qlen; q->size = size; + // compute shift + tmp = m * m; + for (a = 0, q->shift = 127, q->mdiff = 0; a < tmp; ++a) { // find the minimum and maximum score + if (mat[a] < (int8_t)q->shift) q->shift = mat[a]; + if (mat[a] > (int8_t)q->mdiff) q->mdiff = mat[a]; + } + q->max = q->mdiff; + q->shift = 256 - q->shift; // NB: q->shift is uint8_t + q->mdiff += q->shift; // this is the difference between the min and max scores + // An example: p=8, qlen=19, slen=3 and segmentation: + // {{0,3,6,9,12,15,18,-1},{1,4,7,10,13,16,-1,-1},{2,5,8,11,14,17,-1,-1}} + if (size == 1) { + int8_t *t = (int8_t*)q->qp; + for (a = 0; a < m; ++a) { + int i, k, nlen = slen * p; + const int8_t *ma = mat + a * m; + for (i = 0; i < slen; ++i) + for (k = i; k < nlen; k += slen) // p iterations + *t++ = (k >= qlen? 0 : ma[query[k]]) + q->shift; + } + } else { + int16_t *t = (int16_t*)q->qp; + for (a = 0; a < m; ++a) { + int i, k, nlen = slen * p; + const int8_t *ma = mat + a * m; + for (i = 0; i < slen; ++i) + for (k = i; k < nlen; k += slen) // p iterations + *t++ = (k >= qlen? 
0 : ma[query[k]]); + } + } + return q; +} + +int ksw_sse2_16(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) // the first gap costs -(_o+_e) +{ + int slen, i, m_b, n_b, te = -1, gmax = 0; + uint64_t *b; + __m128i zero, gapoe, gape, shift, *H0, *H1, *E, *Hmax; + +#define __max_16(ret, xx) do { \ + (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 8)); \ + (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 4)); \ + (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 2)); \ + (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 1)); \ + (ret) = _mm_extract_epi16((xx), 0) & 0x00ff; \ + } while (0) + + // initialization + m_b = n_b = 0; b = 0; + zero = _mm_set1_epi32(0); + gapoe = _mm_set1_epi8(a->gapo + a->gape); + gape = _mm_set1_epi8(a->gape); + shift = _mm_set1_epi8(q->shift); + H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax; + slen = q->slen; + for (i = 0; i < slen; ++i) { + _mm_store_si128(E + i, zero); + _mm_store_si128(H0 + i, zero); + _mm_store_si128(Hmax + i, zero); + } + // the core loop + for (i = 0; i < tlen; ++i) { + int j, k, cmp, imax; + __m128i e, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector + h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example + h = _mm_slli_si128(h, 1); // h=H(i-1,-1); << instead of >> because x64 is little-endian + for (j = 0; LIKELY(j < slen); ++j) { + /* SW cells are computed in the following order: + * H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)} + * E(i+1,j) = max{H(i,j)-q, E(i,j)-r} + * F(i,j+1) = max{H(i,j)-q, F(i,j)-r} + */ + // compute H'(i,j); note that at the beginning, h=H'(i-1,j-1) + h = _mm_adds_epu8(h, _mm_load_si128(S + j)); + h = _mm_subs_epu8(h, shift); // h=H'(i-1,j-1)+S(i,j) + e = _mm_load_si128(E + j); // e=E'(i,j) + h = _mm_max_epu8(h, e); + h = _mm_max_epu8(h, f); // h=H'(i,j) + max = _mm_max_epu8(max, h); // set max + _mm_store_si128(H1 + j, h); // save to H'(i,j) + // now compute E'(i+1,j) + h = _mm_subs_epu8(h, gapoe); // h=H'(i,j)-gapo 
+ e = _mm_subs_epu8(e, gape); // e=E'(i,j)-gape + e = _mm_max_epu8(e, h); // e=E'(i+1,j) + _mm_store_si128(E + j, e); // save to E'(i+1,j) + // now compute F'(i,j+1) + f = _mm_subs_epu8(f, gape); + f = _mm_max_epu8(f, h); + // get H'(i-1,j) and prepare for the next j + h = _mm_load_si128(H0 + j); // h=H'(i-1,j) + } + // NB: we do not need to set E(i,j) as we disallow adjecent insertion and then deletion + for (k = 0; LIKELY(k < 16); ++k) { // this block mimics SWPS3; NB: H(i,j) updated in the lazy-F loop cannot exceed max + f = _mm_slli_si128(f, 1); + for (j = 0; LIKELY(j < slen); ++j) { + h = _mm_load_si128(H1 + j); + h = _mm_max_epu8(h, f); // h=H'(i,j) + _mm_store_si128(H1 + j, h); + h = _mm_subs_epu8(h, gapoe); + f = _mm_subs_epu8(f, gape); + cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_subs_epu8(f, h), zero)); + if (UNLIKELY(cmp == 0xffff)) goto end_loop16; + } + } +end_loop16: + //int k;for (k=0;k<16;++k)printf("%d ", ((uint8_t*)&max)[k]);printf("\n"); + __max_16(imax, max); // imax is the maximum number in max + if (imax >= a->T) { // write the b array; this condition adds branching unfornately + if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) { // then append + if (n_b == m_b) { + m_b = m_b? 
m_b<<1 : 8; + b = realloc(b, 8 * m_b); + } + b[n_b++] = (uint64_t)imax<<32 | i; + } else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last + } + if (imax > gmax) { + gmax = imax; te = i; // te is the end position on the target + for (j = 0; LIKELY(j < slen); ++j) // keep the H1 vector + _mm_store_si128(Hmax + j, _mm_load_si128(H1 + j)); + if (gmax + q->shift >= 255) break; + } + S = H1; H1 = H0; H0 = S; // swap H0 and H1 + } + a->score = gmax; a->te = te; + { // get a->qe, the end of query match; find the 2nd best score + int max = -1, low, high, qlen = slen * 16; + uint8_t *t = (uint8_t*)Hmax; + for (i = 0, a->qe = -1; i < qlen; ++i, ++t) + if ((int)*t > max) max = *t, a->qe = i / 16 + i % 16 * slen; + //printf("%d,%d\n", max, gmax); + i = (a->score + q->max - 1) / q->max; + low = te - i; high = te + i; + for (i = 0, a->score2 = 0; i < n_b; ++i) { + int e = (int32_t)b[i]; + if ((e < low || e > high) && b[i]>>32 > (uint32_t)a->score2) + a->score2 = b[i]>>32, a->te2 = e; + } + } + free(b); + return a->score + q->shift >= 255? 
255 : a->score; +} + +int ksw_sse2_8(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) // the first gap costs -(_o+_e) +{ + int slen, i, m_b, n_b, te = -1, gmax = 0; + uint64_t *b; + __m128i zero, gapoe, gape, *H0, *H1, *E, *Hmax; + +#define __max_8(ret, xx) do { \ + (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 8)); \ + (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 4)); \ + (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 2)); \ + (ret) = _mm_extract_epi16((xx), 0); \ + } while (0) + + // initialization + m_b = n_b = 0; b = 0; + zero = _mm_set1_epi32(0); + gapoe = _mm_set1_epi16(a->gapo + a->gape); + gape = _mm_set1_epi16(a->gape); + H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax; + slen = q->slen; + for (i = 0; i < slen; ++i) { + _mm_store_si128(E + i, zero); + _mm_store_si128(H0 + i, zero); + _mm_store_si128(Hmax + i, zero); + } + // the core loop + for (i = 0; i < tlen; ++i) { + int j, k, imax; + __m128i e, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector + h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example + h = _mm_slli_si128(h, 2); + for (j = 0; LIKELY(j < slen); ++j) { + h = _mm_adds_epi16(h, *S++); + e = _mm_load_si128(E + j); + h = _mm_max_epi16(h, e); + h = _mm_max_epi16(h, f); + max = _mm_max_epi16(max, h); + _mm_store_si128(H1 + j, h); + h = _mm_subs_epu16(h, gapoe); + e = _mm_subs_epu16(e, gape); + e = _mm_max_epi16(e, h); + _mm_store_si128(E + j, e); + f = _mm_subs_epu16(f, gape); + f = _mm_max_epi16(f, h); + h = _mm_load_si128(H0 + j); + } + for (k = 0; LIKELY(k < 16); ++k) { + f = _mm_slli_si128(f, 2); + for (j = 0; LIKELY(j < slen); ++j) { + h = _mm_load_si128(H1 + j); + h = _mm_max_epi16(h, f); + _mm_store_si128(H1 + j, h); + h = _mm_subs_epu16(h, gapoe); + f = _mm_subs_epu16(f, gape); + if(UNLIKELY(!_mm_movemask_epi8(_mm_cmpgt_epi16(f, h)))) goto end_loop8; + } + } +end_loop8: + __max_8(imax, max); + if (imax >= a->T) { + if (n_b == 0 || (int32_t)b[n_b-1] + 1 
!= i) { + if (n_b == m_b) { + m_b = m_b? m_b<<1 : 8; + b = realloc(b, 8 * m_b); + } + b[n_b++] = (uint64_t)imax<<32 | i; + } else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last + } + if (imax > gmax) { + gmax = imax; te = i; + for (j = 0; LIKELY(j < slen); ++j) + _mm_store_si128(Hmax + j, _mm_load_si128(H1 + j)); + } + S = H1; H1 = H0; H0 = S; + } + a->score = gmax; a->te = te; + { + int max = -1, low, high, qlen = slen * 8; + uint16_t *t = (uint16_t*)Hmax; + for (i = 0, a->qe = -1; i < qlen; ++i, ++t) + if ((int)*t > max) max = *t, a->qe = i / 8 + i % 8 * slen; + i = (a->score + q->max - 1) / q->max; + low = te - i; high = te + i; + for (i = 0, a->score2 = 0; i < n_b; ++i) { + int e = (int32_t)b[i]; + if ((e < low || e > high) && b[i]>>32 > (uint32_t)a->score2) + a->score2 = b[i]>>32, a->te2 = e; + } + } + free(b); + return a->score; +} + +int ksw_sse2(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) +{ + if (q->size == 1) return ksw_sse2_16(q, tlen, target, a); + else return ksw_sse2_8(q, tlen, target, a); +} + +/******************************************* + * Main function (not compiled by default) * + *******************************************/ + +#ifdef _KSW_MAIN + +#include +#include +#include +#include "kseq.h" +KSEQ_INIT(gzFile, gzread) + +unsigned char seq_nt4_table[256] = { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 +}; + +int main(int argc, char *argv[]) +{ + int c, sa = 1, sb = 3, i, j, k, forward_only = 0, size = 2; + int8_t mat[25]; + ksw_aux_t a; + gzFile fpt, fpq; + kseq_t *kst, *ksq; + // parse command line + a.gapo = 5; a.gape = 2; a.T = 10; + while ((c = getopt(argc, argv, "a:b:q:r:ft:s:")) >= 0) { + switch (c) { + case 'a': sa = atoi(optarg); break; + case 'b': sb = atoi(optarg); break; + case 'q': a.gapo = atoi(optarg); break; + case 'r': a.gape = atoi(optarg); break; + case 't': a.T = atoi(optarg); break; + case 'f': forward_only = 1; break; + case 's': size = atoi(optarg); break; + } + } + if (optind + 2 > argc) { + fprintf(stderr, "Usage: ksw [-s%d] [-a%d] [-b%d] [-q%d] [-r%d] \n", size, sa, sb, a.gapo, a.gape); + return 1; + } + // initialize scoring matrix + for (i = k = 0; i < 5; ++i) { + for (j = 0; j < 4; ++j) + mat[k++] = i == j? sa : -sb; + mat[k++] = 0; // ambiguous base + } + for (j = 0; j < 5; ++j) mat[k++] = 0; + // open file + fpt = gzopen(argv[optind], "r"); kst = kseq_init(fpt); + fpq = gzopen(argv[optind+1], "r"); ksq = kseq_init(fpq); + // all-pair alignment + while (kseq_read(ksq) > 0) { + ksw_query_t *q[2]; + for (i = 0; i < ksq->seq.l; ++i) ksq->seq.s[i] = seq_nt4_table[(int)ksq->seq.s[i]]; + q[0] = ksw_qinit(size, ksq->seq.l, (uint8_t*)ksq->seq.s, 5, mat); + if (!forward_only) { // reverse + for (i = 0; i < ksq->seq.l/2; ++i) { + int t = ksq->seq.s[i]; + ksq->seq.s[i] = ksq->seq.s[ksq->seq.l-1-i]; + ksq->seq.s[ksq->seq.l-1-i] = t; + } + for (i = 0; i < ksq->seq.l; ++i) + ksq->seq.s[i] = ksq->seq.s[i] == 4? 
4 : 3 - ksq->seq.s[i]; + q[1] = ksw_qinit(size, ksq->seq.l, (uint8_t*)ksq->seq.s, 5, mat); + } else q[1] = 0; + gzrewind(fpt); kseq_rewind(kst); + while (kseq_read(kst) > 0) { + int s; + for (i = 0; i < kst->seq.l; ++i) kst->seq.s[i] = seq_nt4_table[(int)kst->seq.s[i]]; + s = ksw_sse2(q[0], kst->seq.l, (uint8_t*)kst->seq.s, &a); + printf("%s\t%s\t+\t%d\t%d\t%d\n", ksq->name.s, kst->name.s, s, a.te+1, a.qe+1); + if (q[1]) { + s = ksw_sse2(q[1], kst->seq.l, (uint8_t*)kst->seq.s, &a); + printf("%s\t%s\t-\t%d\t%d\t%d\n", ksq->name.s, kst->name.s, s, a.te+1, a.qe+1); + } + } + free(q[0]); free(q[1]); + } + kseq_destroy(kst); gzclose(fpt); + kseq_destroy(ksq); gzclose(fpq); + return 0; +} +#endif // _KSW_MAIN +#endif // _NO_SSE2 diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/ksw.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/ksw.h Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,54 @@ +#ifndef __AC_KSW_H +#define __AC_KSW_H + +struct _ksw_query_t; +typedef struct _ksw_query_t ksw_query_t; + +typedef struct { + // input + unsigned gapo, gape; // the first gap costs gapo+gape + unsigned T; // threshold + // output + int score, te, qe, score2, te2; +} ksw_aux_t; + +#ifdef __cplusplus +extern "C" { +#endif + + /** + * Initialize the query data structure + * + * @param size Number of bytes used to store a score; valid valures are 1 or 2 + * @param qlen Length of the query sequence + * @param query Query sequence + * @param m Size of the alphabet + * @param mat Scoring matrix in a one-dimension array + * + * @return Query data structure + */ + ksw_query_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const int8_t *mat); // to free, simply call free() + + /** + * Compute the maximum local score for queries initialized with ksw_qinit(1, ...) + * + * @param q Query data structure returned by ksw_qinit(1, ...) 
+ * @param tlen Length of the target sequence + * @param target Target sequence + * @param a Auxiliary data structure (see ksw.h) + * + * @return The maximum local score; if the returned value equals 255, the SW may not be finished + */ + int ksw_sse2_16(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a); /* 16 one-byte scores per SSE2 vector; this is what ksw_sse2() calls when size == 1 */ + + /** Compute the maximum local score for queries initialized with ksw_qinit(2, ...); 8 two-byte scores per SSE2 vector, result is not capped at 255 */ + int ksw_sse2_8(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a); + + /** Unified interface for ksw_sse2_8() and ksw_sse2_16() */ + int ksw_sse2(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a); + +#ifdef __cplusplus +} +#endif + +#endif diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/kvec.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/kvec.h Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,90 @@ +/* The MIT License + + Copyright (c) 2008, by Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
+*/ + +/* + An example: + +#include "kvec.h" +int main() { + kvec_t(int) array; + kv_init(array); + kv_push(int, array, 10); // append + kv_a(int, array, 20) = 5; // dynamic + kv_A(array, 20) = 4; // static + kv_destroy(array); + return 0; +} +*/ + +/* + 2008-09-22 (0.1.0): + + * The initial version. + +*/ + +#ifndef AC_KVEC_H +#define AC_KVEC_H + +#include + +#define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) + +#define kvec_t(type) struct { size_t n, m; type *a; } +#define kv_init(v) ((v).n = (v).m = 0, (v).a = 0) +#define kv_destroy(v) free((v).a) +#define kv_A(v, i) ((v).a[(i)]) +#define kv_pop(v) ((v).a[--(v).n]) +#define kv_size(v) ((v).n) +#define kv_max(v) ((v).m) + +#define kv_resize(type, v, s) ((v).m = (s), (v).a = (type*)realloc((v).a, sizeof(type) * (v).m)) + +#define kv_copy(type, v1, v0) do { \ + if ((v1).m < (v0).n) kv_resize(type, v1, (v0).n); \ + (v1).n = (v0).n; \ + memcpy((v1).a, (v0).a, sizeof(type) * (v0).n); \ + } while (0) \ + +#define kv_push(type, v, x) do { \ + if ((v).n == (v).m) { \ + (v).m = (v).m? (v).m<<1 : 2; \ + (v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \ + } \ + (v).a[(v).n++] = (x); \ + } while (0) + +#define kv_pushp(type, v) (((v).n == (v).m)? \ + ((v).m = ((v).m? (v).m<<1 : 2), \ + (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \ + : 0), ((v).a + ((v).n++)) + +#define kv_a(type, v, i) ((v).m <= (size_t)(i)? \ + ((v).m = (v).n = (i) + 1, kv_roundup32((v).m), \ + (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \ + : (v).n <= (size_t)(i)? 
(v).n = (i) + 1 /* size must cover index i, as in the reallocating branch */ \ + : 0), (v).a[(i)] + +#endif diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/main.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/main.c Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,76 @@ +#include +#include +#include "main.h" +#include "utils.h" + +#ifndef PACKAGE_VERSION +#define PACKAGE_VERSION "0.6.2-r126" +#endif + +static int usage() +{ + fprintf(stderr, "\n"); + fprintf(stderr, "Program: bwa (alignment via Burrows-Wheeler transformation)\n"); + fprintf(stderr, "Version: %s\n", PACKAGE_VERSION); + fprintf(stderr, "Contact: Heng Li \n\n"); + fprintf(stderr, "Usage: bwa [options]\n\n"); + fprintf(stderr, "Command: index index sequences in the FASTA format\n"); + fprintf(stderr, " aln gapped/ungapped alignment\n"); + fprintf(stderr, " samse generate alignment (single ended)\n"); + fprintf(stderr, " sampe generate alignment (paired ended)\n"); + fprintf(stderr, " bwasw BWA-SW for long queries\n"); + fprintf(stderr, " fastmap identify super-maximal exact matches\n"); + fprintf(stderr, "\n"); + fprintf(stderr, " fa2pac convert FASTA to PAC format\n"); + fprintf(stderr, " pac2bwt generate BWT from PAC\n"); + fprintf(stderr, " pac2bwtgen alternative algorithm for generating BWT\n"); + fprintf(stderr, " bwtupdate update .bwt to the new format\n"); + fprintf(stderr, " bwt2sa generate SA from BWT and Occ\n"); + fprintf(stderr, " pac2cspac convert PAC to color-space PAC\n"); + fprintf(stderr, " stdsw standard SW/NW alignment\n"); + fprintf(stderr, "\n"); + return 1; +} + +void bwa_print_sam_PG() +{ + printf("@PG\tID:bwa\tPN:bwa\tVN:%s\n", PACKAGE_VERSION); +} + +int main(int argc, char *argv[]) /* dispatch argv[1] to the matching sub-command and exit with its status */ +{ + int i, ret; + double t_real; + t_real = realtime(); + if (argc < 2) return usage(); + if (strcmp(argv[1], "fa2pac") == 0) ret = bwa_fa2pac(argc-1, argv+1); + else if (strcmp(argv[1], "pac2bwt") == 0) ret = bwa_pac2bwt(argc-1, argv+1); + else if (strcmp(argv[1], "pac2bwtgen") == 0) ret = bwt_bwtgen_main(argc-1, argv+1); + else if (strcmp(argv[1], 
"bwtupdate") == 0) ret = bwa_bwtupdate(argc-1, argv+1); + else if (strcmp(argv[1], "bwt2sa") == 0) ret = bwa_bwt2sa(argc-1, argv+1); + else if (strcmp(argv[1], "index") == 0) ret = bwa_index(argc-1, argv+1); + else if (strcmp(argv[1], "aln") == 0) ret = bwa_aln(argc-1, argv+1); + else if (strcmp(argv[1], "sw") == 0) ret = bwa_stdsw(argc-1, argv+1); + else if (strcmp(argv[1], "samse") == 0) ret = bwa_sai2sam_se(argc-1, argv+1); + else if (strcmp(argv[1], "sampe") == 0) ret = bwa_sai2sam_pe(argc-1, argv+1); + else if (strcmp(argv[1], "pac2cspac") == 0) ret = bwa_pac2cspac(argc-1, argv+1); + else if (strcmp(argv[1], "stdsw") == 0) ret = bwa_stdsw(argc-1, argv+1); + else if (strcmp(argv[1], "bwtsw2") == 0) ret = bwa_bwtsw2(argc-1, argv+1); + else if (strcmp(argv[1], "dbwtsw") == 0) ret = bwa_bwtsw2(argc-1, argv+1); + else if (strcmp(argv[1], "bwasw") == 0) ret = bwa_bwtsw2(argc-1, argv+1); + else if (strcmp(argv[1], "fastmap") == 0) ret = main_fastmap(argc-1, argv+1); + else { + fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]); + return 1; + } + err_fflush(stdout); + err_fclose(stdout); + if (ret == 0) { + fprintf(stderr, "[%s] Version: %s\n", __func__, PACKAGE_VERSION); + fprintf(stderr, "[%s] CMD:", __func__); + for (i = 0; i < argc; ++i) + fprintf(stderr, " %s", argv[i]); + fprintf(stderr, "\n[%s] Real time: %.3f sec; CPU: %.3f sec\n", __func__, realtime() - t_real, cputime()); + } + return ret; /* propagate the sub-command status instead of always reporting success */ +} diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/main.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/main.h Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,30 @@ +#ifndef BWA_MAIN_H +#define BWA_MAIN_H + +#ifdef __cplusplus +extern "C" { +#endif + + int bwa_fa2pac(int argc, char *argv[]); + int bwa_pac2cspac(int argc, char *argv[]); + int bwa_pac2bwt(int argc, char *argv[]); + int bwa_bwtupdate(int argc, char *argv[]); + int bwa_bwt2sa(int argc, char *argv[]); + int bwa_index(int argc, char *argv[]); + int bwa_aln(int argc, char *argv[]); + int 
bwt_bwtgen_main(int argc, char *argv[]); + + int bwa_sai2sam_se(int argc, char *argv[]); + int bwa_sai2sam_pe(int argc, char *argv[]); + + int bwa_stdsw(int argc, char *argv[]); + + int bwa_bwtsw2(int argc, char *argv[]); + + int main_fastmap(int argc, char *argv[]); + +#ifdef __cplusplus +} +#endif + +#endif diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/qualfa2fq.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/qualfa2fq.pl Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,27 @@ +#!/usr/bin/perl -w + +use strict; +use warnings; + +die("Usage: qualfa2fq.pl \n") if (@ARGV != 2); + +my ($fhs, $fhq, $q); +open($fhs, ($ARGV[0] =~ /\.gz$/)? "gzip -dc $ARGV[0] |" : $ARGV[0]) || die; +open($fhq, ($ARGV[1] =~ /\.gz$/)? "gzip -dc $ARGV[1] |" : $ARGV[1]) || die; + +$/ = ">"; <$fhs>; <$fhq>; $/ = "\n"; +while (<$fhs>) { + $q = <$fhq>; + print "\@$_"; + $/ = ">"; + $_ = <$fhs>; $q = <$fhq>; + chomp; chomp($q); + $q =~ s/\s*(\d+)\s*/chr($1+33)/eg; + print $_, "+\n"; + for (my $i = 0; $i < length($q); $i += 60) { + print substr($q, $i, 60), "\n"; + } + $/ = "\n"; +} + +close($fhs); close($fhq); diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/simple_dp.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/simple_dp.c Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,162 @@ +#include +#include +#include +#include +#include +#include +#include "stdaln.h" +#include "utils.h" + +#include "kseq.h" +KSEQ_INIT(gzFile, gzread) + +typedef struct { + int l; + unsigned char *s; + char *n; +} seq1_t; + +typedef struct { + int n_seqs, m_seqs; + seq1_t *seqs; +} seqs_t; + +unsigned char aln_rev_table[256] = { + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', + 'N','T','V','G', 'H','N','N','C', 'D','N','N','M', 'N','K','N','N', + 'N','N','Y','S', 
'A','N','B','W', 'X','R','N','N', 'N','N','N','N', + 'N','t','v','g', 'h','N','N','c', 'd','N','N','m', 'N','k','N','N', + 'N','N','y','s', 'a','N','b','w', 'x','r','N','N', 'N','N','N','N', + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N' +}; + +static int g_is_global = 0, g_thres = 1, g_strand = 0, g_aa = 0; +static AlnParam g_aln_param; + +static void revseq(int len, uint8_t *seq) +{ + int i; + for (i = 0; i < len>>1; ++i) { + uint8_t tmp = aln_rev_table[seq[len-1-i]]; + seq[len-1-i] = aln_rev_table[seq[i]]; + seq[i] = tmp; + } + if (len&1) seq[i] = aln_rev_table[seq[i]]; +} + +static seqs_t *load_seqs(const char *fn) +{ + seqs_t *s; + seq1_t *p; + gzFile fp; + int l; + kseq_t *seq; + + fp = xzopen(fn, "r"); + seq = kseq_init(fp); + s = (seqs_t*)calloc(1, sizeof(seqs_t)); + s->m_seqs = 256; + s->seqs = (seq1_t*)calloc(s->m_seqs, sizeof(seq1_t)); + while ((l = kseq_read(seq)) >= 0) { + if (s->n_seqs == s->m_seqs) { + s->m_seqs <<= 1; + s->seqs = (seq1_t*)realloc(s->seqs, s->m_seqs * sizeof(seq1_t)); + } + p = s->seqs + (s->n_seqs++); + p->l = seq->seq.l; + p->s = (unsigned char*)malloc(p->l + 1); + memcpy(p->s, seq->seq.s, p->l); + p->s[p->l] = 0; + p->n = strdup((const char*)seq->name.s); + } + kseq_destroy(seq); + gzclose(fp); + fprintf(stderr, "[load_seqs] %d sequences are loaded.\n", s->n_seqs); + return s; +} + +static void aln_1seq(const seqs_t *ss, const char *name, int l, const char *s, char strand) +{ + int i; + for (i = 0; i < ss->n_seqs; ++i) { + 
AlnAln *aa; int k; /* k indexes CIGAR operations; kept separate from i, which walks ss->seqs */ + seq1_t *p = ss->seqs + i; + g_aln_param.band_width = l + p->l; + aa = aln_stdaln_aux(s, (const char*)p->s, &g_aln_param, g_is_global, g_thres, l, p->l); + if (aa->score >= g_thres || g_is_global) { + printf(">%s\t%d\t%d\t%s\t%c\t%d\t%d\t%d\t%d\t", p->n, aa->start1? aa->start1 : 1, aa->end1, name, strand, + aa->start2? aa->start2 : 1, aa->end2, aa->score, aa->subo); + // NB: I put the short sequence as the first sequence in SW, an insertion to + // the reference becomes a deletion from the short sequence. Therefore, I use + // "MDI" here rather than "MID", and print ->out2 first rather than ->out1. + for (k = 0; k != aa->n_cigar; ++k) + printf("%d%c", aa->cigar32[k]>>4, "MDI"[aa->cigar32[k]&0xf]); + printf("\n%s\n%s\n%s\n", aa->out2, aa->outm, aa->out1); + } + aln_free_AlnAln(aa); + } +} + +static void aln_seqs(const seqs_t *ss, const char *fn) +{ + gzFile fp; + kseq_t *seq; + int l; + + fp = xzopen(fn, "r"); + seq = kseq_init(fp); + while ((l = kseq_read(seq)) >= 0) { + if (g_strand&1) aln_1seq(ss, (char*)seq->name.s, l, seq->seq.s, '+'); + if (g_strand&2) { + revseq(l, (uint8_t*)seq->seq.s); + aln_1seq(ss, (char*)seq->name.s, l, seq->seq.s, '-'); + } + } + kseq_destroy(seq); + gzclose(fp); +} + +int bwa_stdsw(int argc, char *argv[]) +{ + int c; + seqs_t *ss; + + while ((c = getopt(argc, argv, "gT:frp")) >= 0) { + switch (c) { + case 'g': g_is_global = 1; break; + case 'T': g_thres = atoi(optarg); break; + case 'f': g_strand |= 1; break; + case 'r': g_strand |= 2; break; + case 'p': g_aa = 1; break; + } + } + if (g_strand == 0) g_strand = 3; + if (g_aa) g_strand = 1; + if (optind + 1 >= argc) { + fprintf(stderr, "\nUsage: bwa stdsw [options] \n\n"); + fprintf(stderr, "Options: -T INT minimum score [%d]\n", g_thres); + fprintf(stderr, " -p protein alignment (suppressing -r)\n"); + fprintf(stderr, " -f forward strand only\n"); + fprintf(stderr, " -r reverse strand only\n"); + fprintf(stderr, " -g global alignment\n\n"); + fprintf(stderr, "Note: This 
program is specifically designed for alignment between multiple short\n"); + fprintf(stderr, " sequences and ONE long sequence. It outputs the suboptimal score on the long\n"); + fprintf(stderr, " sequence.\n\n"); + return 1; + } + g_aln_param = g_aa? aln_param_aa2aa : aln_param_blast; + g_aln_param.gap_end = 0; + ss = load_seqs(argv[optind]); + aln_seqs(ss, argv[optind+1]); + return 0; +} diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/solid2fastq.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/solid2fastq.pl Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,111 @@ +#!/usr/bin/perl -w + +# Author: lh3 +# Note: Ideally, this script should be written in C. It is a bit slow at present. +# Also note that this script is different from the one contained in MAQ. + +use strict; +use warnings; +use Getopt::Std; + +my %opts; +my $version = '0.1.4'; +my $usage = qq{ +Usage: solid2fastq.pl + +Note: is the string showed in the `# Title:' line of a + ".csfasta" read file. Then F3.csfasta is read sequence + file and F3_QV.qual is the quality file. If + R3.csfasta is present, this script assumes reads are + paired; otherwise reads will be regarded as single-end. + + The read name will be :panel_x_y/[12] with `1' for R3 + tag and `2' for F3. Usually you may want to use short + to save diskspace. Long also causes troubles to maq. + +}; + +getopts('', \%opts); +die($usage) if (@ARGV != 2); +my ($title, $pre) = @ARGV; +my (@fhr, @fhw); +my @fn_suff = ('F3.csfasta', 'F3_QV.qual', 'R3.csfasta', 'R3_QV.qual'); +my $is_paired = (-f "$title$fn_suff[2]" || -f "$title$fn_suff[2].gz")? 1 : 0; +if ($is_paired) { # paired end + for (0 .. 
3) { + my $fn = "$title$fn_suff[$_]"; + $fn = "gzip -dc $fn.gz |" if (!-f $fn && -f "$fn.gz"); + open($fhr[$_], $fn) || die("** Fail to open '$fn'.\n"); + } + open($fhw[0], "|gzip >$pre.read2.fastq.gz") || die; # this is NOT a typo + open($fhw[1], "|gzip >$pre.read1.fastq.gz") || die; + open($fhw[2], "|gzip >$pre.single.fastq.gz") || die; + my (@df, @dr); + @df = &read1(1); @dr = &read1(2); + while (@df && @dr) { + if ($df[0] eq $dr[0]) { # mate pair + print {$fhw[0]} $df[1]; print {$fhw[1]} $dr[1]; + @df = &read1(1); @dr = &read1(2); + } else { + if ($df[0] le $dr[0]) { + print {$fhw[2]} $df[1]; + @df = &read1(1); + } else { + print {$fhw[2]} $dr[1]; + @dr = &read1(2); + } + } + } + if (@df) { + print {$fhw[2]} $df[1]; + while (@df = &read1(1, $fhr[0], $fhr[1])) { + print {$fhw[2]} $df[1]; + } + } + if (@dr) { + print {$fhw[2]} $dr[1]; + while (@dr = &read1(2, $fhr[2], $fhr[3])) { + print {$fhw[2]} $dr[1]; + } + } + close($fhr[$_]) for (0 .. $#fhr); + close($fhw[$_]) for (0 .. $#fhw); +} else { # single end + for (0 .. 1) { + my $fn = "$title$fn_suff[$_]"; + $fn = "gzip -dc $fn.gz |" if (!-f $fn && -f "$fn.gz"); + open($fhr[$_], $fn) || die("** Fail to open '$fn'.\n"); + } + open($fhw[2], "|gzip >$pre.single.fastq.gz") || die; + my @df; + while (@df = &read1(1, $fhr[0], $fhr[1])) { + print {$fhw[2]} $df[1]; + } + close($fhr[$_]) for (0 .. $#fhr); + close($fhw[2]); +} + +sub read1 { + my $i = shift(@_); + my $j = ($i-1)<<1; + my ($key, $seq); + my ($fhs, $fhq) = ($fhr[$j], $fhr[$j|1]); + while (<$fhs>) { + my $t = <$fhq>; + if (/^>(\d+)_(\d+)_(\d+)_[FR]3/) { + $key = sprintf("%.4d_%.4d_%.4d", $1, $2, $3); # this line could be improved on 64-bit machines + die(qq/** unmatched read name: '$_' != '$t'\n/) unless ($_ eq $t); # show the sequence-file name and the quality-file name + my $name = "$pre:$1_$2_$3/$i"; + $_ = substr(<$fhs>, 2); + tr/0123./ACGTN/; + my $s = $_; + $_ = <$fhq>; + s/-1\b/0/eg; + s/^(\d+)\s*//; + s/(\d+)\s*/chr($1+33)/eg; + $seq = qq/\@$name\n$s+\n$_\n/; + last; + } + } + return defined($seq)? 
($key, $seq) : (); +} diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/stdaln.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/stdaln.c Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,1072 @@ +/* The MIT License + + Copyright (c) 2003-2006, 2008, 2009, by Heng Li + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
+*/ + +#include +#include +#include +#include +#include "stdaln.h" + +/* char -> 17 (=16+1) nucleotides */ +unsigned char aln_nt16_table[256] = { + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,16 /*'-'*/,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15, 1,14, 4, 11,15,15, 2, 13,15,15,10, 15, 5,15,15, + 15,15, 3, 6, 8,15, 7, 9, 0,12,15,15, 15,15,15,15, + 15, 1,14, 4, 11,15,15, 2, 13,15,15,10, 15, 5,15,15, + 15,15, 3, 6, 8,15, 7, 9, 0,12,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15 +}; +char *aln_nt16_rev_table = "XAGRCMSVTWKDYHBN-"; + +/* char -> 5 (=4+1) nucleotides */ +unsigned char aln_nt4_table[256] = { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 2, 4, 4, 4, 1, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 2, 4, 4, 4, 1, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 +}; +char *aln_nt4_rev_table = "AGCTN-"; + +/* 
char -> 22 (=20+1+1) amino acids */ +unsigned char aln_aa_table[256] = { + 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, + 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, + 21,21,21,21, 21,21,21,21, 21,21,20,21, 21,22 /*'-'*/,21,21, + 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, + 21, 0,21, 4, 3, 6,13, 7, 8, 9,21,11, 10,12, 2,21, + 14, 5, 1,15, 16,21,19,17, 21,18,21,21, 21,21,21,21, + 21, 0,21, 4, 3, 6,13, 7, 8, 9,21,11, 10,12, 2,21, + 14, 5, 1,15, 16,21,19,17, 21,18,21,21, 21,21,21,21, + 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, + 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, + 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, + 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, + 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, + 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, + 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, + 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21 +}; +char *aln_aa_rev_table = "ARNDCQEGHILKMFPSTWYV*X-"; + /* 01234567890123456789012 */ + +/* translation table. They are useless in stdaln.c, but when you realize you need it, you need not write the table again. 
*/ +unsigned char aln_trans_table_eu[66] = { + 11,11, 2, 2, 1, 1,15,15, 16,16,16,16, 9,12, 9, 9, + 6, 6, 3, 3, 7, 7, 7, 7, 0, 0, 0, 0, 19,19,19,19, + 5, 5, 8, 8, 1, 1, 1, 1, 14,14,14,14, 10,10,10,10, + 20,20,18,18, 20,17, 4, 4, 15,15,15,15, 10,10,13,13, 21, 22 +}; +char *aln_trans_table_eu_char = "KKNNRRSSTTTTIMIIEEDDGGGGAAAAVVVVQQHHRRRRPPPPLLLL**YY*WCCSSSSLLFFX"; + /* 01234567890123456789012345678901234567890123456789012345678901234 */ +int aln_sm_blosum62[] = { +/* A R N D C Q E G H I L K M F P S T W Y V * X */ + 4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0,-4, 0, + -1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3,-4,-1, + -2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3,-4,-1, + -2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1, 0,-1,-4,-3,-3,-4,-1, + 0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-4,-2, + -1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2, 1, 0,-3,-1, 0,-1,-2,-1,-2,-4,-1, + -1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3, 1,-2,-3,-1, 0,-1,-3,-2,-2,-4,-1, + 0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2, 0,-2,-2,-3,-3,-4,-1, + -2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2, 2,-3,-4,-1, + -1,-3,-3,-3,-1,-3,-3,-4,-3, 4, 2,-3, 1, 0,-3,-2,-1,-3,-1, 3,-4,-1, + -1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,-2, 2, 0,-3,-2,-1,-2,-1, 1,-4,-1, + -1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,-1,-3,-1, 0,-1,-3,-2,-2,-4,-1, + -1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, 0,-2,-1,-1,-1,-1, 1,-4,-1, + -2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,-4,-2,-2, 1, 3,-1,-4,-1, + -1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,-1,-1,-4,-3,-2,-4,-2, + 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, 1,-3,-2,-2,-4, 0, + 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,-2,-2, 0,-4, 0, + -3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, 2,-3,-4,-2, + -2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,-1,-4,-1, + 0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4,-4,-1, + -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4, 1,-4, + 
0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-4,-1 +}; + +int aln_sm_blosum45[] = { +/* A R N D C Q E G H I L K M F P S T W Y V * X */ + 5,-2,-1,-2,-1,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-2,-2, 0,-5, 0, + -2, 7, 0,-1,-3, 1, 0,-2, 0,-3,-2, 3,-1,-2,-2,-1,-1,-2,-1,-2,-5,-1, + -1, 0, 6, 2,-2, 0, 0, 0, 1,-2,-3, 0,-2,-2,-2, 1, 0,-4,-2,-3,-5,-1, + -2,-1, 2, 7,-3, 0, 2,-1, 0,-4,-3, 0,-3,-4,-1, 0,-1,-4,-2,-3,-5,-1, + -1,-3,-2,-3,12,-3,-3,-3,-3,-3,-2,-3,-2,-2,-4,-1,-1,-5,-3,-1,-5,-2, + -1, 1, 0, 0,-3, 6, 2,-2, 1,-2,-2, 1, 0,-4,-1, 0,-1,-2,-1,-3,-5,-1, + -1, 0, 0, 2,-3, 2, 6,-2, 0,-3,-2, 1,-2,-3, 0, 0,-1,-3,-2,-3,-5,-1, + 0,-2, 0,-1,-3,-2,-2, 7,-2,-4,-3,-2,-2,-3,-2, 0,-2,-2,-3,-3,-5,-1, + -2, 0, 1, 0,-3, 1, 0,-2,10,-3,-2,-1, 0,-2,-2,-1,-2,-3, 2,-3,-5,-1, + -1,-3,-2,-4,-3,-2,-3,-4,-3, 5, 2,-3, 2, 0,-2,-2,-1,-2, 0, 3,-5,-1, + -1,-2,-3,-3,-2,-2,-2,-3,-2, 2, 5,-3, 2, 1,-3,-3,-1,-2, 0, 1,-5,-1, + -1, 3, 0, 0,-3, 1, 1,-2,-1,-3,-3, 5,-1,-3,-1,-1,-1,-2,-1,-2,-5,-1, + -1,-1,-2,-3,-2, 0,-2,-2, 0, 2, 2,-1, 6, 0,-2,-2,-1,-2, 0, 1,-5,-1, + -2,-2,-2,-4,-2,-4,-3,-3,-2, 0, 1,-3, 0, 8,-3,-2,-1, 1, 3, 0,-5,-1, + -1,-2,-2,-1,-4,-1, 0,-2,-2,-2,-3,-1,-2,-3, 9,-1,-1,-3,-3,-3,-5,-1, + 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-3,-1,-2,-2,-1, 4, 2,-4,-2,-1,-5, 0, + 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-1,-1, 2, 5,-3,-1, 0,-5, 0, + -2,-2,-4,-4,-5,-2,-3,-2,-3,-2,-2,-2,-2, 1,-3,-4,-3,15, 3,-3,-5,-2, + -2,-1,-2,-2,-3,-1,-2,-3, 2, 0, 0,-1, 0, 3,-3,-2,-1, 3, 8,-1,-5,-1, + 0,-2,-3,-3,-1,-3,-3,-3,-3, 3, 1,-2, 1, 0,-3,-1, 0,-3,-1, 5,-5,-1, + -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5, 1,-5, + 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0,-2,-1,-1,-5,-1 +}; + +int aln_sm_nt[] = { +/* X A G R C M S V T W K D Y H B N */ + -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2, + -2, 2,-1, 1,-2, 1,-2, 0,-2, 1,-2, 0,-2, 0,-2, 0, + -2,-1, 2, 1,-2,-2, 1, 0,-2,-2, 1, 0,-2,-2, 0, 0, + -2, 1, 1, 1,-2,-1,-1, 0,-2,-1,-1, 0,-2, 0, 0, 0, + -2,-2,-2,-2, 2, 1, 1, 0,-1,-2,-2,-2, 1, 0, 0, 0, + -2, 1,-2,-1, 1, 
1,-1, 0,-2,-1,-2, 0,-1, 0, 0, 0, + -2,-2, 1,-1, 1,-1, 1, 0,-2,-2,-1, 0,-1, 0, 0, 0, + -2, 0, 0, 0, 0, 0, 0, 0,-2, 0, 0, 0, 0, 0, 0, 0, + -2,-2,-2,-2,-1,-2,-2,-2, 2, 1, 1, 0, 1, 0, 0, 0, + -2, 1,-2,-1,-2,-1,-2, 0, 1, 1,-1, 0,-1, 0, 0, 0, + -2,-2, 1,-1,-2,-2,-1, 0, 1,-1, 1, 0,-1, 0, 0, 0, + -2, 0, 0, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + -2,-2,-2,-2, 1,-1,-1, 0, 1,-1,-1, 0, 1, 0, 0, 0, + -2, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + -2,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +int aln_sm_read[] = { +/* X A G R C M S V T W K D Y H B N */ + -17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17, + -17, 2,-17, 1,-17, 1,-17, 0,-17, 1,-17, 0,-17, 0,-17, 0, + -17,-17, 2, 1,-17,-17, 1, 0,-17,-17, 1, 0,-17,-17, 0, 0, + -17, 1, 1, 1,-17,-17,-17, 0,-17,-17,-17, 0,-17, 0, 0, 0, + -17,-17,-17,-17, 2, 1, 1, 0,-17,-17,-17,-17, 1, 0, 0, 0, + -17, 1,-17,-17, 1, 1,-17, 0,-17,-17,-17, 0,-17, 0, 0, 0, + -17,-17, 1,-17, 1,-17, 1, 0,-17,-17,-17, 0,-17, 0, 0, 0, + -17, 0, 0, 0, 0, 0, 0, 0,-17, 0, 0, 0, 0, 0, 0, 0, + -17,-17,-17,-17,-17,-17,-17,-17, 2, 1, 1, 0, 1, 0, 0, 0, + -17, 1,-17,-17,-17,-17,-17, 0, 1, 1,-17, 0,-17, 0, 0, 0, + -17,-17, 1,-17,-17,-17,-17, 0, 1,-17, 1, 0,-17, 0, 0, 0, + -17, 0, 0, 0,-17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + -17,-17,-17,-17, 1,-17,-17, 0, 1,-17,-17, 0, 1, 0, 0, 0, + -17, 0,-17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + -17,-17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + -17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +int aln_sm_hs[] = { +/* A G C T N */ + 91, -31,-114,-123, -44, + -31, 100,-125,-114, -42, + -123,-125, 100, -31, -42, + -114,-114, -31, 91, -42, + -44, -42, -42, -42, -43 +}; + +int aln_sm_maq[] = { + 11, -19, -19, -19, -13, + -19, 11, -19, -19, -13, + -19, -19, 11, -19, -13, + -19, -19, -19, 11, -13, + -13, -13, -13, -13, -13 +}; + +int aln_sm_blast[] = { + 1, -3, -3, -3, -2, + -3, 1, -3, -3, -2, + -3, -3, 1, -3, -2, + -3, -3, -3, 1, -2, + -2, -2, -2, -2, 
-2 +}; + +/********************/ +/* START OF align.c */ +/********************/ + +AlnParam aln_param_blast = { 5, 2, 2, aln_sm_blast, 5, 50 }; +AlnParam aln_param_bwa = { 26, 9, 5, aln_sm_maq, 5, 50 }; +AlnParam aln_param_nt2nt = { 8, 2, 2, aln_sm_nt, 16, 75 }; +AlnParam aln_param_rd2rd = { 1, 19, 19, aln_sm_read, 16, 75 }; +AlnParam aln_param_aa2aa = { 10, 2, 2, aln_sm_blosum62, 22, 50 }; + +AlnAln *aln_init_AlnAln() +{ + AlnAln *aa; + aa = (AlnAln*)malloc(sizeof(AlnAln)); + aa->path = 0; + aa->out1 = aa->out2 = aa->outm = 0; + aa->path_len = 0; + return aa; +} +void aln_free_AlnAln(AlnAln *aa) +{ + free(aa->path); free(aa->cigar32); + free(aa->out1); free(aa->out2); free(aa->outm); + free(aa); +} + +/***************************/ +/* START OF common_align.c */ +/***************************/ + +#define LOCAL_OVERFLOW_THRESHOLD 32000 +#define LOCAL_OVERFLOW_REDUCE 16000 +#define NT_LOCAL_SCORE int +#define NT_LOCAL_SHIFT 16 +#define NT_LOCAL_MASK 0xffff + +#define SET_INF(s) (s).M = (s).I = (s).D = MINOR_INF; + +#define set_M(MM, cur, p, sc) \ +{ \ + if ((p)->M >= (p)->I) { \ + if ((p)->M >= (p)->D) { \ + (MM) = (p)->M + (sc); (cur)->Mt = FROM_M; \ + } else { \ + (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \ + } \ + } else { \ + if ((p)->I > (p)->D) { \ + (MM) = (p)->I + (sc); (cur)->Mt = FROM_I; \ + } else { \ + (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \ + } \ + } \ +} +#define set_I(II, cur, p) \ +{ \ + if ((p)->M - gap_open > (p)->I) { \ + (cur)->It = FROM_M; \ + (II) = (p)->M - gap_open - gap_ext; \ + } else { \ + (cur)->It = FROM_I; \ + (II) = (p)->I - gap_ext; \ + } \ +} +#define set_end_I(II, cur, p) \ +{ \ + if (gap_end >= 0) { \ + if ((p)->M - gap_open > (p)->I) { \ + (cur)->It = FROM_M; \ + (II) = (p)->M - gap_open - gap_end; \ + } else { \ + (cur)->It = FROM_I; \ + (II) = (p)->I - gap_end; \ + } \ + } else set_I(II, cur, p); \ +} +#define set_D(DD, cur, p) \ +{ \ + if ((p)->M - gap_open > (p)->D) { \ + (cur)->Dt = FROM_M; \ + (DD) = (p)->M - gap_open - 
gap_ext; \ + } else { \ + (cur)->Dt = FROM_D; \ + (DD) = (p)->D - gap_ext; \ + } \ +} +#define set_end_D(DD, cur, p) \ +{ \ + if (gap_end >= 0) { \ + if ((p)->M - gap_open > (p)->D) { \ + (cur)->Dt = FROM_M; \ + (DD) = (p)->M - gap_open - gap_end; \ + } else { \ + (cur)->Dt = FROM_D; \ + (DD) = (p)->D - gap_end; \ + } \ + } else set_D(DD, cur, p); \ +} + +typedef struct +{ + unsigned char Mt:3, It:2, Dt:2; +} dpcell_t; + +typedef struct +{ + int M, I, D; +} dpscore_t; + +/* build score profile for accelerating alignment, in theory */ +void aln_init_score_array(unsigned char *seq, int len, int row, int *score_matrix, int **s_array) +{ + int *tmp, *tmp2, i, k; + for (i = 0; i != row; ++i) { + tmp = score_matrix + i * row; + tmp2 = s_array[i]; + for (k = 0; k != len; ++k) + tmp2[k] = tmp[seq[k]]; + } +} +/*************************** + * banded global alignment * + ***************************/ +int aln_global_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap, + path_t *path, int *path_len) +{ + register int i, j; + dpcell_t **dpcell, *q; + dpscore_t *curr, *last, *s; + path_t *p; + int b1, b2, tmp_end; + int *mat, end, max; + unsigned char type, ctype; + + int gap_open, gap_ext, gap_end, b; + int *score_matrix, N_MATRIX_ROW; + + /* initialize some align-related parameters. just for compatibility */ + gap_open = ap->gap_open; + gap_ext = ap->gap_ext; + gap_end = ap->gap_end; + b = ap->band_width; + score_matrix = ap->matrix; + N_MATRIX_ROW = ap->row; + + if (len1 == 0 || len2 == 0) { + *path_len = 0; + return 0; + } + /* calculate b1 and b2 */ + if (len1 > len2) { + b1 = len1 - len2 + b; + b2 = b; + } else { + b1 = b; + b2 = len2 - len1 + b; + } + if (b1 > len1) b1 = len1; + if (b2 > len2) b2 = len2; + --seq1; --seq2; + + /* allocate memory */ + end = (b1 + b2 <= len1)? 
(b1 + b2 + 1) : (len1 + 1); + dpcell = (dpcell_t**)malloc(sizeof(dpcell_t*) * (len2 + 1)); + for (j = 0; j <= len2; ++j) + dpcell[j] = (dpcell_t*)malloc(sizeof(dpcell_t) * end); + for (j = b2 + 1; j <= len2; ++j) + dpcell[j] -= j - b2; + curr = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1)); + last = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1)); + + /* set first row */ + SET_INF(*curr); curr->M = 0; + for (i = 1, s = curr + 1; i < b1; ++i, ++s) { + SET_INF(*s); + set_end_D(s->D, dpcell[0] + i, s - 1); + } + s = curr; curr = last; last = s; + + /* core dynamic programming, part 1 */ + tmp_end = (b2 < len2)? b2 : len2 - 1; + for (j = 1; j <= tmp_end; ++j) { + q = dpcell[j]; s = curr; SET_INF(*s); + set_end_I(s->I, q, last); + end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1; + mat = score_matrix + seq2[j] * N_MATRIX_ROW; + ++s; ++q; + for (i = 1; i != end; ++i, ++s, ++q) { + set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */ + set_I(s->I, q, last + i); + set_D(s->D, q, s - 1); + } + set_M(s->M, q, last + i - 1, mat[seq1[i]]); + set_D(s->D, q, s - 1); + if (j + b1 - 1 > len1) { /* bug fixed, 040227 */ + set_end_I(s->I, q, last + i); + } else s->I = MINOR_INF; + s = curr; curr = last; last = s; + } + /* last row for part 1, use set_end_D() instead of set_D() */ + if (j == len2 && b2 != len2 - 1) { + q = dpcell[j]; s = curr; SET_INF(*s); + set_end_I(s->I, q, last); + end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1; + mat = score_matrix + seq2[j] * N_MATRIX_ROW; + ++s; ++q; + for (i = 1; i != end; ++i, ++s, ++q) { + set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! 
*/ + set_I(s->I, q, last + i); + set_end_D(s->D, q, s - 1); + } + set_M(s->M, q, last + i - 1, mat[seq1[i]]); + set_end_D(s->D, q, s - 1); + if (j + b1 - 1 > len1) { /* bug fixed, 040227 */ + set_end_I(s->I, q, last + i); + } else s->I = MINOR_INF; + s = curr; curr = last; last = s; + ++j; + } + + /* core dynamic programming, part 2 */ + for (; j <= len2 - b2 + 1; ++j) { + SET_INF(curr[j - b2]); + mat = score_matrix + seq2[j] * N_MATRIX_ROW; + end = j + b1 - 1; + for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i != end; ++i, ++s, ++q) { + set_M(s->M, q, last + i - 1, mat[seq1[i]]); + set_I(s->I, q, last + i); + set_D(s->D, q, s - 1); + } + set_M(s->M, q, last + i - 1, mat[seq1[i]]); + set_D(s->D, q, s - 1); + s->I = MINOR_INF; + s = curr; curr = last; last = s; + } + + /* core dynamic programming, part 3 */ + for (; j < len2; ++j) { + SET_INF(curr[j - b2]); + mat = score_matrix + seq2[j] * N_MATRIX_ROW; + for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) { + set_M(s->M, q, last + i - 1, mat[seq1[i]]); + set_I(s->I, q, last + i); + set_D(s->D, q, s - 1); + } + set_M(s->M, q, last + len1 - 1, mat[seq1[i]]); + set_end_I(s->I, q, last + i); + set_D(s->D, q, s - 1); + s = curr; curr = last; last = s; + } + /* last row */ + if (j == len2) { + SET_INF(curr[j - b2]); + mat = score_matrix + seq2[j] * N_MATRIX_ROW; + for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) { + set_M(s->M, q, last + i - 1, mat[seq1[i]]); + set_I(s->I, q, last + i); + set_end_D(s->D, q, s - 1); + } + set_M(s->M, q, last + len1 - 1, mat[seq1[i]]); + set_end_I(s->I, q, last + i); + set_end_D(s->D, q, s - 1); + s = curr; curr = last; last = s; + } + + /* backtrace */ + i = len1; j = len2; + q = dpcell[j] + i; + s = last + len1; + max = s->M; type = q->Mt; ctype = FROM_M; + if (s->I > max) { max = s->I; type = q->It; ctype = FROM_I; } + if (s->D > max) { max = s->D; type = q->Dt; ctype = FROM_D; } + + p = path; + p->ctype = ctype; p->i = i; 
p->j = j; /* bug fixed 040408 */ + ++p; + do { + switch (ctype) { + case FROM_M: --i; --j; break; + case FROM_I: --j; break; + case FROM_D: --i; break; + } + q = dpcell[j] + i; + ctype = type; + switch (type) { + case FROM_M: type = q->Mt; break; + case FROM_I: type = q->It; break; + case FROM_D: type = q->Dt; break; + } + p->ctype = ctype; p->i = i; p->j = j; + ++p; + } while (i || j); + *path_len = p - path - 1; + + /* free memory */ + for (j = b2 + 1; j <= len2; ++j) + dpcell[j] += j - b2; + for (j = 0; j <= len2; ++j) + free(dpcell[j]); + free(dpcell); + free(curr); free(last); + + return max; +} +/************************************************* + * local alignment combined with banded strategy * + *************************************************/ +int aln_local_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap, + path_t *path, int *path_len, int _thres, int *_subo) +{ + register NT_LOCAL_SCORE *s; + register int i; + int q, r, qr, tmp_len, qr_shift; + int **s_array, *score_array; + int e, f; + int is_overflow, of_base; + NT_LOCAL_SCORE *eh, curr_h, last_h, curr_last_h; + int j, start_i, start_j, end_i, end_j; + path_t *p; + int score_f, score_r, score_g; + int start, end, max_score; + int thres, *suba, *ss; + + int gap_open, gap_ext, b; + int *score_matrix, N_MATRIX_ROW; + + /* initialize some align-related parameters. just for compatibility */ + gap_open = ap->gap_open; + gap_ext = ap->gap_ext; + b = ap->band_width; + score_matrix = ap->matrix; + N_MATRIX_ROW = ap->row; + thres = _thres > 0? 
_thres : -_thres; + + if (len1 == 0 || len2 == 0) return -1; + + /* allocate memory */ + suba = (int*)malloc(sizeof(int) * (len2 + 1)); + eh = (NT_LOCAL_SCORE*)malloc(sizeof(NT_LOCAL_SCORE) * (len1 + 1)); + s_array = (int**)malloc(sizeof(int*) * N_MATRIX_ROW); + for (i = 0; i != N_MATRIX_ROW; ++i) + s_array[i] = (int*)malloc(sizeof(int) * len1); + /* initialization */ + aln_init_score_array(seq1, len1, N_MATRIX_ROW, score_matrix, s_array); + q = gap_open; + r = gap_ext; + qr = q + r; + qr_shift = (qr+1) << NT_LOCAL_SHIFT; + tmp_len = len1 + 1; + start_i = start_j = end_i = end_j = 0; + for (i = 0, max_score = 0; i != N_MATRIX_ROW * N_MATRIX_ROW; ++i) + if (max_score < score_matrix[i]) max_score = score_matrix[i]; + /* convert the coordinate */ + --seq1; --seq2; + for (i = 0; i != N_MATRIX_ROW; ++i) --s_array[i]; + + /* forward dynamic programming */ + for (i = 0, s = eh; i != tmp_len; ++i, ++s) *s = 0; + score_f = 0; + is_overflow = of_base = 0; + suba[0] = 0; + for (j = 1, ss = suba + 1; j <= len2; ++j, ++ss) { + int subo = 0; + last_h = f = 0; + score_array = s_array[seq2[j]]; + if (is_overflow) { /* adjust eh[] array if overflow occurs. */ + /* If LOCAL_OVERFLOW_REDUCE is too small, optimal alignment might be missed. + * If it is too large, this block will be excuted frequently and therefore + * slow down the whole program. + * Acually, smaller LOCAL_OVERFLOW_REDUCE might also help to reduce the + * number of assignments because it sets some cells to zero when overflow + * happens. 
*/ + int tmp, tmp2; + score_f -= LOCAL_OVERFLOW_REDUCE; + of_base += LOCAL_OVERFLOW_REDUCE; + is_overflow = 0; + for (i = 1, s = eh; i <= tmp_len; ++i, ++s) { + tmp = *s >> NT_LOCAL_SHIFT; tmp2 = *s & NT_LOCAL_MASK; + if (tmp2 < LOCAL_OVERFLOW_REDUCE) tmp2 = 0; + else tmp2 -= LOCAL_OVERFLOW_REDUCE; + if (tmp < LOCAL_OVERFLOW_REDUCE) tmp = 0; + else tmp -= LOCAL_OVERFLOW_REDUCE; + *s = (tmp << NT_LOCAL_SHIFT) | tmp2; + } + } + for (i = 1, s = eh; i != tmp_len; ++i, ++s) { + /* prepare for calculate current h */ + curr_h = (*s >> NT_LOCAL_SHIFT) + score_array[i]; + if (curr_h < 0) curr_h = 0; + if (last_h > 0) { /* initialize f */ + f = (f > last_h - q)? f - r : last_h - qr; + if (curr_h < f) curr_h = f; + } + if (*(s+1) >= qr_shift) { /* initialize e */ + curr_last_h = *(s+1) >> NT_LOCAL_SHIFT; + e = ((*s & NT_LOCAL_MASK) > curr_last_h - q)? (*s & NT_LOCAL_MASK) - r : curr_last_h - qr; + if (curr_h < e) curr_h = e; + *s = (last_h << NT_LOCAL_SHIFT) | e; + } else *s = last_h << NT_LOCAL_SHIFT; /* e = 0 */ + last_h = curr_h; + if (subo < curr_h) subo = curr_h; + if (score_f < curr_h) { + score_f = curr_h; end_i = i; end_j = j; + if (score_f > LOCAL_OVERFLOW_THRESHOLD) is_overflow = 1; + } + } + *s = last_h << NT_LOCAL_SHIFT; + *ss = subo + of_base; + } + score_f += of_base; + + if (score_f < thres) { /* no matching residue at all, 090218 */ + if (path_len) *path_len = 0; + goto end_func; + } + if (path == 0) goto end_func; /* skip path-filling */ + + /* reverse dynamic programming */ + for (i = end_i, s = eh + end_i; i >= 0; --i, --s) *s = 0; + if (end_i == 0 || end_j == 0) goto end_func; /* no local match */ + score_r = score_matrix[seq1[end_i] * N_MATRIX_ROW + seq2[end_j]]; + is_overflow = of_base = 0; + start_i = end_i; start_j = end_j; + eh[end_i] = ((NT_LOCAL_SCORE)(qr + score_r)) << NT_LOCAL_SHIFT; /* in order to initialize f and e, 040408 */ + start = end_i - 1; + end = end_i - 3; + if (end <= 0) end = 0; + + /* second pass DP can be done in a band, speed will 
thus be enhanced */ + for (j = end_j - 1; j != 0; --j) { + last_h = f = 0; + score_array = s_array[seq2[j]]; + if (is_overflow) { /* adjust eh[] array if overflow occurs. */ + int tmp, tmp2; + score_r -= LOCAL_OVERFLOW_REDUCE; + of_base += LOCAL_OVERFLOW_REDUCE; + is_overflow = 0; + for (i = start, s = eh + start + 1; i >= end; --i, --s) { + tmp = *s >> NT_LOCAL_SHIFT; tmp2 = *s & NT_LOCAL_MASK; + if (tmp2 < LOCAL_OVERFLOW_REDUCE) tmp2 = 0; + else tmp2 -= LOCAL_OVERFLOW_REDUCE; + if (tmp < LOCAL_OVERFLOW_REDUCE) tmp = 0; + else tmp -= LOCAL_OVERFLOW_REDUCE; + *s = (tmp << NT_LOCAL_SHIFT) | tmp2; + } + } + for (i = start, s = eh + start + 1; i != end; --i, --s) { + /* prepare for calculate current h */ + curr_h = (*s >> NT_LOCAL_SHIFT) + score_array[i]; + if (curr_h < 0) curr_h = 0; + if (last_h > 0) { /* initialize f */ + f = (f > last_h - q)? f - r : last_h - qr; + if (curr_h < f) curr_h = f; + } + curr_last_h = *(s-1) >> NT_LOCAL_SHIFT; + e = ((*s & NT_LOCAL_MASK) > curr_last_h - q)? (*s & NT_LOCAL_MASK) - r : curr_last_h - qr; + if (e < 0) e = 0; + if (curr_h < e) curr_h = e; + *s = (last_h << NT_LOCAL_SHIFT) | e; + last_h = curr_h; + if (score_r < curr_h) { + score_r = curr_h; start_i = i; start_j = j; + if (score_r + of_base - qr == score_f) { + j = 1; break; + } + if (score_r > LOCAL_OVERFLOW_THRESHOLD) is_overflow = 1; + } + } + *s = last_h << NT_LOCAL_SHIFT; + /* recalculate start and end, the boundaries of the band */ + if ((eh[start] >> NT_LOCAL_SHIFT) <= qr) --start; + if (start <= 0) start = 0; + end = start_i - (start_j - j) - (score_r + of_base + (start_j - j) * max_score) / r - 1; + if (end <= 0) end = 0; + } + + if (_subo) { + int tmp2 = 0, tmp = (int)(start_j - .33 * (end_j - start_j) + .499); + for (j = 1; j <= tmp; ++j) + if (tmp2 < suba[j]) tmp2 = suba[j]; + tmp = (int)(end_j + .33 * (end_j - start_j) + .499); + for (j = tmp; j <= len2; ++j) + if (tmp2 < suba[j]) tmp2 = suba[j]; + *_subo = tmp2; + } + + if (path_len == 0) { + path[0].i = 
start_i; path[0].j = start_j; + path[1].i = end_i; path[1].j = end_j; + goto end_func; + } + + score_r += of_base; + score_r -= qr; + +#ifdef DEBUG + /* this seems not a bug */ + if (score_f != score_r) + fprintf(stderr, "[aln_local_core] unknown flaw occurs: score_f(%d) != score_r(%d)\n", score_f, score_r); +#endif + + if (_thres > 0) { /* call global alignment to fill the path */ + score_g = 0; + j = (end_i - start_i > end_j - start_j)? end_i - start_i : end_j - start_j; + ++j; /* j is the maximum band_width */ + for (i = ap->band_width;; i <<= 1) { + AlnParam ap_real = *ap; + ap_real.gap_end = -1; + ap_real.band_width = i; + score_g = aln_global_core(seq1 + start_i, end_i - start_i + 1, seq2 + start_j, + end_j - start_j + 1, &ap_real, path, path_len); + if (score_g == score_r || score_f == score_g) break; + if (i > j) break; + } + if (score_r > score_g && score_f > score_g) { + fprintf(stderr, "[aln_local_core] Potential bug: (%d,%d) > %d\n", score_f, score_r, score_g); + score_f = score_r = -1; + } else score_f = score_g; + + /* convert coordinate */ + for (p = path + *path_len - 1; p >= path; --p) { + p->i += start_i - 1; + p->j += start_j - 1; + } + } else { /* just store the start and end */ + *path_len = 2; + path[1].i = start_i; path[1].j = start_j; + path->i = end_i; path->j = end_j; + } + +end_func: + /* free */ + free(eh); free(suba); + for (i = 0; i != N_MATRIX_ROW; ++i) { + ++s_array[i]; + free(s_array[i]); + } + free(s_array); + return score_f; +} +AlnAln *aln_stdaln_aux(const char *seq1, const char *seq2, const AlnParam *ap, + int type, int thres, int len1, int len2) +{ + unsigned char *seq11, *seq22; + int score; + int i, j, l; + path_t *p; + char *out1, *out2, *outm; + AlnAln *aa; + + if (len1 < 0) len1 = strlen(seq1); + if (len2 < 0) len2 = strlen(seq2); + + aa = aln_init_AlnAln(); + seq11 = (unsigned char*)malloc(sizeof(unsigned char) * len1); + seq22 = (unsigned char*)malloc(sizeof(unsigned char) * len2); + aa->path = 
(path_t*)malloc(sizeof(path_t) * (len1 + len2 + 1)); + + if (ap->row < 10) { /* 4-nucleotide alignment */ + for (i = 0; i < len1; ++i) + seq11[i] = aln_nt4_table[(int)seq1[i]]; + for (j = 0; j < len2; ++j) + seq22[j] = aln_nt4_table[(int)seq2[j]]; + } else if (ap->row < 20) { /* 16-nucleotide alignment */ + for (i = 0; i < len1; ++i) + seq11[i] = aln_nt16_table[(int)seq1[i]]; + for (j = 0; j < len2; ++j) + seq22[j] = aln_nt16_table[(int)seq2[j]]; + } else { /* amino acids */ + for (i = 0; i < len1; ++i) + seq11[i] = aln_aa_table[(int)seq1[i]]; + for (j = 0; j < len2; ++j) + seq22[j] = aln_aa_table[(int)seq2[j]]; + } + + if (type == ALN_TYPE_GLOBAL) score = aln_global_core(seq11, len1, seq22, len2, ap, aa->path, &aa->path_len); + else if (type == ALN_TYPE_LOCAL) score = aln_local_core(seq11, len1, seq22, len2, ap, aa->path, &aa->path_len, thres, &aa->subo); + else if (type == ALN_TYPE_EXTEND) score = aln_extend_core(seq11, len1, seq22, len2, ap, aa->path, &aa->path_len, 1, 0); + else { + free(seq11); free(seq22); free(aa->path); + aln_free_AlnAln(aa); + return 0; + } + aa->score = score; + + if (thres > 0) { + out1 = aa->out1 = (char*)malloc(sizeof(char) * (aa->path_len + 1)); + out2 = aa->out2 = (char*)malloc(sizeof(char) * (aa->path_len + 1)); + outm = aa->outm = (char*)malloc(sizeof(char) * (aa->path_len + 1)); + + --seq1; --seq2; + --seq11; --seq22; + + p = aa->path + aa->path_len - 1; + + for (l = 0; p >= aa->path; --p, ++l) { + switch (p->ctype) { + case FROM_M: out1[l] = seq1[p->i]; out2[l] = seq2[p->j]; + outm[l] = (seq11[p->i] == seq22[p->j] && seq11[p->i] != ap->row)? '|' : ' '; + break; + case FROM_I: out1[l] = '-'; out2[l] = seq2[p->j]; outm[l] = ' '; break; + case FROM_D: out1[l] = seq1[p->i]; out2[l] = '-'; outm[l] = ' '; break; + } + } + out1[l] = out2[l] = outm[l] = '\0'; + ++seq11; ++seq22; + } + + free(seq11); + free(seq22); + + p = aa->path + aa->path_len - 1; + aa->start1 = p->i? p->i : 1; + aa->end1 = aa->path->i; + aa->start2 = p->j? 
p->j : 1; + aa->end2 = aa->path->j; + aa->cigar32 = aln_path2cigar32(aa->path, aa->path_len, &aa->n_cigar); + + return aa; +} +AlnAln *aln_stdaln(const char *seq1, const char *seq2, const AlnParam *ap, int type, int thres) +{ + return aln_stdaln_aux(seq1, seq2, ap, type, thres, -1, -1); +} + +/* for backward compatibility */ +uint16_t *aln_path2cigar(const path_t *path, int path_len, int *n_cigar) +{ + uint32_t *cigar32; + uint16_t *cigar; + int i; + cigar32 = aln_path2cigar32(path, path_len, n_cigar); + cigar = (uint16_t*)cigar32; + for (i = 0; i < *n_cigar; ++i) + cigar[i] = (cigar32[i]&0xf)<<14 | (cigar32[i]>>4&0x3fff); + return cigar; +} + +/* newly added functions (2009-07-21) */ + +int aln_extend_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap, + path_t *path, int *path_len, int G0, uint8_t *_mem) +{ + int q, r, qr, tmp_len; + int32_t **s_array, *score_array; + int is_overflow, of_base; + uint32_t *eh; + int i, j, end_i, end_j; + int score, start, end; + int *score_matrix, N_MATRIX_ROW; + uint8_t *mem, *_p; + + /* initialize some align-related parameters. just for compatibility */ + q = ap->gap_open; + r = ap->gap_ext; + qr = q + r; + score_matrix = ap->matrix; + N_MATRIX_ROW = ap->row; + + if (len1 == 0 || len2 == 0) return -1; + + /* allocate memory */ + mem = _mem? 
_mem : calloc((len1 + 2) * (N_MATRIX_ROW + 1), 4); + _p = mem; + eh = (uint32_t*)_p, _p += 4 * (len1 + 2); + s_array = calloc(N_MATRIX_ROW, sizeof(void*)); + for (i = 0; i != N_MATRIX_ROW; ++i) + s_array[i] = (int32_t*)_p, _p += 4 * len1; + /* initialization */ + aln_init_score_array(seq1, len1, N_MATRIX_ROW, score_matrix, s_array); + tmp_len = len1 + 1; + start = 1; end = 2; + end_i = end_j = 0; + score = 0; + is_overflow = of_base = 0; + /* convert the coordinate */ + --seq1; --seq2; + for (i = 0; i != N_MATRIX_ROW; ++i) --s_array[i]; + + /* dynamic programming */ + memset(eh, 0, 4 * (len1 + 2)); + eh[1] = (uint32_t)G0<<16; + for (j = 1; j <= len2; ++j) { + int _start, _end; + int h1 = 0, f = 0; + score_array = s_array[seq2[j]]; + /* set start and end */ + _start = j - ap->band_width; + if (_start < 1) _start = 1; + if (_start > start) start = _start; + _end = j + ap->band_width; + if (_end > len1 + 1) _end = len1 + 1; + if (_end < end) end = _end; + if (start == end) break; + /* adjust eh[] array if overflow occurs. */ + if (is_overflow) { + int tmp, tmp2; + score -= LOCAL_OVERFLOW_REDUCE; + of_base += LOCAL_OVERFLOW_REDUCE; + is_overflow = 0; + for (i = start; i <= end; ++i) { + uint32_t *s = &eh[i]; + tmp = *s >> 16; tmp2 = *s & 0xffff; + if (tmp2 < LOCAL_OVERFLOW_REDUCE) tmp2 = 0; + else tmp2 -= LOCAL_OVERFLOW_REDUCE; + if (tmp < LOCAL_OVERFLOW_REDUCE) tmp = 0; + else tmp -= LOCAL_OVERFLOW_REDUCE; + *s = (tmp << 16) | tmp2; + } + } + _start = _end = 0; + /* the inner loop */ + for (i = start; i < end; ++i) { + /* At the beginning of each cycle: + eh[i] -> h[j-1,i-1]<<16 | e[j,i] + f -> f[j,i] + h1 -> h[j,i-1] + */ + uint32_t *s = &eh[i]; + int h = (int)(*s >> 16); + int e = *s & 0xffff; /* this is e[j,i] */ + *s = (uint32_t)h1 << 16; /* eh[i] now stores h[j,i-1]<<16 */ + h += h? score_array[i] : 0; /* this is left_core() specific */ + /* calculate h[j,i]; don't need to test 0, as {e,f}>=0 */ + h = h > e? h : e; + h = h > f? 
h : f; /* h now is h[j,i] */ + h1 = h; + if (h > 0) { + if (_start == 0) _start = i; + _end = i; + if (score < h) { + score = h; end_i = i; end_j = j; + if (score > LOCAL_OVERFLOW_THRESHOLD) is_overflow = 1; + } + } + /* calculate e[j+1,i] and f[j,i+1] */ + h -= qr; + h = h > 0? h : 0; + e -= r; + e = e > h? e : h; + f -= r; + f = f > h? f : h; + *s |= e; + } + eh[end] = h1 << 16; + /* recalculate start and end, the boundaries of the band */ + if (_end <= 0) break; /* no cell in this row has a positive score */ + start = _start; + end = _end + 3; + } + + score += of_base - 1; + if (score <= 0) { + if (path_len) *path_len = 0; + goto end_left_func; + } + + if (path == 0) goto end_left_func; + + if (path_len == 0) { + path[0].i = end_i; path[0].j = end_j; + goto end_left_func; + } + + { /* call global alignment to fill the path */ + int score_g = 0; + j = (end_i - 1 > end_j - 1)? end_i - 1 : end_j - 1; + ++j; /* j is the maximum band_width */ + for (i = ap->band_width;; i <<= 1) { + AlnParam ap_real = *ap; + ap_real.gap_end = -1; + ap_real.band_width = i; + score_g = aln_global_core(seq1 + 1, end_i, seq2 + 1, end_j, &ap_real, path, path_len); + if (score == score_g) break; + if (i > j) break; + } + if (score > score_g) + fprintf(stderr, "[aln_left_core] no suitable bandwidth: %d < %d\n", score_g, score); + score = score_g; + } + +end_left_func: + /* free */ + free(s_array); + if (!_mem) free(mem); + return score; +} + +uint32_t *aln_path2cigar32(const path_t *path, int path_len, int *n_cigar) +{ + int i, n; + uint32_t *cigar; + unsigned char last_type; + + if (path_len == 0 || path == 0) { + *n_cigar = 0; + return 0; + } + + last_type = path->ctype; + for (i = n = 1; i < path_len; ++i) { + if (last_type != path[i].ctype) ++n; + last_type = path[i].ctype; + } + *n_cigar = n; + cigar = (uint32_t*)malloc(*n_cigar * 4); + + cigar[0] = 1u << 4 | path[path_len-1].ctype; + last_type = path[path_len-1].ctype; + for (i = path_len - 2, n = 0; i >= 0; --i) { + if (path[i].ctype 
== last_type) cigar[n] += 1u << 4; + else { + cigar[++n] = 1u << 4 | path[i].ctype; + last_type = path[i].ctype; + } + } + + return cigar; +} + +#ifdef STDALN_MAIN +int main() +{ + AlnAln *aln_local, *aln_global, *aln_left; + int i; + + aln_local = aln_stdaln("CGTGCGATGCactgCATACGGCTCGCCTAGATCA", "AAGGGATGCTCTGCATCgCTCGGCTAGCTGT", &aln_param_blast, 0, 1); + aln_global = aln_stdaln("CGTGCGATGCactgCATACGGCTCGCCTAGATCA", "AAGGGATGCTCTGCATCGgCTCGGCTAGCTGT", &aln_param_blast, 1, 1); +// aln_left = aln_stdaln( "GATGCACTGCATACGGCTCGCCTAGATCA", "GATGCTCTGCATCGgCTCGGCTAGCTGT", &aln_param_blast, 2, 1); + aln_left = aln_stdaln("CACCTTCGACTCACGTCTCATTCTCGGAGTCGAGTGGACGGTCCCTCATACACGAACAGGTTC", + "CACCTTCGACTTTCACCTCTCATTCTCGGACTCGAGTGGACGGTCCCTCATCCAAGAACAGGGTCTGTGAAA", &aln_param_blast, 2, 1); + + printf(">%d,%d\t%d,%d\n", aln_local->start1, aln_local->end1, aln_local->start2, aln_local->end2); + printf("%s\n%s\n%s\n", aln_local->out1, aln_local->outm, aln_local->out2); + + printf(">%d,%d\t%d,%d\t", aln_global->start1, aln_global->end1, aln_global->start2, aln_global->end2); + for (i = 0; i != aln_global->n_cigar; ++i) + printf("%d%c", aln_global->cigar32[i]>>4, "MID"[aln_global->cigar32[i]&0xf]); + printf("\n%s\n%s\n%s\n", aln_global->out1, aln_global->outm, aln_global->out2); + + printf(">%d\t%d,%d\t%d,%d\t", aln_left->score, aln_left->start1, aln_left->end1, aln_left->start2, aln_left->end2); + for (i = 0; i != aln_left->n_cigar; ++i) + printf("%d%c", aln_left->cigar32[i]>>4, "MID"[aln_left->cigar32[i]&0xf]); + printf("\n%s\n%s\n%s\n", aln_left->out1, aln_left->outm, aln_left->out2); + + aln_free_AlnAln(aln_local); + aln_free_AlnAln(aln_global); + aln_free_AlnAln(aln_left); + return 0; +} +#endif diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/stdaln.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/stdaln.h Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,162 @@ +/* The MIT License + + Copyright (c) 2003-2006, 2008, by Heng Li + + Permission is hereby granted, free 
of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* + 2009-07-23, 0.10.0 + + - Use 32-bit to store CIGAR + + - Report suboptimal alignments + + - Implemented half-fixed-half-open DP + + 2009-04-26, 0.9.10 + + - Allow setting a threshold for local alignment + + 2009-02-18, 0.9.9 + + - Fixed a bug when no residue matches + + 2008-08-04, 0.9.8 + + - Fixed the wrong declaration of aln_stdaln_aux() + + - Avoid 0 coordinate for global alignment + + 2008-08-01, 0.9.7 + + - Change gap_end penalty to 5 in aln_param_bwa + + - Add function to convert path_t to the CIGAR format + + 2008-08-01, 0.9.6 + + - The first gap now costs (gap_open+gap_ext), instead of + gap_open. Scoring systems are modified accordingly. + + - Gap end is now correctly handled. Previously it was not correct. + + - Change license to MIT. 
+ + */ + +#ifndef LH3_STDALN_H_ +#define LH3_STDALN_H_ + + +#define STDALN_VERSION 0.11.0 + +#include <stdint.h> + +#define FROM_M 0 /* FROM_*: presumably the codes stored in path_t.ctype; 0, 1, 2 are printed as 'M', 'I', 'D' by the stdaln.c self-test (TODO confirm) */ +#define FROM_I 1 +#define FROM_D 2 +#define FROM_S 3 + +#define ALN_TYPE_LOCAL 0 +#define ALN_TYPE_GLOBAL 1 +#define ALN_TYPE_EXTEND 2 + +/* A large negative sentinel, -(2^30 - 1). It is not INT_MIN; presumably the headroom keeps penalty arithmetic from overflowing (TODO confirm). Parenthesised so it is safe inside any expression. */ +#define MINOR_INF (-1073741823) + +typedef struct +{ + int gap_open; + int gap_ext; + int gap_end; + + int *matrix; /* score matrix, e.g. one of the aln_sm_* arrays */ + int row; /* presumably the dimension of matrix (TODO confirm) */ + int band_width; +} AlnParam; + +typedef struct +{ + int i, j; + unsigned char ctype; +} path_t; + +typedef struct +{ + path_t *path; /* for advanced users... :-) */ + int path_len; /* for advanced users... :-) */ + int start1, end1; /* start and end of the first sequence, coordinates are 1-based */ + int start2, end2; /* start and end of the second sequence, coordinates are 1-based */ + int score, subo; /* alignment score; subo is presumably the sub-optimal score (TODO confirm) */ + + char *out1, *out2; /* print them, and then you will know */ + char *outm; + + int n_cigar; /* number of entries in cigar32 */ + uint32_t *cigar32; /* each entry is (length << 4 | operation) */ +} AlnAln; + +#ifdef __cplusplus +extern "C" { +#endif + + AlnAln *aln_stdaln_aux(const char *seq1, const char *seq2, const AlnParam *ap, + int type, int do_align, int len1, int len2); + AlnAln *aln_stdaln(const char *seq1, const char *seq2, const AlnParam *ap, int type, int do_align); + void aln_free_AlnAln(AlnAln *aa); + + int aln_global_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap, + path_t *path, int *path_len); + int aln_local_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap, + path_t *path, int *path_len, int _thres, int *_subo); + int aln_extend_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap, + path_t *path, int *path_len, int G0, uint8_t *_mem); + uint16_t *aln_path2cigar(const path_t *path, int path_len, int *n_cigar); + uint32_t *aln_path2cigar32(const path_t *path, int path_len, int *n_cigar); + +#ifdef __cplusplus +} +#endif + 
+/******************** + * global variables * + ********************/ + +extern AlnParam aln_param_bwa; /* = { 37, 9, 0, aln_sm_maq, 5, 50 }; NOTE(review): the initializer comments in this group may be stale (the 0.9.7 changelog entry sets gap_end of aln_param_bwa to 5, and aln_sm_maq / aln_sm_blast are not declared below); verify against the definitions in stdaln.c */ +extern AlnParam aln_param_blast; /* = { 5, 2, 2, aln_sm_blast, 5, 50 }; */ +extern AlnParam aln_param_nt2nt; /* = { 10, 2, 2, aln_sm_nt, 16, 75 }; */ +extern AlnParam aln_param_aa2aa; /* = { 20, 19, 19, aln_sm_read, 16, 75 }; NOTE(review): pairing aa2aa with aln_sm_read looks swapped with the next line - confirm */ +extern AlnParam aln_param_rd2rd; /* = { 12, 2, 2, aln_sm_blosum62, 22, 50 }; NOTE(review): pairing rd2rd with aln_sm_blosum62 looks swapped with the previous line - confirm */ + +/* Common nucleotide score matrices for 16 bases. */ +extern int aln_sm_nt[], aln_sm_bwa[]; + +/* BLOSUM62 and BLOSUM45 */ +extern int aln_sm_blosum62[], aln_sm_blosum45[]; + +/* Common read score matrix for 16 bases. Note that read alignment is quite different from common nucleotide alignment. */ +extern int aln_sm_read[]; + +/* Human-mouse score matrix for 4 bases. */ +extern int aln_sm_hs[]; + +#endif diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/utils.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/utils.c Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,164 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "utils.h" + +FILE *err_xopen_core(const char *func, const char *fn, const char *mode) +{ + FILE *fp = 0; + if (strcmp(fn, "-") == 0) + return (strstr(mode, "r"))? stdin : stdout; + if ((fp = fopen(fn, mode)) == 0) { + fprintf(stderr, "[%s] fail to open file '%s'. Abort!\n", func, fn); + abort(); + } + return fp; +} +FILE *err_xreopen_core(const char *func, const char *fn, const char *mode, FILE *fp) +{ + if (freopen(fn, mode, fp) == 0) { + fprintf(stderr, "[%s] fail to open file '%s': ", func, fn); + perror(NULL); + fprintf(stderr, "Abort!\n"); + abort(); + } + return fp; +} +gzFile err_xzopen_core(const char *func, const char *fn, const char *mode) +{ + gzFile fp; + if (strcmp(fn, "-") == 0) + return gzdopen(fileno((strstr(mode, "r"))? stdin : stdout), mode); + if ((fp = gzopen(fn, mode)) == 0) { + fprintf(stderr, "[%s] fail to open file '%s'. Abort!\n", func, fn); + abort(); + } + return fp; +} +void err_fatal(const char *header, const char *fmt, ...) +{ + va_list args; + va_start(args, fmt); + fprintf(stderr, "[%s] ", header); + vfprintf(stderr, fmt, args); + fprintf(stderr, " Abort!\n"); + va_end(args); + abort(); +} + +void err_fatal_simple_core(const char *func, const char *msg) +{ + fprintf(stderr, "[%s] %s Abort!\n", func, msg); + abort(); +} + +size_t err_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream) +{ + size_t ret = fwrite(ptr, size, nmemb, stream); + if (ret != nmemb) + { + err_fatal_simple_core("fwrite", strerror(errno)); + } + return ret; +} + +int err_printf(const char *format, ...) 
+{ + va_list arg; + int done; + + va_start(arg, format); + done = vfprintf(stdout, format, arg); + int saveErrno = errno; + va_end(arg); + + if (done < 0) + { + err_fatal_simple_core("vfprintf(stdout)", strerror(saveErrno)); + } + return done; +} + +int err_fprintf(FILE *stream, const char *format, ...) +{ + va_list arg; + int done; + + va_start(arg, format); + done = vfprintf(stream, format, arg); + int saveErrno = errno; + va_end(arg); + + if (done < 0) + { + err_fatal_simple_core("vfprintf", strerror(saveErrno)); + } + return done; +} + +int err_fflush(FILE *stream) +{ + int ret = fflush(stream); + if (ret != 0) + { + err_fatal_simple_core("fflush", strerror(errno)); + } + return ret; +} + +int err_fclose(FILE *stream) +{ + int ret = fclose(stream); + if (ret != 0) + { + err_fatal_simple_core("fclose", strerror(errno)); + } + return ret; +} + +double cputime() +{ + struct rusage r; + getrusage(RUSAGE_SELF, &r); + return r.ru_utime.tv_sec + r.ru_stime.tv_sec + 1e-6 * (r.ru_utime.tv_usec + r.ru_stime.tv_usec); +} + +double realtime() +{ + struct timeval tp; + struct timezone tzp; + gettimeofday(&tp, &tzp); + return tp.tv_sec + tp.tv_usec * 1e-6; +} diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/utils.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/utils.h Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,73 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li */ + +#ifndef LH3_UTILS_H +#define LH3_UTILS_H + +#include +#include + +#ifdef __GNUC__ +// Tell GCC to validate printf format string and args +#define ATTRIBUTE(list) __attribute__ (list) +#else +#define ATTRIBUTE(list) +#endif + + + +#define err_fatal_simple(msg) err_fatal_simple_core(__func__, msg) +#define xopen(fn, mode) err_xopen_core(__func__, fn, mode) +#define xreopen(fn, mode, fp) err_xreopen_core(__func__, fn, mode, fp) +#define xzopen(fn, mode) err_xzopen_core(__func__, fn, mode) +#define xassert(cond, msg) if ((cond) == 0) err_fatal_simple_core(__func__, msg) + +#ifdef __cplusplus +extern "C" { +#endif + + void err_fatal(const char *header, const char *fmt, ...); + void err_fatal_simple_core(const char *func, const char *msg); + FILE *err_xopen_core(const char *func, const char *fn, const char *mode); + FILE *err_xreopen_core(const char *func, const char *fn, const char *mode, FILE *fp); + gzFile err_xzopen_core(const char *func, const char *fn, const char *mode); + size_t err_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream); + int err_fprintf(FILE *stream, const char *format, ...) + ATTRIBUTE((format(printf, 2, 3))); + int err_printf(const char *format, ...) 
+ ATTRIBUTE((format(printf, 1, 2))); + int err_fflush(FILE *stream); + int err_fclose(FILE *stream); + + double cputime(); + double realtime(); + +#ifdef __cplusplus +} +#endif + +#endif diff -r a9636dc1e99a -r a294fbfcb1db bwa-0.6.2/xa2multi.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/xa2multi.pl Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,31 @@ +#!/usr/bin/perl -w + +# Expand each alternative hit listed in a SAM record's XA:Z: tag into its own +# line with flag bit 0x100 set, printed right after the original record. +use strict; +use warnings; + +while (<>) { + if (/\tXA:Z:(\S+)/) { + my $xa = $1; + print; + my @t = split("\t"); + # RNEXT '=' means "same as the RNAME of this record"; resolve it to a real + # name so it can be compared with the reference of each alternative hit + my $mate_chr = ($t[6] eq '=')? $t[2] : $t[6]; + while ($xa =~ /([^,;]+),([-+]\d+),([^,]+),(\d+);/g) { + my ($chr, $pos, $cigar, $nm) = ($1, $2, $3, $4); # name the captures right after the match + my $mchr = ($mate_chr eq $chr)? '=' : $mate_chr; # FIXME: TLEN/ISIZE is not calculated! + my $seq = $t[9]; + my $phred = $t[10]; + # if alternative alignment has other orientation than primary, + # then print the reverse (complement) of sequence and phred string + if ((($t[1]&0x10)>0) xor ($pos<0)) { + $seq = reverse $seq; + $seq =~ tr/ACGTacgt/TGCAtgca/; + $phred = reverse $phred; + } + print(join("\t", $t[0], 0x100|($t[1]&0x6e9)|($pos<0?0x10:0), $chr, abs($pos), 0, $cigar, $mchr, $t[7], 0, $seq, $phred, "NM:i:$nm"), "\n"); + } + } else { print; }
+}