diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ba0430d --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +__pycache__/ \ No newline at end of file diff --git a/LICENSE.md b/LICENSE.md index 9cecc1d..50cd16d 100644 --- a/LICENSE.md +++ b/LICENSE.md @@ -1,674 +1,21 @@ - GNU GENERAL PUBLIC LICENSE - Version 3, 29 June 2007 - - Copyright (C) 2007 Free Software Foundation, Inc. - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - - Preamble - - The GNU General Public License is a free, copyleft license for -software and other kinds of works. - - The licenses for most software and other practical works are designed -to take away your freedom to share and change the works. By contrast, -the GNU General Public License is intended to guarantee your freedom to -share and change all versions of a program--to make sure it remains free -software for all its users. We, the Free Software Foundation, use the -GNU General Public License for most of our software; it applies also to -any other work released this way by its authors. You can apply it to -your programs, too. - - When we speak of free software, we are referring to freedom, not -price. Our General Public Licenses are designed to make sure that you -have the freedom to distribute copies of free software (and charge for -them if you wish), that you receive source code or can get it if you -want it, that you can change the software or use pieces of it in new -free programs, and that you know you can do these things. - - To protect your rights, we need to prevent others from denying you -these rights or asking you to surrender the rights. Therefore, you have -certain responsibilities if you distribute copies of the software, or if -you modify it: responsibilities to respect the freedom of others. 
- - For example, if you distribute copies of such a program, whether -gratis or for a fee, you must pass on to the recipients the same -freedoms that you received. You must make sure that they, too, receive -or can get the source code. And you must show them these terms so they -know their rights. - - Developers that use the GNU GPL protect your rights with two steps: -(1) assert copyright on the software, and (2) offer you this License -giving you legal permission to copy, distribute and/or modify it. - - For the developers' and authors' protection, the GPL clearly explains -that there is no warranty for this free software. For both users' and -authors' sake, the GPL requires that modified versions be marked as -changed, so that their problems will not be attributed erroneously to -authors of previous versions. - - Some devices are designed to deny users access to install or run -modified versions of the software inside them, although the manufacturer -can do so. This is fundamentally incompatible with the aim of -protecting users' freedom to change the software. The systematic -pattern of such abuse occurs in the area of products for individuals to -use, which is precisely where it is most unacceptable. Therefore, we -have designed this version of the GPL to prohibit the practice for those -products. If such problems arise substantially in other domains, we -stand ready to extend this provision to those domains in future versions -of the GPL, as needed to protect the freedom of users. - - Finally, every program is threatened constantly by software patents. -States should not allow patents to restrict development and use of -software on general-purpose computers, but in those that do, we wish to -avoid the special danger that patents applied to a free program could -make it effectively proprietary. To prevent this, the GPL assures that -patents cannot be used to render the program non-free. 
- - The precise terms and conditions for copying, distribution and -modification follow. - - TERMS AND CONDITIONS - - 0. Definitions. - - "This License" refers to version 3 of the GNU General Public License. - - "Copyright" also means copyright-like laws that apply to other kinds of -works, such as semiconductor masks. - - "The Program" refers to any copyrightable work licensed under this -License. Each licensee is addressed as "you". "Licensees" and -"recipients" may be individuals or organizations. - - To "modify" a work means to copy from or adapt all or part of the work -in a fashion requiring copyright permission, other than the making of an -exact copy. The resulting work is called a "modified version" of the -earlier work or a work "based on" the earlier work. - - A "covered work" means either the unmodified Program or a work based -on the Program. - - To "propagate" a work means to do anything with it that, without -permission, would make you directly or secondarily liable for -infringement under applicable copyright law, except executing it on a -computer or modifying a private copy. Propagation includes copying, -distribution (with or without modification), making available to the -public, and in some countries other activities as well. - - To "convey" a work means any kind of propagation that enables other -parties to make or receive copies. Mere interaction with a user through -a computer network, with no transfer of a copy, is not conveying. - - An interactive user interface displays "Appropriate Legal Notices" -to the extent that it includes a convenient and prominently visible -feature that (1) displays an appropriate copyright notice, and (2) -tells the user that there is no warranty for the work (except to the -extent that warranties are provided), that licensees may convey the -work under this License, and how to view a copy of this License. 
If -the interface presents a list of user commands or options, such as a -menu, a prominent item in the list meets this criterion. - - 1. Source Code. - - The "source code" for a work means the preferred form of the work -for making modifications to it. "Object code" means any non-source -form of a work. - - A "Standard Interface" means an interface that either is an official -standard defined by a recognized standards body, or, in the case of -interfaces specified for a particular programming language, one that -is widely used among developers working in that language. - - The "System Libraries" of an executable work include anything, other -than the work as a whole, that (a) is included in the normal form of -packaging a Major Component, but which is not part of that Major -Component, and (b) serves only to enable use of the work with that -Major Component, or to implement a Standard Interface for which an -implementation is available to the public in source code form. A -"Major Component", in this context, means a major essential component -(kernel, window system, and so on) of the specific operating system -(if any) on which the executable work runs, or a compiler used to -produce the work, or an object code interpreter used to run it. - - The "Corresponding Source" for a work in object code form means all -the source code needed to generate, install, and (for an executable -work) run the object code and to modify the work, including scripts to -control those activities. However, it does not include the work's -System Libraries, or general-purpose tools or generally available free -programs which are used unmodified in performing those activities but -which are not part of the work. 
For example, Corresponding Source -includes interface definition files associated with source files for -the work, and the source code for shared libraries and dynamically -linked subprograms that the work is specifically designed to require, -such as by intimate data communication or control flow between those -subprograms and other parts of the work. - - The Corresponding Source need not include anything that users -can regenerate automatically from other parts of the Corresponding -Source. - - The Corresponding Source for a work in source code form is that -same work. - - 2. Basic Permissions. - - All rights granted under this License are granted for the term of -copyright on the Program, and are irrevocable provided the stated -conditions are met. This License explicitly affirms your unlimited -permission to run the unmodified Program. The output from running a -covered work is covered by this License only if the output, given its -content, constitutes a covered work. This License acknowledges your -rights of fair use or other equivalent, as provided by copyright law. - - You may make, run and propagate covered works that you do not -convey, without conditions so long as your license otherwise remains -in force. You may convey covered works to others for the sole purpose -of having them make modifications exclusively for you, or provide you -with facilities for running those works, provided that you comply with -the terms of this License in conveying all material for which you do -not control copyright. Those thus making or running the covered works -for you must do so exclusively on your behalf, under your direction -and control, on terms that prohibit them from making any copies of -your copyrighted material outside their relationship with you. - - Conveying under any other circumstances is permitted solely under -the conditions stated below. Sublicensing is not allowed; section 10 -makes it unnecessary. - - 3. 
Protecting Users' Legal Rights From Anti-Circumvention Law. - - No covered work shall be deemed part of an effective technological -measure under any applicable law fulfilling obligations under article -11 of the WIPO copyright treaty adopted on 20 December 1996, or -similar laws prohibiting or restricting circumvention of such -measures. - - When you convey a covered work, you waive any legal power to forbid -circumvention of technological measures to the extent such circumvention -is effected by exercising rights under this License with respect to -the covered work, and you disclaim any intention to limit operation or -modification of the work as a means of enforcing, against the work's -users, your or third parties' legal rights to forbid circumvention of -technological measures. - - 4. Conveying Verbatim Copies. - - You may convey verbatim copies of the Program's source code as you -receive it, in any medium, provided that you conspicuously and -appropriately publish on each copy an appropriate copyright notice; -keep intact all notices stating that this License and any -non-permissive terms added in accord with section 7 apply to the code; -keep intact all notices of the absence of any warranty; and give all -recipients a copy of this License along with the Program. - - You may charge any price or no price for each copy that you convey, -and you may offer support or warranty protection for a fee. - - 5. Conveying Modified Source Versions. - - You may convey a work based on the Program, or the modifications to -produce it from the Program, in the form of source code under the -terms of section 4, provided that you also meet all of these conditions: - - a) The work must carry prominent notices stating that you modified - it, and giving a relevant date. - - b) The work must carry prominent notices stating that it is - released under this License and any conditions added under section - 7. 
This requirement modifies the requirement in section 4 to - "keep intact all notices". - - c) You must license the entire work, as a whole, under this - License to anyone who comes into possession of a copy. This - License will therefore apply, along with any applicable section 7 - additional terms, to the whole of the work, and all its parts, - regardless of how they are packaged. This License gives no - permission to license the work in any other way, but it does not - invalidate such permission if you have separately received it. - - d) If the work has interactive user interfaces, each must display - Appropriate Legal Notices; however, if the Program has interactive - interfaces that do not display Appropriate Legal Notices, your - work need not make them do so. - - A compilation of a covered work with other separate and independent -works, which are not by their nature extensions of the covered work, -and which are not combined with it such as to form a larger program, -in or on a volume of a storage or distribution medium, is called an -"aggregate" if the compilation and its resulting copyright are not -used to limit the access or legal rights of the compilation's users -beyond what the individual works permit. Inclusion of a covered work -in an aggregate does not cause this License to apply to the other -parts of the aggregate. - - 6. Conveying Non-Source Forms. - - You may convey a covered work in object code form under the terms -of sections 4 and 5, provided that you also convey the -machine-readable Corresponding Source under the terms of this License, -in one of these ways: - - a) Convey the object code in, or embodied in, a physical product - (including a physical distribution medium), accompanied by the - Corresponding Source fixed on a durable physical medium - customarily used for software interchange. 
- - b) Convey the object code in, or embodied in, a physical product - (including a physical distribution medium), accompanied by a - written offer, valid for at least three years and valid for as - long as you offer spare parts or customer support for that product - model, to give anyone who possesses the object code either (1) a - copy of the Corresponding Source for all the software in the - product that is covered by this License, on a durable physical - medium customarily used for software interchange, for a price no - more than your reasonable cost of physically performing this - conveying of source, or (2) access to copy the - Corresponding Source from a network server at no charge. - - c) Convey individual copies of the object code with a copy of the - written offer to provide the Corresponding Source. This - alternative is allowed only occasionally and noncommercially, and - only if you received the object code with such an offer, in accord - with subsection 6b. - - d) Convey the object code by offering access from a designated - place (gratis or for a charge), and offer equivalent access to the - Corresponding Source in the same way through the same place at no - further charge. You need not require recipients to copy the - Corresponding Source along with the object code. If the place to - copy the object code is a network server, the Corresponding Source - may be on a different server (operated by you or a third party) - that supports equivalent copying facilities, provided you maintain - clear directions next to the object code saying where to find the - Corresponding Source. Regardless of what server hosts the - Corresponding Source, you remain obligated to ensure that it is - available for as long as needed to satisfy these requirements. 
- - e) Convey the object code using peer-to-peer transmission, provided - you inform other peers where the object code and Corresponding - Source of the work are being offered to the general public at no - charge under subsection 6d. - - A separable portion of the object code, whose source code is excluded -from the Corresponding Source as a System Library, need not be -included in conveying the object code work. - - A "User Product" is either (1) a "consumer product", which means any -tangible personal property which is normally used for personal, family, -or household purposes, or (2) anything designed or sold for incorporation -into a dwelling. In determining whether a product is a consumer product, -doubtful cases shall be resolved in favor of coverage. For a particular -product received by a particular user, "normally used" refers to a -typical or common use of that class of product, regardless of the status -of the particular user or of the way in which the particular user -actually uses, or expects or is expected to use, the product. A product -is a consumer product regardless of whether the product has substantial -commercial, industrial or non-consumer uses, unless such uses represent -the only significant mode of use of the product. - - "Installation Information" for a User Product means any methods, -procedures, authorization keys, or other information required to install -and execute modified versions of a covered work in that User Product from -a modified version of its Corresponding Source. The information must -suffice to ensure that the continued functioning of the modified object -code is in no case prevented or interfered with solely because -modification has been made. 
- - If you convey an object code work under this section in, or with, or -specifically for use in, a User Product, and the conveying occurs as -part of a transaction in which the right of possession and use of the -User Product is transferred to the recipient in perpetuity or for a -fixed term (regardless of how the transaction is characterized), the -Corresponding Source conveyed under this section must be accompanied -by the Installation Information. But this requirement does not apply -if neither you nor any third party retains the ability to install -modified object code on the User Product (for example, the work has -been installed in ROM). - - The requirement to provide Installation Information does not include a -requirement to continue to provide support service, warranty, or updates -for a work that has been modified or installed by the recipient, or for -the User Product in which it has been modified or installed. Access to a -network may be denied when the modification itself materially and -adversely affects the operation of the network or violates the rules and -protocols for communication across the network. - - Corresponding Source conveyed, and Installation Information provided, -in accord with this section must be in a format that is publicly -documented (and with an implementation available to the public in -source code form), and must require no special password or key for -unpacking, reading or copying. - - 7. Additional Terms. - - "Additional permissions" are terms that supplement the terms of this -License by making exceptions from one or more of its conditions. -Additional permissions that are applicable to the entire Program shall -be treated as though they were included in this License, to the extent -that they are valid under applicable law. 
If additional permissions -apply only to part of the Program, that part may be used separately -under those permissions, but the entire Program remains governed by -this License without regard to the additional permissions. - - When you convey a copy of a covered work, you may at your option -remove any additional permissions from that copy, or from any part of -it. (Additional permissions may be written to require their own -removal in certain cases when you modify the work.) You may place -additional permissions on material, added by you to a covered work, -for which you have or can give appropriate copyright permission. - - Notwithstanding any other provision of this License, for material you -add to a covered work, you may (if authorized by the copyright holders of -that material) supplement the terms of this License with terms: - - a) Disclaiming warranty or limiting liability differently from the - terms of sections 15 and 16 of this License; or - - b) Requiring preservation of specified reasonable legal notices or - author attributions in that material or in the Appropriate Legal - Notices displayed by works containing it; or - - c) Prohibiting misrepresentation of the origin of that material, or - requiring that modified versions of such material be marked in - reasonable ways as different from the original version; or - - d) Limiting the use for publicity purposes of names of licensors or - authors of the material; or - - e) Declining to grant rights under trademark law for use of some - trade names, trademarks, or service marks; or - - f) Requiring indemnification of licensors and authors of that - material by anyone who conveys the material (or modified versions of - it) with contractual assumptions of liability to the recipient, for - any liability that these contractual assumptions directly impose on - those licensors and authors. - - All other non-permissive additional terms are considered "further -restrictions" within the meaning of section 10. 
If the Program as you -received it, or any part of it, contains a notice stating that it is -governed by this License along with a term that is a further -restriction, you may remove that term. If a license document contains -a further restriction but permits relicensing or conveying under this -License, you may add to a covered work material governed by the terms -of that license document, provided that the further restriction does -not survive such relicensing or conveying. - - If you add terms to a covered work in accord with this section, you -must place, in the relevant source files, a statement of the -additional terms that apply to those files, or a notice indicating -where to find the applicable terms. - - Additional terms, permissive or non-permissive, may be stated in the -form of a separately written license, or stated as exceptions; -the above requirements apply either way. - - 8. Termination. - - You may not propagate or modify a covered work except as expressly -provided under this License. Any attempt otherwise to propagate or -modify it is void, and will automatically terminate your rights under -this License (including any patent licenses granted under the third -paragraph of section 11). - - However, if you cease all violation of this License, then your -license from a particular copyright holder is reinstated (a) -provisionally, unless and until the copyright holder explicitly and -finally terminates your license, and (b) permanently, if the copyright -holder fails to notify you of the violation by some reasonable means -prior to 60 days after the cessation. - - Moreover, your license from a particular copyright holder is -reinstated permanently if the copyright holder notifies you of the -violation by some reasonable means, this is the first time you have -received notice of violation of this License (for any work) from that -copyright holder, and you cure the violation prior to 30 days after -your receipt of the notice. 
- - Termination of your rights under this section does not terminate the -licenses of parties who have received copies or rights from you under -this License. If your rights have been terminated and not permanently -reinstated, you do not qualify to receive new licenses for the same -material under section 10. - - 9. Acceptance Not Required for Having Copies. - - You are not required to accept this License in order to receive or -run a copy of the Program. Ancillary propagation of a covered work -occurring solely as a consequence of using peer-to-peer transmission -to receive a copy likewise does not require acceptance. However, -nothing other than this License grants you permission to propagate or -modify any covered work. These actions infringe copyright if you do -not accept this License. Therefore, by modifying or propagating a -covered work, you indicate your acceptance of this License to do so. - - 10. Automatic Licensing of Downstream Recipients. - - Each time you convey a covered work, the recipient automatically -receives a license from the original licensors, to run, modify and -propagate that work, subject to this License. You are not responsible -for enforcing compliance by third parties with this License. - - An "entity transaction" is a transaction transferring control of an -organization, or substantially all assets of one, or subdividing an -organization, or merging organizations. If propagation of a covered -work results from an entity transaction, each party to that -transaction who receives a copy of the work also receives whatever -licenses to the work the party's predecessor in interest had or could -give under the previous paragraph, plus a right to possession of the -Corresponding Source of the work from the predecessor in interest, if -the predecessor has it or can get it with reasonable efforts. - - You may not impose any further restrictions on the exercise of the -rights granted or affirmed under this License. 
For example, you may -not impose a license fee, royalty, or other charge for exercise of -rights granted under this License, and you may not initiate litigation -(including a cross-claim or counterclaim in a lawsuit) alleging that -any patent claim is infringed by making, using, selling, offering for -sale, or importing the Program or any portion of it. - - 11. Patents. - - A "contributor" is a copyright holder who authorizes use under this -License of the Program or a work on which the Program is based. The -work thus licensed is called the contributor's "contributor version". - - A contributor's "essential patent claims" are all patent claims -owned or controlled by the contributor, whether already acquired or -hereafter acquired, that would be infringed by some manner, permitted -by this License, of making, using, or selling its contributor version, -but do not include claims that would be infringed only as a -consequence of further modification of the contributor version. For -purposes of this definition, "control" includes the right to grant -patent sublicenses in a manner consistent with the requirements of -this License. - - Each contributor grants you a non-exclusive, worldwide, royalty-free -patent license under the contributor's essential patent claims, to -make, use, sell, offer for sale, import and otherwise run, modify and -propagate the contents of its contributor version. - - In the following three paragraphs, a "patent license" is any express -agreement or commitment, however denominated, not to enforce a patent -(such as an express permission to practice a patent or covenant not to -sue for patent infringement). To "grant" such a patent license to a -party means to make such an agreement or commitment not to enforce a -patent against the party. 
- - If you convey a covered work, knowingly relying on a patent license, -and the Corresponding Source of the work is not available for anyone -to copy, free of charge and under the terms of this License, through a -publicly available network server or other readily accessible means, -then you must either (1) cause the Corresponding Source to be so -available, or (2) arrange to deprive yourself of the benefit of the -patent license for this particular work, or (3) arrange, in a manner -consistent with the requirements of this License, to extend the patent -license to downstream recipients. "Knowingly relying" means you have -actual knowledge that, but for the patent license, your conveying the -covered work in a country, or your recipient's use of the covered work -in a country, would infringe one or more identifiable patents in that -country that you have reason to believe are valid. - - If, pursuant to or in connection with a single transaction or -arrangement, you convey, or propagate by procuring conveyance of, a -covered work, and grant a patent license to some of the parties -receiving the covered work authorizing them to use, propagate, modify -or convey a specific copy of the covered work, then the patent license -you grant is automatically extended to all recipients of the covered -work and works based on it. - - A patent license is "discriminatory" if it does not include within -the scope of its coverage, prohibits the exercise of, or is -conditioned on the non-exercise of one or more of the rights that are -specifically granted under this License. 
You may not convey a covered -work if you are a party to an arrangement with a third party that is -in the business of distributing software, under which you make payment -to the third party based on the extent of your activity of conveying -the work, and under which the third party grants, to any of the -parties who would receive the covered work from you, a discriminatory -patent license (a) in connection with copies of the covered work -conveyed by you (or copies made from those copies), or (b) primarily -for and in connection with specific products or compilations that -contain the covered work, unless you entered into that arrangement, -or that patent license was granted, prior to 28 March 2007. - - Nothing in this License shall be construed as excluding or limiting -any implied license or other defenses to infringement that may -otherwise be available to you under applicable patent law. - - 12. No Surrender of Others' Freedom. - - If conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot convey a -covered work so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you may -not convey it at all. For example, if you agree to terms that obligate you -to collect a royalty for further conveying from those to whom you convey -the Program, the only way you could satisfy both those terms and this -License would be to refrain entirely from conveying the Program. - - 13. Use with the GNU Affero General Public License. - - Notwithstanding any other provision of this License, you have -permission to link or combine any covered work with a work licensed -under version 3 of the GNU Affero General Public License into a single -combined work, and to convey the resulting work. 
The terms of this -License will continue to apply to the part which is the covered work, -but the special requirements of the GNU Affero General Public License, -section 13, concerning interaction through a network will apply to the -combination as such. - - 14. Revised Versions of this License. - - The Free Software Foundation may publish revised and/or new versions of -the GNU General Public License from time to time. Such new versions will -be similar in spirit to the present version, but may differ in detail to -address new problems or concerns. - - Each version is given a distinguishing version number. If the -Program specifies that a certain numbered version of the GNU General -Public License "or any later version" applies to it, you have the -option of following the terms and conditions either of that numbered -version or of any later version published by the Free Software -Foundation. If the Program does not specify a version number of the -GNU General Public License, you may choose any version ever published -by the Free Software Foundation. - - If the Program specifies that a proxy can decide which future -versions of the GNU General Public License can be used, that proxy's -public statement of acceptance of a version permanently authorizes you -to choose that version for the Program. - - Later license versions may give you additional or different -permissions. However, no additional obligations are imposed on any -author or copyright holder as a result of your choosing to follow a -later version. - - 15. Disclaimer of Warranty. - - THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY -APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT -HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY -OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, -THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM -IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF -ALL NECESSARY SERVICING, REPAIR OR CORRECTION. - - 16. Limitation of Liability. - - IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING -WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS -THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY -GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE -USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF -DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD -PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), -EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF -SUCH DAMAGES. - - 17. Interpretation of Sections 15 and 16. - - If the disclaimer of warranty and limitation of liability provided -above cannot be given local legal effect according to their terms, -reviewing courts shall apply local law that most closely approximates -an absolute waiver of all civil liability in connection with the -Program, unless a warranty or assumption of liability accompanies a -copy of the Program in return for a fee. - - END OF TERMS AND CONDITIONS - - How to Apply These Terms to Your New Programs - - If you develop a new program, and you want it to be of the greatest -possible use to the public, the best way to achieve this is to make it -free software which everyone can redistribute and change under these terms. - - To do so, attach the following notices to the program. It is safest -to attach them to the start of each source file to most effectively -state the exclusion of warranty; and each file should have at least -the "copyright" line and a pointer to where the full notice is found. 
- - {one line to give the program's name and a brief idea of what it does.} - Copyright (C) {year} {name of author} - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . - -Also add information on how to contact you by electronic and paper mail. - - If the program does terminal interaction, make it output a short -notice like this when it starts in an interactive mode: - - {project} Copyright (C) {year} {fullname} - This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. - This is free software, and you are welcome to redistribute it - under certain conditions; type `show c' for details. - -The hypothetical commands `show w' and `show c' should show the appropriate -parts of the General Public License. Of course, your program's commands -might be different; for a GUI interface, you would use an "about box". - - You should also get your employer (if you work as a programmer) or school, -if any, to sign a "copyright disclaimer" for the program, if necessary. -For more information on this, and how to apply and follow the GNU GPL, see -. - - The GNU General Public License does not permit incorporating your program -into proprietary programs. If your program is a subroutine library, you -may consider it more useful to permit linking proprietary applications with -the library. If this is what you want to do, use the GNU Lesser General -Public License instead of this License. But first, please read -. 
+MIT License + +Copyright (c) 2024 John Doty + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md index 80c7dec..b449b8e 100644 --- a/README.md +++ b/README.md @@ -1,18 +1,126 @@ # A collection of LR parser generators, from LR0 through LALR. -One day I read a tweet, asking for a tool which accepted a grammar and an -input file and which then produced simple parsed output, without any kind of -in-between. (There was other ranty stuff about how none of the existing tools -really worked, but that was beside the point.) +This is a small helper library to generate LR parser tables. -Upon reading the tweet, it occured to me that I didn't know how LR parsers -worked and how they were generated, except in the broadest of terms. Thus, I -set about writing this, learning as I went. +The primary inspiration for this library is tree-sitter, which also generates +LR parsers for grammars written in a turing-complete language. 
Like tree-sitter, we +write grammars in a language, only we do it in Python instead of JavaScript. -This code is not written to be fast, or even efficient, although it runs its -test cases fast enough. It was instead written to be easy to follow along -with, so that when I forget how all this works I can come back to the code -and read along and learn all over again. +Why Python? Because Python 3 is widely pre-installed on macOS and Linux. This +library requires nothing more than the basic standard library, and not even a +new version of it. Therefore, it turns out to be a pretty light dependency for +a Rust or C++ or something kind of project. (Tree-sitter, on the other hand, +requires node, which is a far less stable and available runtime in 2024.) + +The parser tables can really be used to power anything. I prefer to make +concrete syntax trees (again, see tree-sitter), and there is no facility at all +for actions or custom ASTs or whatnot. Any such processing needs to be done by +the thing that processes the tables. + +## Making Grammars + +To get started, create a grammar that derives from the `Grammar` class. Create +one method per nonterminal, decorated with the `rule` decorator.
Here's an +example: + + PLUS = Token('+') + LPAREN = Token('(') + RPAREN = Token(')') + ID = Token('id') + + class SimpleGrammar(Grammar): + @rule + def expression(self): + return seq(self.expression, PLUS, self.term) | self.term + + @rule + def term(self): + return seq(LPAREN, self.expression, RPAREN) | ID + + +## Using grammars + +TODO + +## Representation Choices + +The SimpleGrammar class might seem a little verbose compared to a dense +structure like: + + grammar_simple = [ + ('E', ['E', '+', 'T']), + ('E', ['T']), + ('T', ['(', 'E', ')']), + ('T', ['id']), + ] + +or + + grammar_simple = { + 'E': [ + ['E', '+', 'T'], + ['T'], + ], + 'T': [ + ['(', 'E', ')'], + ['id'], + ], + } + + +The advantage that the class has over a table like this is that you get to have +all of your Python tools help you make sure your grammar is good, if you want +them. e.g., if you're working with an LSP or something, the members give you +autocomplete and jump-to-definition and possibly even type-checking. + +At the very least, if you mis-type the name of a nonterminal, or forget to +implement it, we will immediately raise an error that *INCLUDES THE LOCATION IN +THE SOURCE WHERE THE ERROR WAS MADE.* With tables, we can tell you that you +made a mistake but it's up to you to figure out where you did it. + +### Aside: What about a custom DSL/EBNF like thing? + +Yeah, OK, there's a rich history of writing your grammar in a domain-specific +language. YACC did it, ANTLR does it, GRMTools.... just about everybody except +Tree-Sitter does this. + +But look, I've got several reasons for not doing it. + +First, I'm lazy, and don't want to write yet another parser for my parser. What +tools should I use to write my parser generator parser? I guess I don't have my +parser generator parser yet, so probably a hand-written top down parser? Some +other python parser generator? Ugh! 
+ +As an add-on to that, if I make my own format then I need to make tooling for +*that* too: syntax highlighters, jump to definition, the works. Yuck. An +existing language, and a format that builds on an existing language, gets me the +tooling that comes along with that language. If you can leverage that +effectively (and I think I have) then you start way ahead in terms of tooling. + +Second, this whole thing is supposed to be easy to include in an existing +project, and adding a custom compiler doesn't seem to be that. Adding two Python +files seems to be about the right speed. + +Third, and this is just hypothetical, it's probably pretty easy to write your +own tooling around a grammar if it's already in Python. If you want to make +railroad diagrams or EBNF pictures or whatever, all the productions are already +right there in data structures for you to process. I've tried to keep them +accessible and at least somewhat easy to work with. There's nothing that says a +DSL-based system *has* to produce unusable intermediate data- certainly there +are some tools that *try*- but with this approach the accessibility and the +ergonomics of the tool go hand in hand. + +## Some History + +The first version of this code was written as an idle exercise to learn how LR +parser table generation even worked. It was... very simple, fairly easy to +follow, and just *incredibly* slow. Like, mind-bogglingly slow. Unusably slow +for anything but the most trivial grammar. + +As a result, when I decided I wanted to use it for a larger grammar, I found that +I just couldn't. So this has been hacked and significantly improved from that +version, now capable of building tables for nontrivial grammars. It could still +be a lot faster, but it meets my needs for now. (BTW, the notes I read to learn how all this works are at http://dragonbook.stanford.edu/lecture-notes/Stanford-CS143/. Specifically, @@ -20,7 +128,5 @@ I started with handout 8, 'Bottom-up-parsing', and went from there.
(I did eventually have to backtrack a little into handout 7, since that's where First() and Follow() are covered.) -Enjoy! - doty -2016-12-09 +May 2024 diff --git a/grammar.py b/grammar.py index bdefa43..c37405f 100644 --- a/grammar.py +++ b/grammar.py @@ -1,90 +1,5 @@ -import parser_faster -import sys -import typing - -from parser_faster import Assoc - -class Token: - value: str - - def __init__(self, value): - self.value = sys.intern(value) - -Symbol = Token | str - -def desugar( - grammar: dict[str, list[list[Symbol]]], - precedence: list[typing.Tuple[Assoc, list[Symbol]]], -): - nonterminal_refs = set() - nonterminals = set() - terminals = set() - - result: list[typing.Tuple[str, list[str]]] = [] - for (k, v) in grammar.items(): - nonterminals.add(k) - - for rule in v: - assert isinstance(rule, list) - result_rule: list[str] = [] - for symbol in rule: - if isinstance(symbol, Token): - result_rule.append(symbol.value) - terminals.add(symbol.value) - else: - result_rule.append(symbol) - nonterminal_refs.add(symbol) - - result.append((k, result_rule)) - - unknown_rules = nonterminal_refs - nonterminals - if len(unknown_rules) > 0: - undefined = "\n ".join(unknown_rules) - raise Exception(f"The following rules are not defined:\n {undefined}") - - overlap_rules = nonterminals & terminals - if len(overlap_rules) > 0: - overlap = "\n ".join(overlap_rules) - raise Exception(f"The following symbols are both tokens and rules:\n {overlap}") - - result_precedence = { - (symbol.value if isinstance(symbol, Token) else symbol):(associativity, precedence + 1) - for precedence, (associativity, symbols) in enumerate(precedence) - for symbol in symbols - } - - return result, result_precedence - -def dump_yacc(grammar): - tokens = set() - for rules in grammar.values(): - for rule in rules: - for symbol in rule: - if symbol.startswith("token:"): - symbol = symbol[6:].upper() - tokens.add(symbol) - for token in sorted(tokens): - print(f"%token {token}") - - print() - print("%%") - - 
for name, rules in grammar.items(): - print(f"{name} : ", end=''); - for i,rule in enumerate(rules): - if i != 0: - print(f"{' ' * len(name)} | ", end='') - - parts = [] - for symbol in rule: - if symbol.startswith("token:"): - symbol = symbol[6:].upper() - parts.append(symbol) - print(' '.join(parts)) - print() - - print("%%") - +# This is an example grammar. +from parser import Assoc, Grammar, Nothing, Token, rule, seq ARROW = Token("Arrow") AS = Token("As") @@ -136,290 +51,339 @@ LSQUARE = Token("LeftBracket") RSQUARE = Token("RightBracket") -# fmt: off -precedence = [ - (Assoc.RIGHT, [EQUAL]), - (Assoc.LEFT, [OR]), - (Assoc.LEFT, [IS]), - (Assoc.LEFT, [AND]), - (Assoc.LEFT, [EQUALEQUAL, BANGEQUAL]), - (Assoc.LEFT, [LESS, GREATER, GREATEREQUAL, LESSEQUAL]), - (Assoc.LEFT, [PLUS, MINUS]), - (Assoc.LEFT, [STAR, SLASH]), - (Assoc.LEFT, ["PrimaryExpression"]), - (Assoc.LEFT, [LPAREN]), - (Assoc.LEFT, [DOT]), +class FineGrammar(Grammar): + def __init__(self): + super().__init__( + precedence=[ + (Assoc.RIGHT, [EQUAL]), + (Assoc.LEFT, [OR]), + (Assoc.LEFT, [IS]), + (Assoc.LEFT, [AND]), + (Assoc.LEFT, [EQUALEQUAL, BANGEQUAL]), + (Assoc.LEFT, [LESS, GREATER, GREATEREQUAL, LESSEQUAL]), + (Assoc.LEFT, [PLUS, MINUS]), + (Assoc.LEFT, [STAR, SLASH]), + (Assoc.LEFT, [self.primary_expression]), + (Assoc.LEFT, [LPAREN]), + (Assoc.LEFT, [DOT]), + # + # If there's a confusion about whether to make an IF + # statement or an expression, prefer the statement. + # + (Assoc.NONE, [self.if_statement]), + ] + ) - # If there's a confusion about whether to make an IF statement or an - # expression, prefer the statement. 
- (Assoc.NONE, ["IfStatement"]), -] + @rule + def file(self): + return self.file_statement_list -grammar = { - "File": [ - ["FileStatementList"], - ], - "FileStatementList": [ - ["FileStatement"], - ["FileStatement", "FileStatementList"], - ], - "FileStatement": [ - ["ImportStatement"], - ["ClassDeclaration"], - ["ExportStatement"], - ["Statement"], - ], + @rule + def file_statement_list(self): + return self.file_statement | (self.file_statement_list + self.file_statement) - "ImportStatement": [ - [IMPORT, STRING, AS, IDENTIFIER, SEMICOLON], - ], + @rule + def file_statement(self): + return ( + self.import_statement | self.class_declaration | self.export_statement | self.statement + ) - # Classes - "ClassDeclaration": [ - [CLASS, IDENTIFIER, "ClassBody"], - ], - "ClassBody": [ - [LCURLY, RCURLY], - [LCURLY, "ClassMembers", RCURLY], - ], - "ClassMembers": [ - ["ClassMember"], - ["ClassMembers", "ClassMember"], - ], - "ClassMember": [ - ["FieldDeclaration"], - ["FunctionDeclaration"], - ], - "FieldDeclaration": [ - [IDENTIFIER, COLON, "TypeExpression", SEMICOLON], - ], + @rule + def import_statement(self): + return seq(IMPORT, STRING, AS, IDENTIFIER, SEMICOLON) + + @rule + def class_declaration(self): + return seq(CLASS, IDENTIFIER, self.class_body) + + @rule + def class_body(self): + return seq(LCURLY, RCURLY) | seq(LCURLY, self.class_members, RCURLY) + + @rule + def class_members(self): + return self.class_member | seq(self.class_members, self.class_member) + + @rule + def class_member(self): + return self.field_declaration | self.function_declaration + + @rule + def field_declaration(self): + return seq(IDENTIFIER, COLON, self.type_expression, SEMICOLON) # Types - "TypeExpression": [ - ["AlternateType"], - ["TypeIdentifier"], - ], - "AlternateType": [ - ["TypeExpression", BAR, "TypeIdentifier"], - ], - "TypeIdentifier": [ - [IDENTIFIER], - ], + @rule + def type_expression(self): + return self.alternate_type | self.type_identifier - "ExportStatement": [ - [EXPORT, 
"ClassDeclaration"], - [EXPORT, "FunctionDeclaration"], - [EXPORT, "LetStatement"], - [EXPORT, "ExportList", SEMICOLON], - ], - "ExportList": [ - [], - [IDENTIFIER], - [IDENTIFIER, COMMA, "ExportList"], - ], + @rule + def alternate_type(self): + return seq(self.type_expression, BAR, self.type_identifier) + + @rule + def type_identifier(self): + return IDENTIFIER + + @rule + def export_statement(self): + return ( + seq(EXPORT, self.class_declaration) + | seq(EXPORT, self.function_declaration) + | seq(EXPORT, self.let_statement) + | seq(EXPORT, self.export_list, SEMICOLON) + ) + + @rule + def export_list(self): + return Nothing | IDENTIFIER | seq(IDENTIFIER, COMMA, self.export_list) # Functions - "FunctionDeclaration": [ - [FUN, IDENTIFIER, "FunctionParameters", "Block"], - [FUN, IDENTIFIER, "FunctionParameters", ARROW, "TypeExpression", "Block"], - ], - "FunctionParameters": [ - [LPAREN, RPAREN], - [LPAREN, "FirstParameter", RPAREN], - [LPAREN, "FirstParameter", COMMA, "ParameterList", RPAREN], - ], - "FirstParameter": [ - [SELF], - ["Parameter"], - ], - "ParameterList": [ - [], - ["Parameter"], - ["Parameter", COMMA, "ParameterList"], - ], - "Parameter": [ - [IDENTIFIER, COLON, "TypeExpression"], - ], + @rule + def function_declaration(self): + return seq(FUN, IDENTIFIER, self.function_parameters, self.block) | seq( + FUN, IDENTIFIER, self.function_parameters, ARROW, self.type_expression, self.block + ) + + @rule + def function_parameters(self): + return ( + seq(LPAREN, RPAREN) + | seq(LPAREN, self.first_parameter, RPAREN) + | seq(LPAREN, self.first_parameter, COMMA, self.parameter_list, RPAREN) + ) + + @rule + def first_parameter(self): + return SELF | self.parameter + + @rule + def parameter_list(self): + return Nothing | self.parameter | seq(self.parameter, COMMA, self.parameter_list) + + @rule + def parameter(self): + return seq(IDENTIFIER, COLON, self.type_expression) # Block - "Block": [ - [LCURLY, RCURLY], - [LCURLY, "StatementList", RCURLY], - [LCURLY, 
"StatementList", "Expression", RCURLY], - ], - "StatementList": [ - ["Statement"], - ["StatementList", "Statement"], - ], + @rule + def block(self): + return ( + seq(LCURLY, RCURLY) + | seq(LCURLY, self.statement_list, RCURLY) + | seq(LCURLY, self.statement_list, self.expression, RCURLY) + ) - "Statement": [ - ["FunctionDeclaration"], - ["LetStatement"], - ["ReturnStatement"], - ["ForStatement"], - ["IfStatement"], - ["WhileStatement"], - ["ExpressionStatement"], - ], + @rule + def statement_list(self): + return self.statement | seq(self.statement_list, self.statement) - "LetStatement": [ - [LET, IDENTIFIER, EQUAL, "Expression", SEMICOLON], - ], + @rule + def statement(self): + return ( + self.function_declaration + | self.let_statement + | self.return_statement + | self.for_statement + | self.if_statement + | self.while_statement + | self.expression_statement + ) - "ReturnStatement": [ - [RETURN, "Expression", SEMICOLON], - ], + @rule + def let_statement(self): + return seq(LET, IDENTIFIER, EQUAL, self.expression, SEMICOLON) - "ForStatement": [ - [FOR, "IteratorVariable", IN, "Expression", "Block"], - ], - "IteratorVariable": [[IDENTIFIER]], + @rule + def return_statement(self): + return seq(RETURN, self.expression, SEMICOLON) - "IfStatement": [["ConditionalExpression"]], + @rule + def for_statement(self): + return seq(FOR, self.iterator_variable, IN, self.expression, self.block) - "WhileStatement": [ - [WHILE, "Expression", "Block"], - ], + @rule + def iterator_variable(self): + return IDENTIFIER - "ExpressionStatement": [ - ["Expression", SEMICOLON], - ], + @rule + def if_statement(self): + return self.conditional_expression + + @rule + def while_statement(self): + return seq(WHILE, self.expression, self.block) + + @rule + def expression_statement(self): + return seq(self.expression, SEMICOLON) # Expressions - "Expression": [["AssignmentExpression"]], + @rule + def expression(self): + return self.assignment_expression - "AssignmentExpression": [ - 
["OrExpression", EQUAL, "AssignmentExpression"], - ["OrExpression"], - ], - "OrExpression": [ - ["OrExpression", OR, "IsExpression"], - ["IsExpression"], - ], - "IsExpression": [ - ["IsExpression", IS, "Pattern"], - ["AndExpression"], - ], - "AndExpression": [ - ["AndExpression", AND, "EqualityExpression"], - ["EqualityExpression"], - ], - "EqualityExpression": [ - ["EqualityExpression", EQUALEQUAL, "RelationExpression"], - ["EqualityExpression", BANGEQUAL, "RelationExpression"], - ["RelationExpression"], - ], - "RelationExpression": [ - ["RelationExpression", LESS, "AdditiveExpression"], - ["RelationExpression", LESSEQUAL, "AdditiveExpression"], - ["RelationExpression", GREATER, "AdditiveExpression"], - ["RelationExpression", GREATEREQUAL, "AdditiveExpression"], - ["AdditiveExpression"], - ], - "AdditiveExpression": [ - ["AdditiveExpression", PLUS, "MultiplicationExpression"], - ["AdditiveExpression", MINUS, "MultiplicationExpression"], - ["MultiplicationExpression"], - ], - "MultiplicationExpression": [ - ["MultiplicationExpression", STAR, "PrimaryExpression"], - ["MultiplicationExpression", SLASH, "PrimaryExpression"], - ["PrimaryExpression"], - ], - "PrimaryExpression": [ - [IDENTIFIER], - [SELF], - [NUMBER], - [STRING], - [TRUE], - [FALSE], - [BANG, "PrimaryExpression"], - [MINUS, "PrimaryExpression"], + @rule + def assignment_expression(self): + return seq(self.or_expression, EQUAL, self.assignment_expression) | self.or_expression - ["Block"], - ["ConditionalExpression"], - ["ListConstructorExpression"], - ["ObjectConstructorExpression"], - ["MatchExpression"], + @rule + def or_expression(self): + return seq(self.or_expression, OR, self.is_expression) | self.is_expression - ["PrimaryExpression", LPAREN, "ExpressionList", RPAREN], - ["PrimaryExpression", DOT, IDENTIFIER], + @rule + def is_expression(self): + return seq(self.is_expression, IS, self.pattern) | self.and_expression - [LPAREN, "Expression", RPAREN], - ], + @rule + def and_expression(self): + return 
seq(self.and_expression, AND, self.equality_expression) | self.equality_expression - "ConditionalExpression": [ - [IF, "Expression", "Block"], - [IF, "Expression", "Block", ELSE, "ConditionalExpression"], - [IF, "Expression", "Block", ELSE, "Block"], - ], + @rule + def equality_expression(self): + return ( + seq(self.equality_expression, EQUALEQUAL, self.relation_expression) + | seq(self.equality_expression, BANGEQUAL, self.relation_expression) + | self.relation_expression + ) - "ListConstructorExpression": [ - [LSQUARE, RSQUARE], - [LSQUARE, "ExpressionList", RSQUARE], - ], + @rule + def relation_expression(self): + return ( + seq(self.relation_expression, LESS, self.additive_expression) + | seq(self.relation_expression, LESSEQUAL, self.additive_expression) + | seq(self.relation_expression, GREATER, self.additive_expression) + | seq(self.relation_expression, GREATEREQUAL, self.additive_expression) + ) - "ExpressionList": [ - ["Expression"], - ["Expression", COMMA], - ["Expression", COMMA, "ExpressionList"], - ], + @rule + def additive_expression(self): + return ( + seq(self.additive_expression, PLUS, self.multiplication_expression) + | seq(self.additive_expression, MINUS, self.multiplication_expression) + | self.multiplication_expression + ) - # Match Expression - "MatchExpression": [ - [MATCH, "MatchBody"], - ], - "MatchBody": [ - [LCURLY, RCURLY], - [LCURLY, "MatchArms", RCURLY], - ], - "MatchArms": [ - ["MatchArm"], - ["MatchArm", COMMA], - ["MatchArm", COMMA, "MatchArms"], - ], - "MatchArm": [ - ["Pattern", ARROW, "Expression"], - ], + @rule + def multiplication_expression(self): + return ( + seq(self.multiplication_expression, STAR, self.primary_expression) + | seq(self.multiplication_expression, SLASH, self.primary_expression) + | self.primary_expression + ) - # Pattern - "Pattern": [ - ["VariableBinding", "PatternCore", AND, "AndExpression"], - ["VariableBinding", "PatternCore"], - ["PatternCore", AND, "AndExpression"], - ["PatternCore"], - ], - 
"PatternCore": [ - ["TypeExpression"], - ["WildcardPattern"], - ], - "WildcardPattern": [[UNDERSCORE]], - "VariableBinding": [[IDENTIFIER, COLON]], + @rule + def primary_expression(self): + return ( + IDENTIFIER + | SELF + | NUMBER + | STRING + | TRUE + | FALSE + | seq(BANG, self.primary_expression) + | seq(MINUS, self.primary_expression) + | self.block + | self.conditional_expression + | self.list_constructor_expression + | self.object_constructor_expression + | self.match_expression + | seq(self.primary_expression, LPAREN, self.expression_list, RPAREN) + | seq(self.primary_expression, DOT, IDENTIFIER) + | seq(LPAREN, self.expression, RPAREN) + ) - # Object Constructor - "ObjectConstructorExpression": [ - [NEW, "TypeIdentifier", "FieldList"], - ], - "FieldList": [ - [LCURLY, RCURLY], - [LCURLY, "FieldValues", RCURLY], - ], - "FieldValues": [ - ["FieldValue"], - ["FieldValue", COMMA], - ["FieldValue", COMMA, "FieldValues"], - ], - "FieldValue": [ - [IDENTIFIER], - [IDENTIFIER, COLON, "Expression"], - ], -} -# fmt: on + @rule + def conditional_expression(self): + return ( + seq(IF, self.expression, self.block) + | seq(IF, self.expression, self.block, ELSE, self.conditional_expression) + | seq(IF, self.expression, self.block, ELSE, self.block) + ) + + @rule + def list_constructor_expression(self): + return seq(LSQUARE, RSQUARE) | seq(LSQUARE, self.expression_list, RSQUARE) + + @rule + def expression_list(self): + return ( + self.expression + | seq(self.expression, COMMA) + | seq(self.expression, COMMA, self.expression_list) + ) + + @rule + def match_expression(self): + return seq(MATCH, self.match_body) + + @rule + def match_body(self): + return seq(LCURLY, RCURLY) | seq(LCURLY, self.match_arms, RCURLY) + + @rule + def match_arms(self): + return ( + self.match_arm + | seq(self.match_arm, COMMA) + | seq(self.match_arm, COMMA, self.match_arms) + ) + + @rule + def match_arm(self): + return seq(self.pattern, ARROW, self.expression) + + @rule + def pattern(self): + return 
( + seq(self.variable_binding, self.pattern_core, AND, self.and_expression) + | seq(self.variable_binding, self.pattern_core) + | seq(self.pattern_core, AND, self.and_expression) + | self.pattern_core + ) + + @rule + def pattern_core(self): + return self.type_expression | self.wildcard_pattern + + @rule + def wildcard_pattern(self): + return UNDERSCORE + + @rule + def variable_binding(self): + return seq(IDENTIFIER, COLON) + + @rule + def object_constructor_expression(self): + return seq(NEW, self.type_identifier, self.field_list) + + @rule + def field_list(self): + return seq(LCURLY, RCURLY) | seq(LCURLY, self.field_values, RCURLY) + + @rule + def field_values(self): + return ( + self.field_value + | seq(self.field_value, COMMA) + | seq(self.field_value, COMMA, self.field_values) + ) + + @rule + def field_value(self): + return IDENTIFIER | seq(IDENTIFIER, COLON, self.expression) + + +grammar = FineGrammar() +table = grammar.build_table(start="file") + +print(f"{len(table)} states") + +average_entries = sum(len(row) for row in table) / len(table) +max_entries = max(len(row) for row in table) +print(f"{average_entries} average, {max_entries} max") -# dump_yacc(grammar) -grammar, precedence = desugar(grammar, precedence) -gen = parser_faster.GenerateLR1("File", grammar, precedence=precedence) -gen.gen_table() # print(parser_faster.format_table(gen, table)) # print() # tree = parse(table, ["id", "+", "(", "id", "[", "id", "]", ")"]) diff --git a/historical/parser.py b/historical/parser.py new file mode 100644 index 0000000..17101bd --- /dev/null +++ b/historical/parser.py @@ -0,0 +1,853 @@ +"""A collection of LR parser generators, from LR0 through LALR. + +One day I read a tweet, asking for a tool which accepted a grammar and an +input file and which then produced simple parsed output, without any kind of +in-between. (There was other ranty stuff about how none of the existing tools +really worked, but that was beside the point.) 
+ +Upon reading the tweet, it occurred to me that I didn't know how LR parsers +worked and how they were generated, except in the broadest of terms. Thus, I +set about writing this, learning as I went. + +This code is not written to be fast, or even efficient, although it runs its +test cases fast enough. It was instead written to be easy to follow along +with, so that when I forget how all this works I can come back to the code +and read along and learn all over again. + +(BTW, the notes I read to learn how all this works are at +http://dragonbook.stanford.edu/lecture-notes/Stanford-CS143/. Specifically, +I started with handout 8, 'Bottom-up-parsing', and went from there. (I did +eventually have to backtrack a little into handout 7, since that's where +First() and Follow() are covered.)) + +Enjoy! + +doty +2016-12-09 +""" + +from collections import namedtuple + + +############################################################################### +# LR0 +# +# We start with LR0 parsers, because they form the basis of everything else. +############################################################################### +class Configuration(namedtuple("Configuration", ["name", "symbols", "position", "lookahead"])): + """A rule being tracked in a state. + + (Note: technically, lookahead isn't used until we get to LR(1) parsers, + but if left at its default it's harmless. Ignore it until you get to + the part about LR(1).)
+ """ + + __slots__ = () + + @classmethod + def from_rule(cls, rule, lookahead=()): + return Configuration( + name=rule[0], + symbols=rule[1], + position=0, + lookahead=lookahead, + ) + + @property + def at_end(self): + return self.position == len(self.symbols) + + @property + def next(self): + return self.symbols[self.position] if not self.at_end else None + + @property + def rest(self): + return self.symbols[(self.position + 1) :] + + def at_symbol(self, symbol): + return self.next == symbol + + def replace(self, **kwargs): + return self._replace(**kwargs) + + def __str__(self): + la = ", " + str(self.lookahead) if self.lookahead != () else "" + return "{name} -> {bits}{lookahead}".format( + name=self.name, + bits=" ".join( + ["* " + sym if i == self.position else sym for i, sym in enumerate(self.symbols)] + ) + + (" *" if self.at_end else ""), + lookahead=la, + ) + + +class GenerateLR0(object): + """Generate parser tables for an LR0 parser. + + The input grammars are of the form: + + grammar_simple = [ + ('E', ['E', '+', 'T']), + ('E', ['T']), + ('T', ['(', 'E', ')']), + ('T', ['id']), + ] + + Which is to say, they are a list of productions. Each production is a + tuple where the first element of the tuple is the name of the + non-terminal being added, and the second elment of the tuple is the + list of terminals and non-terminals that make up the production. + + There is currently no support for custom actions or alternation or + anything like that. If you want alternations that you'll have to lower + the grammar by hand into the simpler form first. + + Don't name anything with double-underscores; those are reserved for + the generator. Don't add '$' either, as it is reserved to mean + end-of-stream. Use an empty list to indicate nullability, that is: + + ('O', []), + + means that O can be matched with nothing. + + Implementation notes: + - This is implemented in the dumbest way possible, in order to be the + most understandable it can be. 
I built this to learn, and I want to + make sure I can keep learning with it. + + - We tend to use tuples everywhere. This is because tuples can be + compared for equality and put into tables and all that jazz. They might + be a little bit slower in places but like I said, this is for + learning. (Also, if we need this to run faster we can probably go a + long way by memoizing results, which is much easier if we have tuples + everywhere.) + """ + + def __init__(self, start, grammar): + """Initialize the parser generator with the specified grammar and + start symbol. + """ + # We always store the "augmented" grammar, which contains an initial + # production for the start state. grammar[0] is always the start + # rule, and in the set of states and table and whatever the first + # element is always the starting state/position. + self.grammar = [("__start", [start])] + grammar + self.nonterminals = {rule[0] for rule in grammar} + self.terminals = { + sym for name, symbols in grammar for sym in symbols if sym not in self.nonterminals + } + self.alphabet = self.terminals | self.nonterminals + + # Check to make sure they didn't use anything that will give us + # heartburn later. + reserved = [a for a in self.alphabet if a.startswith("__") or a == "$"] + if reserved: + raise ValueError( + "Can't use {symbols} in grammars, {what} reserved.".format( + symbols=" or ".join(reserved), + what="it's" if len(reserved) == 1 else "they're", + ) + ) + + self.terminals.add("$") + self.alphabet.add("$") + + def gen_closure_next(self, config): + """Return the next set of configurations in the closure for + config. + + If the position for config is just before a non-terminal, then the + next set of configurations is configurations for all of the + productions for that non-terminal, with the position at the + beginning. (If the position for config is just before a terminal, + or at the end of the production, then the next set is empty.) 
+ """ + if config.at_end: + return () + else: + return tuple( + Configuration.from_rule(rule) for rule in self.grammar if rule[0] == config.next + ) + + def gen_closure(self, config, closure): + """Compute the closure for the specified config and unify it with the + existing closure. + + If the provided config is already in the closure then nothing is + done. (We assume that the closure of the config is *also* already in + the closure.) + """ + if config in closure: + return closure + else: + new_closure = tuple(closure) + (config,) + for next_config in self.gen_closure_next(config): + new_closure = self.gen_closure(next_config, new_closure) + return new_closure + + def gen_successor(self, config_set, symbol): + """Compute the successor state for the given config set and the + given symbol. + + The successor represents the next state of the parser after seeing + the symbol. + """ + seeds = [ + config.replace(position=config.position + 1) + for config in config_set + if config.at_symbol(symbol) + ] + + closure = () + for seed in seeds: + closure = self.gen_closure(seed, closure) + + return closure + + def gen_all_successors(self, config_set): + """Return all of the non-empty successors for the given config set.""" + next = [] + for symbol in self.alphabet: + successor = self.gen_successor(config_set, symbol) + if len(successor) > 0: + next.append(successor) + + return tuple(next) + + def gen_sets(self, config_set, F): + """Recursively generate all configuration sets starting from the + provided set, and merge them with the provided set 'F'. 
+ """ + if config_set in F: + return F + else: + new_F = F + (config_set,) + for successor in self.gen_all_successors(config_set): + new_F = self.gen_sets(successor, new_F) + + return new_F + + def gen_all_sets(self): + """Generate all of the configuration sets for the grammar.""" + initial_set = self.gen_closure( + Configuration.from_rule(self.grammar[0]), + (), + ) + return self.gen_sets(initial_set, ()) + + def find_set_index(self, sets, set): + """Find the specified set in the set of sets, and return the + index, or None if it is not found. + """ + for i, s in enumerate(sets): + if s == set: + return i + return None + + def gen_reduce_set(self, config): + """Return the set of symbols that indicate we should reduce the given + configuration. + + In an LR0 parser, this is just the set of all terminals.""" + return self.terminals + + def gen_table(self): + """Generate the parse table. + + The parse table is a list of states. The first state in the list is + the starting state. Each state is a dictionary that maps a symbol to an + action. Each action is a tuple. The first element of the tuple is a + string describing what to do: + + - 'shift': The second element of the tuple is the state + number. Consume the input and push that state onto the stack. + + - 'reduce': The second element is the name of the non-terminal being + reduced, and the third element is the number of states to remove + from the stack. Don't consume the input; just remove the specified + number of things from the stack, and then consult the table again, + this time using the new top-of-stack as the current state and the + name of the non-terminal to find out what to do. + + - 'goto': The second element is the state number to push onto the + stack. In the literature, these entries are treated distinctly from + the actions, but we mix them here because they never overlap with the + other actions. 
(These are always associated with non-terminals, and + the other actions are always associated with terminals.) + + - 'accept': Accept the result of the parse, it worked. + + Anything missing from the row indicates an error. + """ + action_table = [] + config_sets = self.gen_all_sets() + for config_set in config_sets: + actions = {} + + # Actions + for config in config_set: + if config.at_end: + if config.name != "__start": + for a in self.gen_reduce_set(config): + self.set_table_action( + actions, + a, + ("reduce", config.name, len(config.symbols)), + config, + ) + else: + self.set_table_action( + actions, + "$", + ("accept",), + config, + ) + + else: + if config.next in self.terminals: + successor = self.gen_successor(config_set, config.next) + index = self.find_set_index(config_sets, successor) + self.set_table_action( + actions, + config.next, + ("shift", index), + config, + ) + + # Gotos + for symbol in self.nonterminals: + successor = self.gen_successor(config_set, symbol) + index = self.find_set_index(config_sets, successor) + if index is not None: + self.set_table_action( + actions, + symbol, + ("goto", index), + None, + ) + + # set_table_action stores the configs that generated the actions in + # the table, for diagnostic purposes. This filters them out again + # so that the parser has something clean to work with. + actions = {k: self.get_table_action(actions, k) for k in actions} + action_table.append(actions) + + return action_table + + def set_table_action(self, row, symbol, action, config): + """Set the action for 'symbol' in the table row to 'action'. + + This is destructive; it changes the table. It raises an error if + there is already an action for the symbol in the row. 
+ """ + existing, existing_config = row.get(symbol, (None, None)) + if existing is not None and existing != action: + config_old = str(existing_config) + config_new = str(config) + max_len = max(len(config_old), len(config_new)) + 1 + error = ( + "Conflicting actions for token '{symbol}':\n" + " {config_old: <{max_len}}: {old}\n" + " {config_new: <{max_len}}: {new}\n".format( + config_old=config_old, + config_new=config_new, + max_len=max_len, + old=existing, + new=action, + symbol=symbol, + ) + ) + raise ValueError(error) + row[symbol] = (action, config) + + def get_table_action(self, row, symbol): + return row[symbol][0] + + +def parse(table, input, trace=False): + """Parse the input with the generated parsing table and return the + concrete syntax tree. + + The parsing table can be generated by GenerateLR0.gen_table() or by any + of the other generators below. The parsing mechanism never changes, only + the table generation mechanism. + + input is a list of tokens. Don't stick an end-of-stream marker, I'll stick + one on for you. + """ + assert "$" not in input + input = input + ["$"] + input_index = 0 + + # Our stack is a stack of tuples, where the first entry is the state number + # and the second entry is the 'value' that was generated when the state was + # pushed. + stack = [(0, None)] + while True: + current_state = stack[-1][0] + current_token = input[input_index] + + action = table[current_state].get(current_token, ("error",)) + if trace: + print( + "{stack: <20} {input: <50} {action: <5}".format( + stack=repr([s[0] for s in stack]), + input=repr(input[input_index:]), + action=repr(action), + ) + ) + + if action[0] == "accept": + return stack[-1][1] + + elif action[0] == "reduce": + name = action[1] + size = action[2] + + value = (name, tuple(s[1] for s in stack[-size:])) + stack = stack[:-size] + + goto = table[stack[-1][0]].get(name, ("error",)) + assert goto[0] == "goto" # Corrupt table? 
+ stack.append((goto[1], value)) + + elif action[0] == "shift": + stack.append((action[1], (current_token, ()))) + input_index += 1 + + elif action[0] == "error": + raise ValueError( + "Syntax error: unexpected symbol {sym}".format( + sym=current_token, + ), + ) + + +############################################################################### +# SLR(1) +############################################################################### +class GenerateSLR1(GenerateLR0): + """Generate parse tables for SLR1 grammars. + + SLR1 parsers can recognize more than LR0 parsers, because they have a + little bit more information: instead of generating reduce actions for a + production on all possible inputs, as LR0 parsers do, they generate + reduce actions only for inputs that are in the 'follow' set of the + non-terminal. + + That means SLR1 parsers need to know how to generate 'follow(A)', which + means they need to know how to generate 'first(A)', which is most of the + code in this class. + """ + + def gen_first_symbol(self, symbol, visited): + """Compute the first set for a single symbol. + + If a symbol can be empty, then the set contains epsilon, which we + represent as python's `None`. + + The first set is the set of tokens that can appear as the first token + for a given symbol. (Obviously, if the symbol is itself a token, then + this is trivial.) + + 'visited' is a set of already visited symbols, to stop infinite + recursion on left-recursive grammars. That means that sometimes this + function can return an empty tuple. Don't confuse that with a tuple + containing epsilon: that's a tuple containing `None`, not an empty + tuple. + """ + if symbol in self.terminals: + return (symbol,) + elif symbol in visited: + return () + else: + assert symbol in self.nonterminals + visited.add(symbol) + + # All the firsts from all the productions. 
+ firsts = [ + self.gen_first(rule[1], visited) for rule in self.grammar if rule[0] == symbol + ] + + result = () + for fs in firsts: + result = result + tuple(f for f in fs if f not in result) + + return tuple(sorted(result)) + + def gen_first(self, symbols, visited=None): + """Compute the first set for a sequence of symbols. + + The first set is the set of tokens that can appear as the first token + for this sequence of symbols. The interesting wrinkle in computing the + first set for a sequence of symbols is that we keep computing the first + sets so long as epsilon appears in the set. i.e., if we are computing + for ['A', 'B', 'C'] and the first set of 'A' contains epsilon, then the + first set for the *sequence* also contains the first set of ['B', 'C'], + since 'A' could be missing entirely. + + An epsilon in the result is indicated by 'None'. There will always be + at least one element in the result. + + The 'visited' parameter, if not None, is a set of symbols that are + already in the process of being evaluated, to deal with left-recursive + grammars. (See gen_first_symbol for more.) + """ + if len(symbols) == 0: + return (None,) # Epsilon. + else: + if visited is None: + visited = set() + result = self.gen_first_symbol(symbols[0], visited) + if None in result: + result = tuple(s for s in result if s is not None) + result = result + self.gen_first(symbols[1:], visited) + result = tuple(sorted(set(result))) + return result + + def gen_follow(self, symbol, visited=None): + """Generate the follow set for the given nonterminal. + + The follow set for a nonterminal is the set of terminals that can + follow the nonterminal in a valid sentence. The resulting set never + contains epsilon and is never empty, since we should always at least + ground out at '$', which is the end-of-stream marker. + """ + if symbol == "__start": + return tuple("$") + + assert symbol in self.nonterminals + + # Deal with left-recursion. 
+ if visited is None: + visited = set() + if symbol in visited: + return () + visited.add(symbol) + + follow = () + for production in self.grammar: + for index, prod_symbol in enumerate(production[1]): + if prod_symbol != symbol: + continue + + first = self.gen_first(production[1][index + 1 :]) + follow = follow + tuple(f for f in first if f is not None) + if None in first: + follow = follow + self.gen_follow(production[0], visited) + + assert None not in follow # Should always ground out at __start + return follow + + def gen_reduce_set(self, config): + """Return the set of symbols that indicate we should reduce the given + config. + + In an SLR1 parser, this is the follow set of the config nonterminal.""" + return self.gen_follow(config.name) + + +class GenerateLR1(GenerateSLR1): + """Generate parse tables for LR1, or "canonical LR" grammars. + + LR1 parsers can recognize more than SLR parsers. Like SLR parsers, they + are choosier about when they reduce. But unlike SLR parsers, they specify + the terminals on which they reduce by carrying a 'lookahead' terminal in + the configuration. The lookahead of a configuration is computed as the + closure of a configuration set is computed, so see gen_closure_next for + details. (Except for the start configuration, which has '$' as its + lookahead.) + """ + + def gen_reduce_set(self, config): + """Return the set of symbols that indicate we should reduce the given + config. + + In an LR1 parser, this is the lookahead of the configuration.""" + return config.lookahead + + def gen_closure_next(self, config): + """Return the next set of configurations in the closure for + config. + + In LR1 parsers, we must compute the lookahead for the configurations + we're adding to the closure. The lookahead for the new configurations + is the first() of the rest of this config's production. If that + contains epsilon, then the lookahead *also* contains the lookahead we + already have. 
(This lookahead was presumably generated by the same + process, so in some sense it is a 'parent' lookahead, or a lookahead + from an upstream production in the grammar.) + + (See the documentation in GenerateLR0 for more information on how + this function fits into the whole process.) + """ + if config.at_end: + return () + else: + next = [] + for rule in self.grammar: + if rule[0] != config.next: + continue + + # N.B.: We can't just append config.lookahead to config.rest + # and compute first(), because lookahead is a *set*. So + # in this case we just say if 'first' contains epsilon, + # then we need to remove the epsilon and union with the + # existing lookahead. + lookahead = self.gen_first(config.rest) + if None in lookahead: + lookahead = tuple(l for l in lookahead if l is not None) + lookahead = lookahead + config.lookahead + lookahead = tuple(sorted(set(lookahead))) + next.append(Configuration.from_rule(rule, lookahead=lookahead)) + + return tuple(next) + + def gen_all_sets(self): + """Generate all of the configuration sets for the grammar. + + In LR1 parsers, we must remember to set the lookahead of the start + symbol to '$'. + """ + initial_set = self.gen_closure( + Configuration.from_rule(self.grammar[0], lookahead=("$",)), + (), + ) + return self.gen_sets(initial_set, ()) + + +class GenerateLALR(GenerateLR1): + """Generate tables for LALR. + + LALR is smaller than LR(1) but bigger than SLR(1). It works by generating + the LR(1) configuration sets, but merging configuration sets which are + equal in everything but their lookaheads. This works in that it doesn't + generate any shift/reduce conflicts that weren't already in the LR(1) + grammar. It can, however, introduce new reduce/reduce conflicts, because + it does lose information. The advantage is that the number of parser + states is much much smaller in LALR than in LR(1). + + (Note that because we use immutable state everywhere this generator does + a lot of copying and allocation.) 
+ """ + + def merge_sets(self, config_set_a, config_set_b): + """Merge the two config sets, by keeping the item cores but merging + the lookahead sets for each item. + """ + assert len(config_set_a) == len(config_set_b) + merged = [] + for index, a in enumerate(config_set_a): + b = config_set_b[index] + assert a.replace(lookahead=()) == b.replace(lookahead=()) + + new_lookahead = a.lookahead + b.lookahead + new_lookahead = tuple(sorted(set(new_lookahead))) + merged.append(a.replace(lookahead=new_lookahead)) + + return tuple(merged) + + def sets_equal(self, a, b): + a_no_la = tuple(s.replace(lookahead=()) for s in a) + b_no_la = tuple(s.replace(lookahead=()) for s in b) + return a_no_la == b_no_la + + def gen_sets(self, config_set, F): + """Recursively generate all configuration sets starting from the + provided set, and merge them with the provided set 'F'. + + The difference between this method and the one in GenerateLR0, where + this comes from, is in the part that stops recursion. In LALR we + compare for set equality *ignoring lookahead*. If we find a match, + then instead of returning F unchanged, we merge the two equal sets + and replace the set in F, returning the modified set. + """ + config_set_no_la = tuple(s.replace(lookahead=()) for s in config_set) + for index, existing in enumerate(F): + existing_no_la = tuple(s.replace(lookahead=()) for s in existing) + if config_set_no_la == existing_no_la: + merged_set = self.merge_sets(config_set, existing) + return F[:index] + (merged_set,) + F[index + 1 :] + + # No merge candidate found, proceed. + new_F = F + (config_set,) + for successor in self.gen_all_successors(config_set): + new_F = self.gen_sets(successor, new_F) + + return new_F + + def find_set_index(self, sets, set): + """Find the specified set in the set of sets, and return the + index, or None if it is not found. 
+ """ + for i, s in enumerate(sets): + if self.sets_equal(s, set): + return i + return None + + +############################################################################### +# Formatting +############################################################################### +def format_node(node): + """Print out an indented concrete syntax tree, from parse().""" + lines = ["{name}".format(name=node[0])] + [ + " " + line for child in node[1] for line in format_node(child).split("\n") + ] + return "\n".join(lines) + + +def format_table(generator, table): + """Format a parser table so pretty.""" + + def format_action(state, terminal): + action = state.get(terminal, ("error",)) + if action[0] == "accept": + return "accept" + elif action[0] == "shift": + return "s" + str(action[1]) + elif action[0] == "error": + return "" + elif action[0] == "reduce": + return "r" + str(action[1]) + + header = " | {terms} | {nts}".format( + terms=" ".join("{0: <6}".format(terminal) for terminal in sorted(generator.terminals)), + nts=" ".join("{0: <5}".format(nt) for nt in sorted(generator.nonterminals)), + ) + + lines = [ + header, + "-" * len(header), + ] + [ + "{index: <3} | {actions} | {gotos}".format( + index=i, + actions=" ".join( + "{0: <6}".format(format_action(row, terminal)) + for terminal in sorted(generator.terminals) + ), + gotos=" ".join( + "{0: <5}".format(row.get(nt, ("error", ""))[1]) + for nt in sorted(generator.nonterminals) + ), + ) + for i, row in enumerate(table) + ] + return "\n".join(lines) + + +############################################################################### +# Examples +############################################################################### +# OK, this is a very simple LR0 grammar. 
+grammar_simple = [ + ("E", ["E", "+", "T"]), + ("E", ["T"]), + ("T", ["(", "E", ")"]), + ("T", ["id"]), +] + +gen = GenerateLR0("E", grammar_simple) +table = gen.gen_table() +tree = parse(table, ["id", "+", "(", "id", ")"]) +print(format_node(tree) + "\n") +print() + +# This one doesn't work with LR0, though, it has a shift/reduce conflict. +grammar_lr0_shift_reduce = grammar_simple + [ + ("T", ["id", "[", "E", "]"]), +] +try: + gen = GenerateLR0("E", grammar_lr0_shift_reduce) + table = gen.gen_table() + assert False +except ValueError as e: + print(e) +print() + +# Nor does this: it has a reduce/reduce conflict. +grammar_lr0_reduce_reduce = grammar_simple + [ + ("E", ["V", "=", "E"]), + ("V", ["id"]), +] +try: + gen = GenerateLR0("E", grammar_lr0_reduce_reduce) + table = gen.gen_table() + assert False +except ValueError as e: + print(e) +print() + +# Nullable symbols just don't work with constructs like this, because you can't +# look ahead to figure out if you should reduce an empty 'F' or not. +grammar_nullable = [ + ("E", ["F", "boop"]), + ("F", ["beep"]), + ("F", []), +] +try: + gen = GenerateLR0("E", grammar_nullable) + table = gen.gen_table() + assert False +except ValueError as e: + print(e) + +gen = GenerateSLR1("E", grammar_lr0_shift_reduce) +print("First: {first}".format(first=str(gen.gen_first(["E"])))) +print("Follow: {follow}".format(follow=str(gen.gen_follow("E")))) +table = gen.gen_table() +print(format_table(gen, table)) +tree = parse(table, ["id", "+", "(", "id", "[", "id", "]", ")"]) +print(format_node(tree) + "\n") +print() + +# SLR1 can't handle this. +grammar_aho_ullman_1 = [ + ("S", ["L", "=", "R"]), + ("S", ["R"]), + ("L", ["*", "R"]), + ("L", ["id"]), + ("R", ["L"]), +] +try: + gen = GenerateSLR1("S", grammar_aho_ullman_1) + table = gen.gen_table() + assert False +except ValueError as e: + print(e) +print() + +# Here's an example with a full LR1 grammar, though. 
+grammar_aho_ullman_2 = [ + ("S", ["X", "X"]), + ("X", ["a", "X"]), + ("X", ["b"]), +] +gen = GenerateLR1("S", grammar_aho_ullman_2) +table = gen.gen_table() +print(format_table(gen, table)) +parse(table, ["b", "a", "a", "b"], trace=True) +print() + +# What happens if we do LALR to it? +gen = GenerateLALR("S", grammar_aho_ullman_2) +table = gen.gen_table() +print(format_table(gen, table)) +print() + +# A fun LALR grammar. +grammar_lalr = [ + ("S", ["V", "E"]), + ("E", ["F"]), + ("E", ["E", "+", "F"]), + ("F", ["V"]), + ("F", ["int"]), + ("F", ["(", "E", ")"]), + ("V", ["id"]), +] +gen = GenerateLALR("S", grammar_lalr) +table = gen.gen_table() +print(format_table(gen, table)) +print() diff --git a/parser.py b/parser.py index 656ef09..8091fb7 100644 --- a/parser.py +++ b/parser.py @@ -1,18 +1,124 @@ -"""A collection of LR parser generators, from LR0 through LALR. +"""This is a small helper library to generate LR parser tables. -One day I read a tweet, asking for a tool which accepted a grammar and an -input file and which then produced simple parsed output, without any kind of -in-between. (There was other ranty stuff about how none of the existing tools -really worked, but that was beside the point.) +The primary inspiration for this library is tree-sitter, which also generates +LR parsers for grammars written in a Turing-complete language. Like that, we +write grammars in a language, only we do it in Python instead of JavaScript. -Upon reading the tweet, it occured to me that I didn't know how LR parsers -worked and how they were generated, except in the broadest of terms. Thus, I -set about writing this, learning as I went. +Why Python? Because Python 3 is widely pre-installed on macOS and Linux. This +library requires nothing more than the basic standard library, and not even a +new version of it. Therefore, it turns out to be a pretty light dependency for +a rust or C++ or something kind of project.
(Tree-sitter, on the other hand, +requires node, which is a far less stable and available runtime in 2024.) -This code is not written to be fast, or even efficient, although it runs its -test cases fast enough. It was instead written to be easy to follow along -with, so that when I forget how all this works I can come back to the code -and read along and learn all over again. +The parser tables can really be used to power anything. I prefer to make +concrete syntax trees (again, see tree-sitter), and there is no facility at all +for actions or custom ASTs or whatnot. Any such processing needs to be done by +the thing that processes the tables. + +## Making Grammars + +To get started, create a grammar that derives from the `Grammar` class. Create +one method per nonterminal, decorated with the `rule` decorator. Here's an +example: + + PLUS = Token('+') + LPAREN = Token('(') + RPAREN = Token(')') + ID = Token('id') + + class SimpleGrammar(Grammar): + @rule + def expression(self): + return seq(self.expression, PLUS, self.term) | self.term + + @rule + def term(self): + return seq(LPAREN, self.expression, RPAREN) | ID + + +## Using grammars + +TODO + +## Representation Choices + +The SimpleGrammar class might seem a little verbose compared to a dense +structure like: + + grammar_simple = [ + ('E', ['E', '+', 'T']), + ('E', ['T']), + ('T', ['(', 'E', ')']), + ('T', ['id']), + ] + +or + + grammar_simple = { + 'E': [ + ['E', '+', 'T'], + ['T'], + ], + 'T': [ + ['(', 'E', ')'], + ['id'], + ], + } + + +The advantage that the class has over a table like this is that you get to have +all of your Python tools help you make sure your grammar is good, if you want +them. e.g., if you're working with an LSP or something, the members give you +autocomplete and jump-to-definition and possibly even type-checking. 
+ +At the very least, if you mis-type the name of a nonterminal, or forget to +implement it, we will immediately raise an error that *INCLUDES THE LOCATION IN +THE SOURCE WHERE THE ERROR WAS MADE.* With tables, we can tell you that you +made a mistake but it's up to you to figure out where you did it. + +### Aside: What about a custom DSL/EBNF like thing? + +Yeah, OK, there's a rich history of writing your grammar in a domain-specific +language. YACC did it, ANTLR does it, GRMTools.... just about everybody except +Tree-Sitter does this. + +But look, I've got several reasons for not doing it. + +First, I'm lazy, and don't want to write yet another parser for my parser. What +tools should I use to write my parser generator parser? I guess I don't have my +parser generator parser yet, so probably a hand-written top down parser? Some +other python parser generator? Ugh! + +As an add-on to that, if I make my own format then I need to make tooling for +*that* too: syntax highlighters, jump to definition, the works. Yuck. An +existing language, and a format that builds on an existing language, gets me the +tooling that comes along with that language. If you can leverage that +effectively (and I think I have) then you start way ahead in terms of tooling. + +Second, this whole thing is supposed to be easy to include in an existing +project, and adding a custom compiler doesn't seem to be that. Adding two python +files seems to be about the right speed. + +Third, and this is just hypothetical, it's probably pretty easy to write your +own tooling around a grammar if it's already in Python. If you want to make +railroad diagrams or EBNF pictures or whatever, all the productions are already +right there in data structures for you to process. I've tried to keep them +accessible and at least somewhat easy to work with.
There's nothing that says a +DSL-based system *has* to produce unusable intermediate data- certainly there +are some tools that *try*- but with this approach the accessibility and the +ergonomics of the tool go hand in hand. + +## Some History + +The first version of this code was written as an idle exercise to learn how LR +parser table generation even worked. It was... very simple, fairly easy to +follow, and just *incredibly* slow. Like, mind-bogglingly slow. Unusably slow +for anything but the most trivial grammar. + +As a result, when I decided I wanted to use it for a larger grammar, I found that +I just couldn't. So this has been hacked and significantly improved from that +version, now capable of building tables for nontrivial grammars. It could still +be a lot faster, but it meets my needs for now. (BTW, the notes I read to learn how all this works are at http://dragonbook.stanford.edu/lecture-notes/Stanford-CS143/. Specifically, @@ -20,12 +126,17 @@ I started with handout 8, 'Bottom-up-parsing', and went from there. (I did eventually have to backtrack a little into handout 7, since that's where First() and Follow() are covered.) -Enjoy! - -doty -2016-12-09 +May 2024 """ -from collections import namedtuple + +import abc +import collections +import dataclasses +import enum +import functools +import inspect +import sys +import typing ############################################################################### @@ -33,132 +144,624 @@ from collections import namedtuple # # We start with LR0 parsers, because they form the basis of everything else. ############################################################################### -class Configuration( - namedtuple('Configuration', ['name', 'symbols', 'position', 'lookahead']) -): - """A rule being tracked in a state. +class Configuration: + """A rule being tracked in a state. That is, a specific position within a + specific rule, with an associated lookahead state. 
+ + We make a *lot* of these and we need/want to pre-cache a ton of things we + ask about so we need to override __init__, otherwise it's immutable and + fixed and doesn't have a dict to save space. + + It also supports hashing and equality and comparison, so it can be sorted + and whatnot. This really is the workhorse data structure of the whole thing. + If you can improve this you can improve the performance of everything probably. (Note: technically, lookahead isn't used until we get to LR(1) parsers, but if left at its default it's harmless. Ignore it until you get to the part about LR(1).) """ - __slots__ = () + + __slots__ = ( + "name", + "symbols", + "position", + "lookahead", + "next", + "at_end", + "_vals", + "_hash", + ) + + name: int + symbols: typing.Tuple[int, ...] + position: int + lookahead: typing.Tuple[int, ...] + next: int | None + at_end: bool + + _vals: typing.Tuple + _hash: int + + def __init__(self, name, symbols, position, lookahead) -> None: + self.name = name + self.symbols = symbols + self.position = position + self.lookahead = lookahead + + at_end = position == len(symbols) + self.at_end = at_end + self.next = symbols[position] if not at_end else None + + self._vals = (name, symbols, position, lookahead) + self._hash = hash(self._vals) @classmethod - def from_rule(cls, rule, lookahead=()): + def from_rule(cls, name: int, symbols: typing.Tuple[int, ...], lookahead=()): return Configuration( - name=rule[0], - symbols=rule[1], + name=name, + symbols=symbols, position=0, lookahead=lookahead, ) - @property - def at_end(self): - return self.position == len(self.symbols) + def __hash__(self) -> int: + return self._hash - @property - def next(self): - return self.symbols[self.position] if not self.at_end else None + def __eq__(self, value: object, /) -> bool: + if value is self: + return True + if not isinstance(value, Configuration): + return NotImplemented + + return ( + value._hash == self._hash + and value.name == self.name + and 
value.position == self.position + and value.symbols == self.symbols + and value.lookahead == self.lookahead + ) + + def __lt__(self, value) -> bool: + if not isinstance(value, Configuration): + return NotImplemented + return self._vals < value._vals + + def __gt__(self, value) -> bool: + if not isinstance(value, Configuration): + return NotImplemented + return self._vals > value._vals + + def __le__(self, value) -> bool: + if not isinstance(value, Configuration): + return NotImplemented + return self._vals <= value._vals + + def __ge__(self, value) -> bool: + if not isinstance(value, Configuration): + return NotImplemented + return self._vals >= value._vals + + def replace_position(self, new_position): + return Configuration( + name=self.name, + symbols=self.symbols, + position=new_position, + lookahead=self.lookahead, + ) + + def clear_lookahead(self): + return Configuration( + name=self.name, + symbols=self.symbols, + position=self.position, + lookahead=(), + ) @property def rest(self): - return self.symbols[(self.position+1):] + return self.symbols[(self.position + 1) :] - def at_symbol(self, symbol): - return self.next == symbol - - def replace(self, **kwargs): - return self._replace(**kwargs) - - def __str__(self): - la = ", " + str(self.lookahead) if self.lookahead != () else "" + def format(self, alphabet: list[str]) -> str: + la = ", " + str(tuple(alphabet[i] for i in self.lookahead)) if self.lookahead != () else "" return "{name} -> {bits}{lookahead}".format( - name=self.name, - bits=' '.join([ - '* ' + sym if i == self.position else sym - for i, sym in enumerate(self.symbols) - ]) + (' *' if self.at_end else ''), + name=alphabet[self.name], + bits=" ".join( + [ + "* " + alphabet[sym] if i == self.position else alphabet[sym] + for i, sym in enumerate(self.symbols) + ] + ) + + (" *" if self.at_end else ""), lookahead=la, ) -class GenerateLR0(object): - """Generate parser tables for an LR0 parser. +ConfigSet = typing.Tuple[Configuration, ...] 
- The input grammars are of the form: - grammar_simple = [ - ('E', ['E', '+', 'T']), - ('E', ['T']), - ('T', ['(', 'E', ')']), - ('T', ['id']), - ] +class ConfigurationSetInfo: + """When we build a grammar into a table, the first thing we need to do is + generate all the configuration sets and their successors. - Which is to say, they are a list of productions. Each production is a - tuple where the first element of the tuple is the name of the - non-terminal being added, and the second elment of the tuple is the - list of terminals and non-terminals that make up the production. + (A configuration set is what it sounds like: an unordered set of + Configuration structures. But we use Tuple because it's hashable and + immutable and small and we order the Tuples so that we get repeatable + results.) - There is currently no support for custom actions or alternation or - anything like that. If you want alternations that you'll have to lower - the grammar by hand into the simpler form first. + *This* is the structure that tracks the result of that computation. - Don't name anything with double-underscores; those are reserved for - the generator. Don't add '$' either, as it is reserved to mean - end-of-stream. Use an empty list to indicate nullability, that is: - - ('O', []), - - means that O can be matched with nothing. - - Implementation notes: - - This is implemented in the dumbest way possible, in order to be the - most understandable it can be. I built this to learn, and I want to - make sure I can keep learning with it. - - - We tend to use tuples everywhere. This is because tuples can be - compared for equality and put into tables and all that jazz. They might - be a little bit slower in places but like I said, this is for - learning. (Also, if we need this to run faster we can probably go a - long way by memoizing results, which is much easier if we have tuples - everywhere.)
+ (Different generators vary in the details of how they generate this + structure, but they all compute this information.) """ - def __init__(self, start, grammar): + + config_set_key: dict[ConfigSet, int] # Map a ConfigSet into an index + sets: list[ConfigSet] # Map the index back into a set + + # All the successors for all of the sets. `successors[i]` is the mapping + # from grammar symbol to the index of the set you get by processing that + # symbol. + successors: list[dict[int, int]] + + def __init__(self): + self.config_set_key = {} + self.sets = [] + self.successors = [] + + def register_config_set(self, c: ConfigSet) -> typing.Tuple[int, bool]: + """Potentially add a new config set to the set of sets. Returns the + canonical ID of the set within this structure, along with a boolean + indicating whether the set was just added or not. + + (You can use this integer to get the set back, if you need it, and + also access the successors table.) + """ + existing = self.config_set_key.get(c) + if existing is not None: + return existing, False + + index = len(self.sets) + self.sets.append(c) + self.successors.append({}) + self.config_set_key[c] = index + return index, True + + def add_successor(self, c_id: int, symbol: int, successor: int): + """Register successor(`c_id`, `symbol`) -> `successor`, where c_id + is the id of the set in this structure, and symbol is the id of a + symbol in the alphabet of the grammar. + """ + self.successors[c_id][symbol] = successor + + def find_path_to_set(self, target_set: ConfigSet) -> list[int]: + """Trace the path of grammar symbols from the first set (which is always + set 0) to the target set. This is useful in conflict reporting, + because we'll be *at* a ConfigSet and want to show the grammar symbols + that get us to where we found the conflict. + + The return value is a list of grammar symbols to get to the specified + ConfigSet. + + This function raises KeyError if no path is found. 
+ """ + target_index = self.config_set_key[target_set] + visited = set() + + queue: collections.deque = collections.deque() + queue.appendleft((0, [])) + while len(queue) > 0: + set_index, path = queue.pop() + if set_index == target_index: + return path + + if set_index in visited: + continue + visited.add(set_index) + + for symbol, successor in self.successors[set_index].items(): + queue.appendleft((successor, path + [symbol])) + + raise KeyError("Unable to find a path to the target set!") + + +class Assoc(enum.Enum): + """Associativity of a rule.""" + + NONE = 0 + LEFT = 1 + RIGHT = 2 + + +class ErrorCollection: + """A collection of errors. The errors are grouped by config set and alphabet + symbol, so that we can group the error strings appropriately when we format + the error. + """ + + errors: dict[ConfigSet, dict[int, dict[Configuration, typing.Tuple]]] + + def __init__(self): + self.errors = {} + + def any(self) -> bool: + """Return True if there are any errors in this collection.""" + return len(self.errors) > 0 + + def add_error( + self, + config_set: ConfigSet, + symbol: int, + config: Configuration, + action: typing.Tuple, + ): + """Add an error to the collection. + + config_set is the set with the error. + symbol is the symbol we saw when we saw the error. + config is the configuration that we were in when we saw the error. + action is what we were trying to do. + + (This all makes more sense from inside the TableBuilder.) + """ + set_errors = self.errors.get(config_set) + if set_errors is None: + set_errors = {} + self.errors[config_set] = set_errors + + symbol_errors = set_errors.get(symbol) + if symbol_errors is None: + symbol_errors = {} + set_errors[symbol] = symbol_errors + + symbol_errors[config] = action + + def format( + self, + alphabet: list[str], + all_sets: ConfigurationSetInfo, + ) -> str | None: + """Format all the errors into a string, or return None if there are no + errors. 
+ + We need the alphabet to turn all these integers into something human + readable, and all the sets to trace a path to where the errors were + encountered. + """ + if not self.errors: + return None + + errors = [] + for config_set, set_errors in self.errors.items(): + path = all_sets.find_path_to_set(config_set) + path_str = " ".join(alphabet[s] for s in path) + + for symbol, symbol_errors in set_errors.items(): + lines = [] + lines.append( + f"When we have parsed '{path_str}' and see '{alphabet[symbol]}' we don't know whether:" + ) + for config, action in symbol_errors.items(): + name = alphabet[config.name] + rule = " ".join( + f"{'* ' if config.position == i else ''}{alphabet[s]}" + for i, s in enumerate(config.symbols) + ) + if config.next is None: + rule += " *" + + if action[0] == "reduce": + action_str = f"pop {action[2]} values off the stack and make a {action[1]}" + elif action[0] == "shift": + action_str = "consume the token and keep going" + elif action[0] == "accept": + action_str = "accept the parse" + else: + assert action[0] == "goto", f"Unknown action {action[0]}" + raise Exception("Shouldn't conflict on goto ever") + + lines.append( + f" - We are in the rule `{name}: {rule}` and we should {action_str}" + ) + + errors.append("\n".join(lines)) + + return "\n\n".join(errors) + + +class TableBuilder(object): + """A helper object to assemble actions into build parse tables. + + This is a builder type thing: call `new_row` at the start of + each row, then `flush` when you're done with the last row. + """ + + errors: ErrorCollection + table: list[dict[str, typing.Tuple]] + alphabet: list[str] + precedence: typing.Tuple[typing.Tuple[Assoc, int], ...] 
+ row: None | list[typing.Tuple[None | typing.Tuple, None | Configuration]] + + def __init__( + self, + alphabet: list[str], + precedence: typing.Tuple[typing.Tuple[Assoc, int], ...], + ): + self.errors = ErrorCollection() + self.table = [] + self.alphabet = alphabet + self.precedence = precedence + self.row = None + + def flush(self, all_sets: ConfigurationSetInfo) -> list[dict[str, typing.Tuple]]: + """Finish building the table and return it. + + Raises ValueError if there were any conflicts during construction. + """ + self._flush_row() + if self.errors.any(): + errors = self.errors.format(self.alphabet, all_sets) + raise ValueError(f"Errors building the table:\n\n{errors}") + return self.table + + def new_row(self, config_set: ConfigSet): + """Start a new row, processing the given config set. Call this before + doing anything else. + """ + self._flush_row() + self.row = [(None, None) for _ in self.alphabet] + self.current_config_set = config_set + + def _flush_row(self): + if self.row: + actions = {self.alphabet[k]: v[0] for k, v in enumerate(self.row) if v[0] is not None} + self.table.append(actions) + + def set_table_reduce(self, symbol: int, config: Configuration): + """Mark a reduce of the given configuration for the given symbol in the + current row. + """ + action = ("reduce", self.alphabet[config.name], len(config.symbols)) + self._set_table_action(symbol, action, config) + + def set_table_accept(self, symbol: int, config: Configuration): + """Mark an accept of the given configuration for the given symbol in the + current row. + """ + action = ("accept",) + self._set_table_action(symbol, action, config) + + def set_table_shift(self, symbol: int, index: int, config: Configuration): + """Mark a shift in the current row of the given symbol to the + given index. The configuration here provides debugging information for + conflicts. 
+ """ + action = ("shift", index) + self._set_table_action(symbol, action, config) + + def set_table_goto(self, symbol: int, index: int): + """Set the goto for the given nonterminal symbol in the current row.""" + action = ("goto", index) + self._set_table_action(symbol, action, None) + + def _action_precedence(self, symbol: int, action: typing.Tuple, config: Configuration): + if action[0] == "shift": + return self.precedence[symbol] + else: + return self.precedence[config.name] + + def _set_table_action(self, symbol_id: int, action: typing.Tuple, config: Configuration | None): + """Set the action for 'symbol' in the table row to 'action'. + + This is destructive; it changes the table. It records an error if + there is already an action for the symbol in the row. + """ + assert isinstance(symbol_id, int) + + assert self.row is not None + existing, existing_config = self.row[symbol_id] + if existing is not None and existing != action: + assert existing_config is not None + assert config is not None + + existing_assoc, existing_prec = self._action_precedence( + symbol_id, existing, existing_config + ) + new_assoc, new_prec = self._action_precedence(symbol_id, action, config) + + if existing_prec > new_prec: + # Precedence of the action in the table already wins, do nothing. + return + + elif existing_prec == new_prec: + # It's an actual conflict, use associativity if we can. + # If there's a conflict in associativity then it's a real conflict! 
+ assoc = Assoc.NONE + if existing_assoc == Assoc.NONE: + assoc = new_assoc + elif new_assoc == Assoc.NONE: + assoc = existing_assoc + elif new_assoc == existing_assoc: + assoc = new_assoc + + resolved = False + if assoc == Assoc.LEFT: + # Prefer reduce over shift + if action[0] == "shift" and existing[0] == "reduce": + action = existing + resolved = True + elif action[0] == "reduce" and existing[0] == "shift": + resolved = True + + elif assoc == Assoc.RIGHT: + # Prefer shift over reduce + if action[0] == "shift" and existing[0] == "reduce": + resolved = True + elif action[0] == "reduce" and existing[0] == "shift": + action = existing + resolved = True + + if not resolved: + # Record the conflicts. + self.errors.add_error( + self.current_config_set, symbol_id, existing_config, existing + ) + self.errors.add_error(self.current_config_set, symbol_id, config, action) + + else: + # Precedence of the new action is greater than the existing + # action, just allow the overwrite with no change. + pass + + self.row[symbol_id] = (action, config) + + +class GenerateLR0(object): + """Generate parser tables for an LR0 parser.""" + + # Internally we use integers as symbols, not strings. Mostly this is fine, + # but when we need to map back from integer to string we index this list. + alphabet: list[str] + + # The grammar we work with. The outer list is indexed by grammar symbol, + # terminal *and* non-terminal. The inner list is the list of productions + # for the given nonterminal symbol. (If you have a terminal `t` and look it + # up you'll just get an empty list.) + grammar: list[list[typing.Tuple[int, ...]]] + + # nonterminal[i] is True if alphabet[i] is a nonterminal. + nonterminal: typing.Tuple[bool, ...] + # The complement of nonterminal. terminal[i] is True if alphabet[i] is a + # terminal. + terminal: typing.Tuple[bool, ...] + + # The precedence of every symbol. If no precedence was explicitly provided + # for a symbol, then its entry in this tuple will be (NONE, 0). 
+ precedence: typing.Tuple[typing.Tuple[Assoc, int], ...] + + # The lookup that maps a particular symbol to an integer. (Only really used + # for debugging.) + symbol_key: dict[str, int] + # The start symbol of the grammar. + start_symbol: int + # The end symbol of the grammar. + end_symbol: int + + config_sets_key: dict[ConfigSet, int] + successors: list[set[int]] + + def __init__( + self, + start: str, + grammar: list[typing.Tuple[str, list[str]]], + precedence: None | dict[str, typing.Tuple[Assoc, int]] = None, + ): """Initialize the parser generator with the specified grammar and start symbol. + + The input grammars are of the form: + + grammar_simple = [ + ('E', ['E', '+', 'T']), + ('E', ['T']), + ('T', ['(', 'E', ')']), + ('T', ['id']), + ] + + Which is to say, they are a list of productions. Each production is a + tuple where the first element of the tuple is the name of the + non-terminal being added, and the second elment of the tuple is the + list of terminals and non-terminals that make up the production. + + There is currently no support for custom actions or alternation or + anything like that. If you want alternations that you'll have to lower + the grammar by hand into the simpler form first. + + Don't name anything with double-underscores; those are reserved for + the generator. Don't add '$' either, as it is reserved to mean + end-of-stream. Use an empty list to indicate nullability, that is: + + ('O', []), + + means that O can be matched with nothing. + + This isn't a *great* way to author these things, but it is very simple + and flexible. You probably don't want to author this on your own; see + the Grammar class for a high-level API. + + The precedence dictionary, if provided, maps a given symbol to an + associativity and a precedence. Any symbol not in the dictionary is + presumed to have an associativity of NONE and a precedence of zero. 
""" - # We always store the "augmented" grammar, which contains an initial - # production for the start state. grammar[0] is always the start - # rule, and in the set of states and table and whatever the first - # element is always the starting state/position. - self.grammar = [('__start', [start])] + grammar - self.nonterminals = {rule[0] for rule in grammar} - self.terminals = { - sym - for name, symbols in grammar - for sym in symbols - if sym not in self.nonterminals - } - self.alphabet = self.terminals | self.nonterminals + + # Work out the alphabet. + alphabet = set() + for name, rule in grammar: + alphabet.add(name) + alphabet.update(symbol for symbol in rule) # Check to make sure they didn't use anything that will give us # heartburn later. - reserved = [a for a in self.alphabet if a.startswith('__') or a == '$'] + reserved = [a for a in alphabet if a.startswith("__") or a == "$"] if reserved: raise ValueError( "Can't use {symbols} in grammars, {what} reserved.".format( - symbols=' or '.join(reserved), + symbols=" or ".join(reserved), what="it's" if len(reserved) == 1 else "they're", ) ) - self.terminals.add('$') - self.alphabet.add('$') + alphabet.add("__start") + alphabet.add("$") + self.alphabet = list(sorted(alphabet)) - def gen_closure_next(self, config): - """Return the next set of configurations in the closure for - config. + symbol_key = {symbol: index for index, symbol in enumerate(self.alphabet)} + + start_symbol = symbol_key["__start"] + end_symbol = symbol_key["$"] + + assert self.alphabet[start_symbol] == "__start" + assert self.alphabet[end_symbol] == "$" + + # Turn the incoming grammar into a dictionary, indexed by nonterminal. + # + # We count on python dictionaries retaining the insertion order, like + # it or not. 
+ full_grammar: list[list] = [list() for _ in self.alphabet] + terminal: list[bool] = [True for _ in self.alphabet] + assert terminal[end_symbol] + + nonterminal = [False for _ in self.alphabet] + + for name, rule in grammar: + name_symbol = symbol_key[name] + + terminal[name_symbol] = False + nonterminal[name_symbol] = True + + rules = full_grammar[name_symbol] + rules.append(tuple(symbol_key[symbol] for symbol in rule)) + + self.grammar = full_grammar + self.grammar[start_symbol].append((symbol_key[start],)) + terminal[start_symbol] = False + nonterminal[start_symbol] = True + + self.terminal = tuple(terminal) + self.nonterminal = tuple(nonterminal) + + assert self.terminal[end_symbol] + assert self.nonterminal[start_symbol] + + if precedence is None: + precedence = {} + self.precedence = tuple(precedence.get(a, (Assoc.NONE, 0)) for a in self.alphabet) + + self.symbol_key = symbol_key + self.start_symbol = start_symbol + self.end_symbol = end_symbol + + @functools.cache + def gen_closure_next(self, config: Configuration): + """Return the next set of configurations in the closure for config. If the position for config is just before a non-terminal, then the next set of configurations is configurations for all of the @@ -166,96 +769,117 @@ class GenerateLR0(object): beginning. (If the position for config is just before a terminal, or at the end of the production, then the next set is empty.) """ - if config.at_end: + next = config.next + if next is None: return () else: - return tuple( - Configuration.from_rule(rule) - for rule in self.grammar - if rule[0] == config.next - ) + return tuple(Configuration.from_rule(next, rule) for rule in self.grammar[next]) - def gen_closure(self, config, closure): - """Compute the closure for the specified config and unify it with the - existing closure. + def gen_closure(self, seeds: typing.Iterable[Configuration]) -> ConfigSet: + """Compute the closure for the specified configs. 
The closure is all + of the configurations we could be in. Specifically, if the position + for a config is just before a non-terminal then we must also consider + configurations where the rule is the rule for the non-terminal and + the position is just before the beginning of the rule. - If the provided config is already in the closure then nothing is - done. (We assume that the closure of the config is *also* already in - the closure.) + (We have replaced a recursive version with an iterative one.) """ - if config in closure: - return closure - else: - new_closure = tuple(closure) + (config,) - for next_config in self.gen_closure_next(config): - new_closure = self.gen_closure(next_config, new_closure) - return new_closure + closure = set() + pending = list(seeds) + pending_next = [] + while len(pending) > 0: + for config in pending: + if config in closure: + continue - def gen_successor(self, config_set, symbol): + closure.add(config) + for next_config in self.gen_closure_next(config): + pending_next.append(next_config) + + temp = pending + pending = pending_next + pending_next = temp + pending_next.clear() + + return tuple(sorted(closure)) # TODO: Why tuple? + + def gen_successor(self, config_set: typing.Iterable[Configuration], symbol: int) -> ConfigSet: """Compute the successor state for the given config set and the given symbol. The successor represents the next state of the parser after seeing the symbol. 
""" - seeds = [ - config.replace(position=config.position + 1) + seeds = tuple( + config.replace_position(config.position + 1) for config in config_set - if config.at_symbol(symbol) - ] - - closure = () - for seed in seeds: - closure = self.gen_closure(seed, closure) + if config.next == symbol + ) + closure = self.gen_closure(seeds) return closure - def gen_all_successors(self, config_set): - """Return all of the non-empty successors for the given config set.""" + def gen_all_successors( + self, config_set: typing.Iterable[Configuration] + ) -> list[typing.Tuple[int, ConfigSet]]: + """Return all of the non-empty successors for the given config set. + + (That is, given the config set, pretend we see all the symbols we + could possibly see, and figure out which configs sets we get from + those symbols. Those are the successors of this set.) + """ + possible = tuple(sorted({config.next for config in config_set if config.next is not None})) + next = [] - for symbol in self.alphabet: + for symbol in possible: successor = self.gen_successor(config_set, symbol) if len(successor) > 0: - next.append(successor) + next.append((symbol, successor)) - return tuple(next) + return next - def gen_sets(self, config_set, F): - """Recursively generate all configuration sets starting from the - provided set, and merge them with the provided set 'F'. 
- """ - if config_set in F: - return F - else: - new_F = F + (config_set,) - for successor in self.gen_all_successors(config_set): - new_F = self.gen_sets(successor, new_F) + def gen_sets(self, config_set: typing.Tuple[Configuration, ...]) -> ConfigurationSetInfo: + """Generate all configuration sets starting from the provided set.""" + result = ConfigurationSetInfo() - return new_F + successors = [] + pending = [config_set] + pending_next = [] + while len(pending) > 0: + for config_set in pending: + id, is_new = result.register_config_set(config_set) + if is_new: + for symbol, successor in self.gen_all_successors(config_set): + successors.append((id, symbol, successor)) + pending_next.append(successor) - def gen_all_sets(self): + temp = pending + pending = pending_next + pending_next = temp + pending_next.clear() + + for id, symbol, successor in successors: + result.add_successor(id, symbol, result.config_set_key[successor]) + + return result + + def gen_all_sets(self) -> ConfigurationSetInfo: """Generate all of the configuration sets for the grammar.""" - initial_set = self.gen_closure( - Configuration.from_rule(self.grammar[0]), - (), + seeds = tuple( + Configuration.from_rule(self.start_symbol, rule) + for rule in self.grammar[self.start_symbol] ) - return self.gen_sets(initial_set, ()) + initial_set = self.gen_closure(seeds) + return self.gen_sets(initial_set) - def find_set_index(self, sets, set): - """Find the specified set in the set of sets, and return the - index, or None if it is not found. - """ - for i, s in enumerate(sets): - if s == set: - return i - return None - - def gen_reduce_set(self, config): + def gen_reduce_set(self, config: Configuration) -> typing.Iterable[int]: """Return the set of symbols that indicate we should reduce the given configuration. - In an LR0 parser, this is just the set of all terminals.""" - return self.terminals + In an LR0 parser, this is just the set of all terminals. 
+ """ + del config + return [index for index, value in enumerate(self.terminal) if value] def gen_table(self): """Generate the parse table. @@ -285,89 +909,32 @@ class GenerateLR0(object): Anything missing from the row indicates an error. """ - action_table = [] config_sets = self.gen_all_sets() - for config_set in config_sets: - actions = {} + builder = TableBuilder(self.alphabet, self.precedence) + + for config_set_id, config_set in enumerate(config_sets.sets): + builder.new_row(config_set) + successors = config_sets.successors[config_set_id] - # Actions for config in config_set: - if config.at_end: - if config.name != '__start': + config_next = config.next + if config_next is None: + if config.name != self.start_symbol: for a in self.gen_reduce_set(config): - self.set_table_action( - actions, - a, - ('reduce', config.name, len(config.symbols)), - config, - ) + builder.set_table_reduce(a, config) else: - self.set_table_action( - actions, - '$', - ('accept',), - config, - ) + builder.set_table_accept(self.end_symbol, config) - else: - if config.next in self.terminals: - successor = self.gen_successor(config_set, config.next) - index = self.find_set_index(config_sets, successor) - self.set_table_action( - actions, - config.next, - ('shift', index), - config, - ) + elif self.terminal[config_next]: + index = successors[config_next] + builder.set_table_shift(config_next, index, config) # Gotos - for symbol in self.nonterminals: - successor = self.gen_successor(config_set, symbol) - index = self.find_set_index(config_sets, successor) - if index is not None: - self.set_table_action( - actions, - symbol, - ('goto', index), - None, - ) + for symbol, index in successors.items(): + if self.nonterminal[symbol]: + builder.set_table_goto(symbol, index) - # set_table_action stores the configs that generated the actions in - # the table, for diagnostic purposes. This filters them out again - # so that the parser has something clean to work with. 
- actions = {k: self.get_table_action(actions, k) for k in actions} - action_table.append(actions) - - return action_table - - def set_table_action(self, row, symbol, action, config): - """Set the action for 'symbol' in the table row to 'action'. - - This is destructive; it changes the table. It raises an error if - there is already an action for the symbol in the row. - """ - existing, existing_config = row.get(symbol, (None, None)) - if existing is not None and existing != action: - config_old = str(existing_config) - config_new = str(config) - max_len = max(len(config_old), len(config_new)) + 1 - error = ( - "Conflicting actions for token '{symbol}':\n" - " {config_old: <{max_len}}: {old}\n" - " {config_new: <{max_len}}: {new}\n".format( - config_old=config_old, - config_new=config_new, - max_len=max_len, - old=existing, - new=action, - symbol=symbol, - ) - ) - raise ValueError(error) - row[symbol] = (action, config) - - def get_table_action(self, row, symbol): - return row[symbol][0] + return builder.flush(config_sets) def parse(table, input, trace=False): @@ -380,48 +947,53 @@ def parse(table, input, trace=False): input is a list of tokens. Don't stick an end-of-stream marker, I'll stick one on for you. + + This is not a *great* parser, it's really just a demo for what you can + do with the table. """ - assert '$' not in input - input = input + ['$'] + assert "$" not in input + input = input + ["$"] input_index = 0 # Our stack is a stack of tuples, where the first entry is the state number # and the second entry is the 'value' that was generated when the state was # pushed. 
- stack = [(0, None)] + stack: list[typing.Tuple[int, typing.Any]] = [(0, None)] while True: current_state = stack[-1][0] current_token = input[input_index] - action = table[current_state].get(current_token, ('error',)) + action = table[current_state].get(current_token, ("error",)) if trace: - print("{stack: <20} {input: <50} {action: <5}".format( - stack=repr([s[0] for s in stack]), - input=repr(input[input_index:]), - action=repr(action) - )) + print( + "{stack: <20} {input: <50} {action: <5}".format( + stack=repr([s[0] for s in stack]), + input=repr(input[input_index:]), + action=repr(action), + ) + ) - if action[0] == 'accept': + if action[0] == "accept": return stack[-1][1] - elif action[0] == 'reduce': + elif action[0] == "reduce": name = action[1] size = action[2] value = (name, tuple(s[1] for s in stack[-size:])) stack = stack[:-size] - goto = table[stack[-1][0]].get(name, ('error',)) - assert goto[0] == 'goto' # Corrupt table? + goto = table[stack[-1][0]].get(name, ("error",)) + assert goto[0] == "goto" # Corrupt table? stack.append((goto[1], value)) - elif action[0] == 'shift': + elif action[0] == "shift": stack.append((action[1], (current_token, ()))) input_index += 1 - elif action[0] == 'error': + elif action[0] == "error": raise ValueError( - 'Syntax error: unexpected symbol {sym}'.format( + "Syntax error: unexpected symbol {sym}".format( sym=current_token, ), ) @@ -430,6 +1002,228 @@ def parse(table, input, trace=False): ############################################################################### # SLR(1) ############################################################################### +def update_changed(items: set[int], other: set[int]) -> bool: + """Merge the `other` set into the `items` set, and return True if this + changed the items set. + """ + old_len = len(items) + items.update(other) + return old_len != len(items) + + +@dataclasses.dataclass(frozen=True) +class FirstInfo: + """A structure that tracks the first set of a grammar. 
(Or, as it is + commonly styled in textbooks, FIRST.) + + firsts[s] is the set of first terminals of any particular nonterminal s. + (For a terminal s, firsts[s] == {s}.) + + is_epsilon[s] is True if the nonterminal s can be empty, that is, if + it can match zero symbols. + + For example, consider the following grammar: + + [ + ('x', ['y', 'A']), + ('y', ['z']), + ('y', ['B', 'x']), + ('y', []), + ('z', ['C']), + ('z', ['D', 'x']), + ] + + For this grammar, FIRST['z'] is ('C', 'D'). + + FIRST['y'] is ('B', 'C', 'D'). For the first production, 'z' is first, and + since 'z' is a nonterminal we need to include all of its symbols too, + transitively. For the second production, 'B' is first, and so that gets + added to the set. The last production doesn't have anything in it, so it + doesn't contribute to FIRST['y'], but it does set `is_epsilon` to True. + + Finally, FIRST['x'] is ('A', 'B', 'C', 'D'). ('B', 'C', 'D') comes from + FIRST['y'], as 'y' is first in our only production. But the 'A' comes from + the fact that is_epsilon['y'] is True: since 'y' can match empty input, + it is also legal for 'x' to begin with 'A'. + """ + + firsts: list[set[int]] + is_epsilon: list[bool] + + @classmethod + def from_grammar( + cls, + grammar: list[list[typing.Tuple[int, ...]]], + terminal: typing.Tuple[bool, ...], + ) -> "FirstInfo": + """Construct a new FirstInfo from the specified grammar. + + terminal[s] is True if symbol s is a terminal symbol. + """ + # Add all terminals to their own firsts + firsts: list[set[int]] = [] + for index, is_terminal in enumerate(terminal): + firsts.append(set()) + if is_terminal: + firsts[index].add(index) + + # Because we're working with recursive and mutually recursive rules, we + # need to make sure we terminate once we've actually found all the first + # symbols. Naive recursion will go forever, and recursion with a visited + # set to halt recursion ends up revisiting the same symbols over and + # over, running *very* slowly. 
Strangely, iteration to fixed-point turns + # out to be reasonably quick in practice, and is what every other parser + # generator uses in the end. + epsilons = [False for _ in terminal] + changed = True + while changed: + changed = False + for name, rules in enumerate(grammar): + f = firsts[name] + for rule in rules: + if len(rule) == 0: + changed = changed or not epsilons[name] + epsilons[name] = True + continue + + for index, symbol in enumerate(rule): + other_firsts = firsts[symbol] + changed = update_changed(f, other_firsts) or changed + + is_last = index == len(rule) - 1 + if is_last and epsilons[symbol]: + # If this is the last symbol and the last + # symbol can be empty then I can be empty + # too! :P + changed = changed or not epsilons[name] + epsilons[name] = True + + if not epsilons[symbol]: + # If we believe that there is at least one + # terminal in the first set of this + # nonterminal then I don't have to keep + # looping through the symbols in this rule. + break + + return FirstInfo(firsts=firsts, is_epsilon=epsilons) + + +@dataclasses.dataclass(frozen=True) +class FollowInfo: + """A structure that tracks the follow set of a grammar. (Or, again, as the + textbooks would have it, FOLLOW.) + + The follow set for a nonterminal is the set of terminals that can follow the + nonterminal in a valid sentence. The resulting set never contains epsilon + and is never empty, since we should always at least ground out at '$', which + is the end-of-stream marker. + + In order to compute follow, we need to find every place that a given + nonterminal appears in the grammar, and look at the first set of the symbol + that follows it. But if the first set of the symbol that follows it includes + epsilon, then we need to include the first of the symbol after *that*, and + so forth, until we finally either get to the end of the rule or we find some + symbol whose first doesn't include epsilon. 
+ + If we get to the end of the rule before finding a symbol that doesn't include + epsilon, then we also need to include the follow of the nonterminal that + contains the rule itself. (Anything that follows this rule can follow the + symbol we're considering.) + + Consider this nonsense grammar: + + [ + ('s', ['x', 'A']), + + ('x', ['y', 'B']), + ('x', ['y', 'z']), + + ('y', ['x', 'C']), + + ('z', ['D']), + ('z', []), + ] + + In this grammar, FOLLOW['y'] is ('A', 'B', 'C', 'D'). 'B' comes from the first + production of 'x', that's easy. 'D' comes from the second production of 'x': + FIRST['z'] is ('D'), and so that goes into FOLLOW['y']. + + 'A' and 'C' are the surprising ones: they come from the fact that FIRST['z'] contains + epsilon. Since 'z' can successfully match on empty input, we need to treat + 'y' as if it were at the end of 'x'. Anything that can follow 'x' can also + follow 'y'. Since 'A' and 'C' are in FOLLOW['x'] (from the productions of 's' and 'y'), they + are also in FOLLOW['y']. + + Note that the follow set of any nonterminal is never empty and never + contains epsilon: they all terminate at the end-of-stream marker eventually, + by construction. (The individual parser generators make sure to augment the + grammar so that this is true, and that's a main reason why they do it.) + """ + + follows: list[set[int]] + + @classmethod + def from_grammar( + cls, + grammar: list[list[typing.Tuple[int, ...]]], + terminal: typing.Tuple[bool, ...], + start_symbol: int, + end_symbol: int, + firsts: FirstInfo, + ): + follows: list[set[int]] = [set() for _ in grammar] + follows[start_symbol].add(end_symbol) + + # See the comment in FirstInfo for why this is the way it is, more or + # less. Iteration to fixed point handily beats recursion with + # memoization. I'm as shocked and dismayed as you are, but it's + # nice to remember that fixed-point algorithms are good sometimes. 
+ changed = True + while changed: + changed = False + for name, rules in enumerate(grammar): + for rule in rules: + # To do this more efficiently, we actually walk backwards + # through the rule. As long as we've still seen something + # with epsilon, then we need to add FOLLOW[name] to + # FOLLOW[symbol]. As soon as we see something *without* + # epsilon, we can stop doing that. (This is *way* more + # efficient than trying to figure out epsilon while walking + # forward.) + epsilon = True + prev_symbol = None + for symbol in reversed(rule): + f = follows[symbol] + if terminal[symbol]: + # This particular rule can't produce epsilon. + epsilon = False + prev_symbol = symbol + continue + + # While epsilon is still set, update the follow of + # this nonterminal with the follow of the production + # we're processing. (This also means that the follow + # of the last symbol in the production is the follow + # of the entire production, as it should be.) + if epsilon: + changed = update_changed(f, follows[name]) or changed + + # If we're not at the end of the list then the follow + # of the current symbol contains the first of the + # next symbol. + if prev_symbol is not None: + changed = update_changed(f, firsts.firsts[prev_symbol]) or changed + + # Now if there's no epsilon in this symbol there's no + # more epsilon in the rest of the sequence. + if not firsts.is_epsilon[symbol]: + epsilon = False + + prev_symbol = symbol + + return FollowInfo(follows=follows) + + class GenerateSLR1(GenerateLR0): """Generate parse tables for SLR1 grammars. @@ -440,115 +1234,48 @@ class GenerateSLR1(GenerateLR0): non-terminal. That means SLR1 parsers need to know how to generate 'follow(A)', which - means they need to know how to generate 'first(A)', which is most of the - code in this class. + means they need to know how to generate 'first(A)'. See FirstInfo and + FollowInfo for the details on how this is computed. 
""" - def gen_first_symbol(self, symbol, visited): - """Compute the first set for a single symbol. - If a symbol can be empty, then the set contains epsilon, which we - represent as python's `None`. + _firsts: FirstInfo + _follows: FollowInfo - The first set is the set of tokens that can appear as the first token - for a given symbol. (Obviously, if the symbol is itself a token, then - this is trivial.) - - 'visited' is a set of already visited symbols, to stop infinite - recursion on left-recursive grammars. That means that sometimes this - function can return an empty tuple. Don't confuse that with a tuple - containing epsilon: that's a tuple containing `None`, not an empty - tuple. + def __init__(self, *args, **kwargs): + """See the constructor of GenerateLR0 for an explanation of the + parameters to the constructor and what they mean. """ - if symbol in self.terminals: - return (symbol,) - elif symbol in visited: - return () - else: - assert symbol in self.nonterminals - visited.add(symbol) + super().__init__(*args, **kwargs) - # All the firsts from all the productions. - firsts = [ - self.gen_first(rule[1], visited) - for rule in self.grammar - if rule[0] == symbol - ] + # We store the firsts not because we need them here, but because LR1 + # and LALR need them. + self._firsts = FirstInfo.from_grammar(self.grammar, self.terminal) + self._follows = FollowInfo.from_grammar( + self.grammar, + self.terminal, + self.start_symbol, + self.end_symbol, + self._firsts, + ) - result = () - for fs in firsts: - result = result + tuple(f for f in fs if f not in result) - - return tuple(sorted(result)) - - def gen_first(self, symbols, visited=None): - """Compute the first set for a sequence of symbols. - - The first set is the set of tokens that can appear as the first token - for this sequence of symbols. The interesting wrinkle in computing the - first set for a sequence of symbols is that we keep computing the first - sets so long as epsilon appears in the set. 
i.e., if we are computing - for ['A', 'B', 'C'] and the first set of 'A' contains epsilon, then the - first set for the *sequence* also contains the first set of ['B', 'C'], - since 'A' could be missing entirely. - - An epsilon in the result is indicated by 'None'. There will always be - at least one element in the result. - - The 'visited' parameter, if not None, is a set of symbols that are - already in the process of being evaluated, to deal with left-recursive - grammars. (See gen_first_symbol for more.) - """ - if len(symbols) == 0: - return (None,) # Epsilon. - else: - if visited is None: - visited = set() - result = self.gen_first_symbol(symbols[0], visited) - if None in result: - result = tuple(s for s in result if s is not None) - result = result + self.gen_first(symbols[1:], visited) - result = tuple(sorted(set(result))) - return result - - def gen_follow(self, symbol, visited=None): + def gen_follow(self, symbol: int) -> set[int]: """Generate the follow set for the given nonterminal. The follow set for a nonterminal is the set of terminals that can follow the nonterminal in a valid sentence. The resulting set never contains epsilon and is never empty, since we should always at least ground out at '$', which is the end-of-stream marker. + + See FollowInfo for more information on how this is determined. """ - if symbol == '__start': - return tuple('$') + return self._follows.follows[symbol] - assert symbol in self.nonterminals - - # Deal with left-recursion. 
- if visited is None: - visited = set() - if symbol in visited: - return () - visited.add(symbol) - - follow = () - for production in self.grammar: - for index, prod_symbol in enumerate(production[1]): - if prod_symbol != symbol: - continue - - first = self.gen_first(production[1][index+1:]) - follow = follow + tuple(f for f in first if f is not None) - if None in first: - follow = follow + self.gen_follow(production[0], visited) - - assert None not in follow # Should always ground out at __start - return follow - - def gen_reduce_set(self, config): + def gen_reduce_set(self, config: Configuration) -> typing.Iterable[int]: """Return the set of symbols that indicate we should reduce the given config. - In an SLR1 parser, this is the follow set of the config nonterminal.""" + In an SLR1 parser, this is the follow set of the config nonterminal. + """ return self.gen_follow(config.name) @@ -563,16 +1290,39 @@ class GenerateLR1(GenerateSLR1): details. (Except for the start configuration, which has '$' as its lookahead.) """ - def gen_reduce_set(self, config): + + def gen_first(self, symbols: typing.Iterable[int]) -> typing.Tuple[set[int], bool]: + """Return the first set for a *sequence* of symbols. + + (This is more than FIRST: we need to know the first thing that can + happen in this particular sequence right here.) + + Build the set by combining the first sets of the symbols from left to + right as long as epsilon remains in the first set. If we reach the end + and every symbol has had epsilon, then this set also has epsilon. + + Otherwise we can stop as soon as we get to a non-epsilon first(), and + our result does not have epsilon. + """ + result = set() + for s in symbols: + result.update(self._firsts.firsts[s]) + if not self._firsts.is_epsilon[s]: + return (result, False) + + return (result, True) + + def gen_reduce_set(self, config: Configuration) -> typing.Iterable[int]: """Return the set of symbols that indicate we should reduce the given config. 
- In an LR1 parser, this is the lookahead of the configuration.""" + In an LR1 parser, this is the lookahead of the configuration. + """ return config.lookahead - def gen_closure_next(self, config): - """Return the next set of configurations in the closure for - config. + @functools.cache + def gen_closure_next(self, config: Configuration): + """Return the next set of configurations in the closure for config. In LR1 parsers, we must compute the lookahead for the configurations we're adding to the closure. The lookahead for the new configurations @@ -583,29 +1333,21 @@ class GenerateLR1(GenerateSLR1): from an upstream production in the grammar.) (See the documentation in GenerateLR0 for more information on how - this function fits into the whole process.) + this function fits into the whole process, specifically `gen_closure`.) """ - if config.at_end: + config_next = config.next + if config_next is None: return () else: next = [] - for rule in self.grammar: - if rule[0] != config.next: - continue + for rule in self.grammar[config_next]: + lookahead, epsilon = self.gen_first(config.rest) + if epsilon: + lookahead.update(config.lookahead) + lookahead_tuple = tuple(sorted(lookahead)) + next.append(Configuration.from_rule(config_next, rule, lookahead=lookahead_tuple)) - # N.B.: We can't just append config.lookahead to config.rest - # and compute first(), because lookahead is a *set*. So - # in this case we just say if 'first' contains epsilon, - # then we need to remove the epsilon and union with the - # existing lookahead. - lookahead = self.gen_first(config.rest) - if None in lookahead: - lookahead = tuple(l for l in lookahead if l is not None) - lookahead = lookahead + config.lookahead - lookahead = tuple(sorted(set(lookahead))) - next.append(Configuration.from_rule(rule, lookahead=lookahead)) - - return tuple(next) + return tuple(sorted(next)) def gen_all_sets(self): """Generate all of the configuration sets for the grammar. 
@@ -613,11 +1355,12 @@ class GenerateLR1(GenerateSLR1): In LR1 parsers, we must remember to set the lookahead of the start symbol to '$'. """ - initial_set = self.gen_closure( - Configuration.from_rule(self.grammar[0], lookahead=('$',)), - (), + seeds = tuple( + Configuration.from_rule(self.start_symbol, rule, lookahead=(self.end_symbol,)) + for rule in self.grammar[self.start_symbol] ) - return self.gen_sets(initial_set, ()) + initial_set = self.gen_closure(seeds) + return self.gen_sets(initial_set) class GenerateLALR(GenerateLR1): @@ -631,9 +1374,14 @@ class GenerateLALR(GenerateLR1): it does lose information. The advantage is that the number of parser states is much much smaller in LALR than in LR(1). + If you can get away with generating LALR tables for a grammar than you + should do it. + (Note that because we use immutable state everywhere this generator does - a lot of copying and allocation.) + a lot of copying and allocation. This particular generator could still + use a bunch of improvement, probably.) """ + def merge_sets(self, config_set_a, config_set_b): """Merge the two config sets, by keeping the item cores but merging the lookahead sets for each item. 
@@ -642,20 +1390,20 @@ class GenerateLALR(GenerateLR1): merged = [] for index, a in enumerate(config_set_a): b = config_set_b[index] - assert a.replace(lookahead=()) == b.replace(lookahead=()) + assert a.clear_lookahead() == b.clear_lookahead() new_lookahead = a.lookahead + b.lookahead new_lookahead = tuple(sorted(set(new_lookahead))) - merged.append(a.replace(lookahead=new_lookahead)) + merged.append(a.clear_lookahead()) return tuple(merged) def sets_equal(self, a, b): - a_no_la = tuple(s.replace(lookahead=()) for s in a) - b_no_la = tuple(s.replace(lookahead=()) for s in b) + a_no_la = tuple(s.clear_lookahead() for s in a) + b_no_la = tuple(s.clear_lookahead() for s in b) return a_no_la == b_no_la - def gen_sets(self, config_set, F): + def gen_sets(self, config_set) -> ConfigurationSetInfo: """Recursively generate all configuration sets starting from the provided set, and merge them with the provided set 'F'. @@ -665,28 +1413,331 @@ class GenerateLALR(GenerateLR1): then instead of returning F unchanged, we merge the two equal sets and replace the set in F, returning the modified set. """ - config_set_no_la = tuple(s.replace(lookahead=()) for s in config_set) - for index, existing in enumerate(F): - existing_no_la = tuple(s.replace(lookahead=()) for s in existing) - if config_set_no_la == existing_no_la: - merged_set = self.merge_sets(config_set, existing) - return F[:index] + (merged_set,) + F[index+1:] + F = {} + successors = [] + pending = [config_set] + while len(pending) > 0: + config_set = pending.pop() + config_set_no_la = tuple(s.clear_lookahead() for s in config_set) - # No merge candidate found, proceed. 
- new_F = F + (config_set,) - for successor in self.gen_all_successors(config_set): - new_F = self.gen_sets(successor, new_F) + existing = F.get(config_set_no_la) + if existing is not None: + F[config_set_no_la] = self.merge_sets(config_set, existing) + else: + F[config_set_no_la] = config_set + for symbol, successor in self.gen_all_successors(config_set): + successor_no_la = tuple(s.clear_lookahead() for s in successor) + successors.append((config_set_no_la, symbol, successor_no_la)) + pending.append(successor) - return new_F + # Register all the actually merged, final config sets. + result = ConfigurationSetInfo() + for config_set in F.values(): + result.register_config_set(config_set) - def find_set_index(self, sets, set): - """Find the specified set in the set of sets, and return the - index, or None if it is not found. + # Now record all the successors that we found. Of course, the actual + # sets that wound up in the ConfigurationSetInfo don't match anything + # we found during the previous phase. + # + # *Fortunately* we recorded the no-lookahead keys in the successors + # so we can find the final sets, then look them up in the registered + # sets, and actually register the successor. + for config_set_no_la, symbol, successor_no_la in successors: + actual_config_set = F[config_set_no_la] + from_index = result.config_set_key[actual_config_set] + + actual_successor = F[successor_no_la] + to_index = result.config_set_key[actual_successor] + + result.add_successor(from_index, symbol, to_index) + + return result + + +############################################################################### +# Sugar for constructing grammars +############################################################################### +# This is the "high level" API for constructing grammars. +class Rule: + """A token (terminal), production (nonterminal), or some other + combination thereof. Rules are composed and then flattened into + productions. 
+ """ + + def __or__(self, other) -> "Rule": + return AlternativeRule(self, other) + + def __add__(self, other) -> "Rule": + return SequenceRule(self, other) + + @abc.abstractmethod + def flatten(self) -> typing.Generator[list["str | Token"], None, None]: + """Convert this potentially nested and branching set of rules into a + series of nice, flat symbol lists. + + e.g., if this rule is (X + (A | (B + C | D))) then flattening will + yield something like: + + ["X", "A"] + ["X", "B", "C"] + ["X", "B", "D"] + + Isn't that nice? + + Note that Token rules remain unchanged in the result: this is so we + can better distinguish terminals from nonterminals while processing + the grammar. """ - for i, s in enumerate(sets): - if self.sets_equal(s, set): - return i - return None + raise NotImplementedError() + + +class Token(Rule): + """A token, or terminal symbol in the grammar.""" + + value: str + + def __init__(self, value): + self.value = sys.intern(value) + + def flatten(self) -> typing.Generator[list[str], None, None]: + # We are just ourselves when flattened. + yield [self] + + +class NonTerminal(Rule): + """A non-terminal, or a production, in the grammar. + + You probably don't want to create this directly; instead you probably want + to use the `@rule` decorator to associate this with a function in your + grammar class. + """ + + def __init__(self, fn: typing.Callable[["Grammar"], Rule], name: str | None = None): + """Create a new NonTerminal. + + `fn` is the function that will yield the `Rule` which is the + right-hand-side of this production; it will be flattened with `flatten`. + `name` is the name of the production- if unspecified (or `None`) it will + be replaced with the `__name__` of the provided fn. + """ + self.fn = fn + self.name = name or fn.__name__ + + def generate_body(self, grammar) -> list[list[str | Token]]: + """Generate the body of the non-terminal. 
+ + We do this by first calling the associated function in order to get a + Rule, and then flattening the Rule into the associated set of + productions. + """ + return [rule for rule in self.fn(grammar).flatten()] + + def flatten(self) -> typing.Generator[list[str | Token], None, None]: + # Although we contain multitudes, when flattened we're being asked in + # the context of some other production. Yield ourselves, and trust that + # in time we will be asked to generate our body. + yield [self.name] + + +class AlternativeRule(Rule): + """A rule that matches if one or another rule matches.""" + + def __init__(self, left: Rule, right: Rule): + self.left = left + self.right = right + + def flatten(self) -> typing.Generator[list[str], None, None]: + # All the things from the left of the alternative, then all the things + # from the right, never intermingled. + yield from self.left.flatten() + yield from self.right.flatten() + + +class SequenceRule(Rule): + """A rule that matches if a first part matches, followed by a second part. + Two things in order. + """ + + def __init__(self, first: Rule, second: Rule): + self.first = first + self.second = second + + def flatten(self) -> typing.Generator[list[str], None, None]: + # All the things in the prefix.... + for first in self.first.flatten(): + # ...potentially followed by all the things in the suffix. + for second in self.second.flatten(): + yield first + second + + +class NothingRule(Rule): + """A rule that matches no input. Nothing, the void. Don't make a new one of + these, you're probably better off just using the singleton `Nothing`. + """ + + def flatten(self) -> typing.Generator[list[str], None, None]: + # It's quiet in here. + yield [] + + +Nothing = NothingRule() + + +def seq(*args: list[Rule]) -> Rule: + """A rule that matches a sequence of rules. + + (A helper function that combines its arguments into nested sequences.) 
+ """ + result = args[0] + for rule in args[1:]: + result = SequenceRule(result, rule) + return result + + +@typing.overload +def rule(name: None | str = None) -> typing.Callable[[typing.Callable], Rule]: ... + + +@typing.overload +def rule(fn: typing.Callable) -> Rule: ... + + +def rule( + name_or_fn: None | str | typing.Callable = None, +) -> Rule | typing.Callable[[typing.Callable], Rule]: + """The decorator that marks a method in a Grammar object as a nonterminal + rule. + + As with all the best decorators, it can be called with or without arguments. + If called with one argument, that argument is a name that overrides the name + of the nonterminal, which defaults to the name of the function. + """ + + def _rule(callable): + return NonTerminal(callable, name) + + if callable(name_or_fn): + name = name_or_fn.__name__ + return _rule(name_or_fn) + else: + name = name_or_fn + return _rule + + +class Grammar: + """The base class for defining a grammar. + + Inherit from this, and and define members for your nonterminals, and then + use the `build_tables` method to construct the parse tables. + + + Here's an example of a simple grammar: + + PLUS = Token('+') + LPAREN = Token('(') + RPAREN = Token(')') + ID = Token('id') + + class SimpleGrammar(Grammar): + @rule + def expression(self): + return seq(self.expression, PLUS, self.term) | self.term + + @rule + def term(self): + return seq(LPAREN, self.expression, RPAREN) | ID + + Not very exciting, perhaps, but it's something. 
+ """ + + def __init__(self, precedence: list[typing.Tuple[Assoc, list[Token | NonTerminal]]] = None): + if precedence is None: + precedence = getattr(self, "precedence", []) + + precedence_table = {} + for precedence, (associativity, symbols) in enumerate(precedence): + for symbol in symbols: + if isinstance(symbol, Token): + key = symbol.value + elif isinstance(symbol, NonTerminal): + key = symbol.name + else: + raise ValueError(f"{symbol} must be either a Token or a NonTerminal") + + precedence_table[key] = (associativity, precedence + 1) + + self._precedence = precedence_table + + def generate_nonterminal_dict(self, start: str) -> dict[str, list[list[str | Token]]]: + """Convert the rules into a dictionary of productions. + + Our table generators work on a very flat set of productions. This is the + first step in flattening the productions from the members: walk the rules + starting from the given start rule and flatten them, one by one, into a + dictionary that maps nonterminal rule name to its associated list of + productions. + """ + rules = inspect.getmembers(self, lambda x: isinstance(x, NonTerminal)) + nonterminals = {rule.name: rule for _, rule in rules} + + grammar = {} + + rule = nonterminals.get(start) + if rule is None: + raise ValueError(f"Cannot find a rule named '{start}'") + queue = [rule] + while len(queue) > 0: + rule = queue.pop() + if rule.name in grammar: + continue + + body = rule.generate_body(self) + for clause in body: + for symbol in clause: + if not isinstance(symbol, Token): + assert isinstance(symbol, str) + nonterminal = nonterminals.get(symbol) + if nonterminal is None: + raise ValueError(f"While processing {rule.name}: cannot find {symbol}") + queue.append(nonterminal) + + grammar[rule.name] = body + + return grammar + + def desugar(self, start: str) -> list[typing.Tuple[str, list[str]]]: + """Convert the rules into a flat list of productions. + + Our table generators work from a very flat set of productions. 
The form + produced by this function is one level flatter than the one produced by + generate_nonterminal_dict- less useful to people, probably, but it is + the input form needed by the Generator. + """ + temp_grammar = self.generate_nonterminal_dict(start) + + grammar = [] + for rule_name, clauses in temp_grammar.items(): + for clause in clauses: + new_clause = [] + for symbol in clause: + if isinstance(symbol, Token): + new_clause.append(symbol.value) + else: + new_clause.append(symbol) + + grammar.append((rule_name, new_clause)) + + return grammar + + def build_table(self, start: str, generator=GenerateLALR): + """Construct a parse table for this grammar, starting at the named + nonterminal rule. + """ + desugared = self.desugar(start) + + gen = generator(start, desugared, precedence=self._precedence) + table = gen.gen_table() + return table ############################################################################### @@ -694,173 +1745,182 @@ class GenerateLALR(GenerateLR1): ############################################################################### def format_node(node): """Print out an indented concrete syntax tree, from parse().""" - lines = [ - '{name}'.format(name=node[0]) - ] + [ - ' ' + line - for child in node[1] - for line in format_node(child).split('\n') + lines = ["{name}".format(name=node[0])] + [ + " " + line for child in node[1] for line in format_node(child).split("\n") ] - return '\n'.join(lines) + return "\n".join(lines) def format_table(generator, table): """Format a parser table so pretty.""" - def format_action(state, terminal): - action = state.get(terminal, ('error',)) - if action[0] == 'accept': - return 'accept' - elif action[0] == 'shift': - return 's' + str(action[1]) - elif action[0] == 'error': - return '' - elif action[0] == 'reduce': - return 'r' + str(action[1]) + def format_action(state, terminal): + action = state.get(terminal, ("error",)) + if action[0] == "accept": + return "accept" + elif action[0] == "shift": + return 
"s" + str(action[1]) + elif action[0] == "error": + return "" + elif action[0] == "reduce": + return "r" + str(action[1]) + + terminals = list(sorted(generator.alphabet[i] for i, v in enumerate(generator.terminal) if v)) + nonterminals = list( + sorted(generator.alphabet[i] for i, v in enumerate(generator.nonterminal) if v) + ) header = " | {terms} | {nts}".format( - terms=' '.join( - '{0: <6}'.format(terminal) - for terminal in sorted(generator.terminals) - ), - nts=' '.join( - '{0: <5}'.format(nt) - for nt in sorted(generator.nonterminals) - ), + terms=" ".join("{0: <6}".format(terminal) for terminal in terminals), + nts=" ".join("{0: <5}".format(nt) for nt in nonterminals), ) lines = [ header, - '-' * len(header), + "-" * len(header), ] + [ "{index: <3} | {actions} | {gotos}".format( index=i, - actions=' '.join( - '{0: <6}'.format(format_action(row, terminal)) - for terminal in sorted(generator.terminals) - ), - gotos=' '.join( - '{0: <5}'.format(row.get(nt, ('error', ''))[1]) - for nt in sorted(generator.nonterminals) + actions=" ".join( + "{0: <6}".format(format_action(row, terminal)) for terminal in terminals ), + gotos=" ".join("{0: <5}".format(row.get(nt, ("error", ""))[1]) for nt in nonterminals), ) for i, row in enumerate(table) ] - return '\n'.join(lines) + return "\n".join(lines) ############################################################################### # Examples ############################################################################### -# OK, this is a very simple LR0 grammar. -grammar_simple = [ - ('E', ['E', '+', 'T']), - ('E', ['T']), - ('T', ['(', 'E', ')']), - ('T', ['id']), -] +def examples(): + def dump_grammar(grammar): + for name, symbols in grammar: + print(f"{name} -> {symbols}") + print() -gen = GenerateLR0('E', grammar_simple) -table = gen.gen_table() -tree = parse(table, ['id', '+', '(', 'id', ')']) -print(format_node(tree) + "\n") -print() + # OK, this is a very simple LR0 grammar. 
+ print("grammar_simple:") + grammar_simple = [ + ("E", ["E", "+", "T"]), + ("E", ["T"]), + ("T", ["(", "E", ")"]), + ("T", ["id"]), + ] -# This one doesn't work with LR0, though, it has a shift/reduce conflict. -grammar_lr0_shift_reduce = grammar_simple + [ - ('T', ['id', '[', 'E', ']']), -] -try: - gen = GenerateLR0('E', grammar_lr0_shift_reduce) + gen = GenerateLR0("E", grammar_simple) table = gen.gen_table() - assert False -except ValueError as e: - print(e) -print() + print(format_table(gen, table)) + tree = parse(table, ["id", "+", "(", "id", ")"]) + print(format_node(tree) + "\n") + print() -# Nor does this: it has a reduce/reduce conflict. -grammar_lr0_reduce_reduce = grammar_simple + [ - ('E', ['V', '=', 'E']), - ('V', ['id']), -] -try: - gen = GenerateLR0('E', grammar_lr0_reduce_reduce) + # This one doesn't work with LR0, though, it has a shift/reduce conflict. + print("grammar_lr0_shift_reduce (LR0):") + grammar_lr0_shift_reduce = grammar_simple + [ + ("T", ["id", "[", "E", "]"]), + ] + try: + gen = GenerateLR0("E", grammar_lr0_shift_reduce) + table = gen.gen_table() + assert False + except ValueError as e: + print(e) + print() + + # Nor does this: it has a reduce/reduce conflict. + print("grammar_lr0_reduce_reduce (LR0):") + grammar_lr0_reduce_reduce = grammar_simple + [ + ("E", ["V", "=", "E"]), + ("V", ["id"]), + ] + try: + gen = GenerateLR0("E", grammar_lr0_reduce_reduce) + table = gen.gen_table() + assert False + except ValueError as e: + print(e) + print() + + # Nullable symbols just don't work with constructs like this, because you can't + # look ahead to figure out if you should reduce an empty 'F' or not. 
+ print("grammar_nullable (LR0):") + grammar_nullable = [ + ("E", ["F", "boop"]), + ("F", ["beep"]), + ("F", []), + ] + try: + gen = GenerateLR0("E", grammar_nullable) + table = gen.gen_table() + assert False + except ValueError as e: + print(e) + print() + + print("grammar_lr0_shift_reduce (SLR1):") + dump_grammar(grammar_lr0_shift_reduce) + gen = GenerateSLR1("E", grammar_lr0_shift_reduce) + print(f"Follow('E'): {str([gen.alphabet[f] for f in gen.gen_follow(gen.symbol_key['E'])])}") table = gen.gen_table() - assert False -except ValueError as e: - print(e) -print() + print(format_table(gen, table)) + tree = parse(table, ["id", "+", "(", "id", "[", "id", "]", ")"], trace=True) + print(format_node(tree) + "\n") + print() -# Nullable symbols just don't work with constructs like this, because you can't -# look ahead to figure out if you should reduce an empty 'F' or not. -grammar_nullable = [ - ('E', ['F', 'boop']), - ('F', ['beep']), - ('F', []), -] -try: - gen = GenerateLR0('E', grammar_nullable) + # SLR1 can't handle this. + print("grammar_aho_ullman_1 (SLR1):") + grammar_aho_ullman_1 = [ + ("S", ["L", "=", "R"]), + ("S", ["R"]), + ("L", ["*", "R"]), + ("L", ["id"]), + ("R", ["L"]), + ] + try: + gen = GenerateSLR1("S", grammar_aho_ullman_1) + table = gen.gen_table() + assert False + except ValueError as e: + print(e) + print() + + # Here's an example with a full LR1 grammar, though. 
+ print("grammar_aho_ullman_2 (LR1):") + grammar_aho_ullman_2 = [ + ("S", ["X", "X"]), + ("X", ["a", "X"]), + ("X", ["b"]), + ] + gen = GenerateLR1("S", grammar_aho_ullman_2) table = gen.gen_table() - assert False -except ValueError as e: - print(e) + print(format_table(gen, table)) + parse(table, ["b", "a", "a", "b"], trace=True) + print() -gen = GenerateSLR1('E', grammar_lr0_shift_reduce) -print("First: {first}".format(first=str(gen.gen_first(['E'])))) -print("Follow: {follow}".format(follow=str(gen.gen_follow('E')))) -table = gen.gen_table() -print(format_table(gen, table)) -tree = parse(table, ['id', '+', '(', 'id', '[', 'id', ']', ')']) -print(format_node(tree) + "\n") -print() - -# SLR1 can't handle this. -grammar_aho_ullman_1 = [ - ('S', ['L', '=', 'R']), - ('S', ['R']), - ('L', ['*', 'R']), - ('L', ['id']), - ('R', ['L']), -] -try: - gen = GenerateSLR1('S', grammar_aho_ullman_1) + # What happens if we do LALR to it? + print("grammar_aho_ullman_2 (LALR):") + gen = GenerateLALR("S", grammar_aho_ullman_2) table = gen.gen_table() - assert False -except ValueError as e: - print(e) -print() + print(format_table(gen, table)) + print() -# Here's an example with a full LR1 grammar, though. -grammar_aho_ullman_2 = [ - ('S', ['X', 'X']), - ('X', ['a', 'X']), - ('X', ['b']), -] -gen = GenerateLR1('S', grammar_aho_ullman_2) -table = gen.gen_table() -print(format_table(gen, table)) -parse(table, ['b', 'a', 'a', 'b'], trace=True) -print() + # A fun LALAR grammar. + print("grammar_lalr:") + grammar_lalr = [ + ("S", ["V", "E"]), + ("E", ["F"]), + ("E", ["E", "+", "F"]), + ("F", ["V"]), + ("F", ["int"]), + ("F", ["(", "E", ")"]), + ("V", ["id"]), + ] + gen = GenerateLALR("S", grammar_lalr) + table = gen.gen_table() + print(format_table(gen, table)) + print() -# What happens if we do LALR to it? -gen = GenerateLALR('S', grammar_aho_ullman_2) -table = gen.gen_table() -print(format_table(gen, table)) -print() -# A fun LALAR grammar. 
-grammar_lalr = [ - ('S', ['V', 'E']), - - ('E', ['F']), - ('E', ['E', '+', 'F']), - - ('F', ['V']), - ('F', ['int']), - ('F', ['(', 'E', ')']), - - ('V', ['id']), -] -gen = GenerateLALR('S', grammar_lalr) -table = gen.gen_table() -print(format_table(gen, table)) -print() +if __name__ == "__main__": + examples() diff --git a/parser_faster.py b/parser_faster.py deleted file mode 100644 index 8a28c85..0000000 --- a/parser_faster.py +++ /dev/null @@ -1,1295 +0,0 @@ -"""I wanted to try to use the code in `parser.py` to do real work, and as you -might expect the code did NOT work acceptibly. - -This version has some performance work done. - -It also supports precedence. - -2023 -""" -import collections -import dataclasses -import enum -import functools -import typing - - -############################################################################### -# LR0 -# -# We start with LR0 parsers, because they form the basis of everything else. -############################################################################### -class Configuration: - """A rule being tracked in a state. - - (Note: technically, lookahead isn't used until we get to LR(1) parsers, - but if left at its default it's harmless. Ignore it until you get to - the part about LR(1).) - """ - __slots__ = ( - 'name', - 'symbols', - 'position', - 'lookahead', - 'next', - 'at_end', - '_vals', - '_hash', - ) - - name: int - symbols: typing.Tuple[int, ...] - position: int - lookahead: typing.Tuple[int, ...] 
- next: int | None - at_end: bool - - _vals: typing.Tuple - _hash: int - - def __init__(self, name, symbols, position, lookahead) -> None: - self.name = name - self.symbols = symbols - self.position = position - self.lookahead = lookahead - - at_end = position == len(symbols) - self.at_end = at_end - self.next = symbols[position] if not at_end else None - - self._vals = (name, symbols, position, lookahead) - self._hash = hash(self._vals) - - @classmethod - def from_rule(cls, name: int, symbols: typing.Tuple[int, ...], lookahead=()): - return Configuration( - name=name, - symbols=symbols, - position=0, - lookahead=lookahead, - ) - - def __hash__(self) -> int: - return self._hash - - def __eq__(self, value: object, /) -> bool: - if value is self: - return True - if not isinstance(value, Configuration): - return NotImplemented - - return ( - value._hash == self._hash and - value.name == self.name and - value.position == self.position and - value.symbols == self.symbols and - value.lookahead == self.lookahead - ) - - def __lt__(self, value) -> bool: - if not isinstance(value, Configuration): - return NotImplemented - return self._vals < value._vals - - def __gt__(self, value) -> bool: - if not isinstance(value, Configuration): - return NotImplemented - return self._vals > value._vals - - def __le__(self, value) -> bool: - if not isinstance(value, Configuration): - return NotImplemented - return self._vals <= value._vals - - def __ge__(self, value) -> bool: - if not isinstance(value, Configuration): - return NotImplemented - return self._vals >= value._vals - - def replace_position(self, new_position): - return Configuration( - name=self.name, - symbols=self.symbols, - position=new_position, - lookahead=self.lookahead, - ) - - def clear_lookahead(self): - return Configuration( - name=self.name, - symbols=self.symbols, - position=self.position, - lookahead=(), - ) - - @property - def rest(self): - return self.symbols[(self.position+1):] - - def format(self, alphabet: 
list[str]) -> str: - la = ", " + str(tuple(alphabet[i] for i in self.lookahead)) if self.lookahead != () else "" - return "{name} -> {bits}{lookahead}".format( - name=alphabet[self.name], - bits=' '.join([ - '* ' + alphabet[sym] if i == self.position else alphabet[sym] - for i, sym in enumerate(self.symbols) - ]) + (' *' if self.at_end else ''), - lookahead=la, - ) - -ConfigSet = typing.Tuple[Configuration, ...] - -class ConfigurationSetInfo: - """When we build a grammar into a table, the first thing we need to do is - generate all the configuration sets and their successors. This is the - structure that tracks the result of that computation. - - (Different generators vary in the details of how they generate this - structure, but they all compute this information.) - """ - config_set_key: dict[ConfigSet, int] - sets: list[ConfigSet] - successors: list[dict[int, int]] - - def __init__(self): - self.config_set_key = {} - self.sets = [] - self.successors = [] - - def register_config_set(self, c: ConfigSet) -> typing.Tuple[int, bool]: - """Potentially add a new config set to the set of sets. Returns the - canonical ID of the set within this structure, along with a boolean - indicating whether the set was just added or not. - - (You can use this integer to get the set back, if you need it, and - also access the successors table.) 
- """ - existing = self.config_set_key.get(c) - if existing is not None: - return existing, False - - index = len(self.sets) - self.sets.append(c) - self.successors.append({}) - self.config_set_key[c] = index - return index, True - - def add_successor(self, c_id: int, symbol: int, successor: int): - """Register sucessor(`c_id`, `symbol`) -> `successor` - """ - self.successors[c_id][symbol] = successor - - def find_path_to_set(self, target_set: ConfigSet) -> list[int]: - target_index = self.config_set_key[target_set] - visited = set() - - queue = collections.deque() - queue.appendleft((0, [])) - while len(queue) > 0: - set_index, path = queue.pop() - if set_index == target_index: - return path - - if set_index in visited: - continue - visited.add(set_index) - - for symbol, successor in self.successors[set_index].items(): - queue.appendleft((successor, path + [symbol])) - - raise KeyError("Unable to find a path to the target set!") - - -class Assoc(enum.Enum): - """Associativity of a rule.""" - NONE = 0 - LEFT = 1 - RIGHT = 2 - - - -class ErrorCollection: - errors: dict[ConfigSet, dict[int, dict[Configuration, typing.Tuple]]] - - def __init__(self): - self.errors = {} - - def any(self) -> bool: - return len(self.errors) > 0 - - def add_error(self, config_set: ConfigSet, symbol: int, config: Configuration, action: typing.Tuple): - set_errors = self.errors.get(config_set) - if set_errors is None: - set_errors = {} - self.errors[config_set] = set_errors - - symbol_errors = set_errors.get(symbol) - if symbol_errors is None: - symbol_errors = {} - set_errors[symbol] = symbol_errors - - symbol_errors[config] = action - - - def format( - self, - alphabet: list[str], - all_sets: ConfigurationSetInfo, - ) -> str | None: - if len(self.errors) is None: - return None - - errors = [] - for config_set, set_errors in self.errors.items(): - path = all_sets.find_path_to_set(config_set) - path_str = " ".join(alphabet[s] for s in path) - - for symbol, symbol_errors in 
set_errors.items(): - lines = [] - lines.append(f"When we have parsed '{path_str}' and see '{alphabet[symbol]}' we don't know whether:") - for config, action in symbol_errors.items(): - name = alphabet[config.name] - rule = " ".join(f"{'* ' if config.position == i else ''}{alphabet[s]}" for i,s in enumerate(config.symbols)) - if config.next is None: - rule += " *" - - if action[0] == 'reduce': - action_str = f"pop {action[2]} values off the stack and make a {action[1]}" - elif action[0] == 'shift': - action_str = "consume the token and keep going" - elif action[0] == 'accept': - action_str = "accept the parse" - else: - assert action[0] == "goto", f"Unknown action {action[0]}" - raise Exception("Shouldn't conflict on goto ever") - - lines.append(f" - We are in the rule `{name}: {rule}` and we should {action_str}") - - errors.append("\n".join(lines)) - - return "\n\n".join(errors) - - -class TableBuilder(object): - errors: ErrorCollection - table: list[dict[str, typing.Tuple]] - alphabet: list[str] - precedence: typing.Tuple[typing.Tuple[Assoc, int], ...] 
- row: None | list[typing.Tuple[None | typing.Tuple, None | Configuration]] - - def __init__(self, alphabet: list[str], precedence: typing.Tuple[typing.Tuple[Assoc, int], ...]): - self.errors = ErrorCollection() - self.table = [] - self.alphabet = alphabet - self.precedence = precedence - self.row = None - - def flush(self, all_sets: ConfigurationSetInfo): - self._flush_row() - if self.errors.any(): - errors = self.errors.format(self.alphabet, all_sets) - raise ValueError(f"Errors building the table:\n\n{errors}") - return self.table - - def new_row(self, config_set: ConfigSet): - self._flush_row() - self.row = [(None, None) for _ in self.alphabet] - self.current_config_set = config_set - - def _flush_row(self): - if self.row: - actions = { - self.alphabet[k]: v[0] - for k, v in enumerate(self.row) - if v[0] is not None - } - self.table.append(actions) - - - def set_table_reduce(self, symbol: int, config): - action = ('reduce', self.alphabet[config.name], len(config.symbols)) - self._set_table_action(symbol, action, config) - - def set_table_accept(self, symbol: int, config: Configuration): - action = ('accept',) - self._set_table_action(symbol, action, config) - - def set_table_shift(self, symbol: int, index: int, config: Configuration): - action = ('shift', index) - self._set_table_action(symbol, action, config) - - def set_table_goto(self, symbol: int, index: int): - action = ('goto', index) - self._set_table_action(symbol, action, None) - - def _action_precedence(self, symbol, action, config): - if action[0] == 'shift': - return self.precedence[symbol] - else: - return self.precedence[config.name] - - def _set_table_action(self, symbol_id: int, action, config: Configuration|None): - """Set the action for 'symbol' in the table row to 'action'. - - This is destructive; it changes the table. It raises an error if - there is already an action for the symbol in the row. 
- """ - assert isinstance(symbol_id, int) - - assert self.row is not None - existing, existing_config = self.row[symbol_id] - if existing is not None and existing != action: - assert existing_config is not None - assert config is not None - - existing_assoc, existing_prec = self._action_precedence( - symbol_id, existing, existing_config) - new_assoc, new_prec = self._action_precedence( - symbol_id, action, config) - - if existing_prec > new_prec: - # Precedence of the action in the table already wins, do nothing. - return - - elif existing_prec == new_prec: - # It's an actual conflict, use associativity if we can. - # If there's a conflict in associativity then it's a real conflict! - assoc = Assoc.NONE - if existing_assoc == Assoc.NONE: - assoc = new_assoc - elif new_assoc == Assoc.NONE: - assoc = existing_assoc - elif new_assoc == existing_assoc: - assoc = new_assoc - - resolved = False - if assoc == Assoc.LEFT: - # Prefer reduce over shift - if action[0] == 'shift' and existing[0] == 'reduce': - action = existing - resolved = True - elif action[0] == 'reduce' and existing[0] == 'shift': - resolved = True - - elif assoc == Assoc.RIGHT: - # Prefer shift over reduce - if action[0] == 'shift' and existing[0] == 'reduce': - resolved = True - elif action[0] == 'reduce' and existing[0] == 'shift': - action = existing - resolved = True - - if not resolved: - # Record the conflicts. - self.errors.add_error(self.current_config_set, symbol_id, existing_config, existing) - self.errors.add_error(self.current_config_set, symbol_id, config, action) - - else: - # Precedence of the new action is greater than the existing - # action, just allow the overwrite with no change. - pass - - self.row[symbol_id] = (action, config) - - - -class GenerateLR0(object): - """Generate parser tables for an LR0 parser. 
- - The input grammars are of the form: - - grammar_simple = [ - ('E', ['E', '+', 'T']), - ('E', ['T']), - ('T', ['(', 'E', ')']), - ('T', ['id']), - ] - - Which is to say, they are a list of productions. Each production is a - tuple where the first element of the tuple is the name of the - non-terminal being added, and the second elment of the tuple is the - list of terminals and non-terminals that make up the production. - - There is currently no support for custom actions or alternation or - anything like that. If you want alternations that you'll have to lower - the grammar by hand into the simpler form first. - - Don't name anything with double-underscores; those are reserved for - the generator. Don't add '$' either, as it is reserved to mean - end-of-stream. Use an empty list to indicate nullability, that is: - - ('O', []), - - means that O can be matched with nothing. - """ - - alphabet: list[str] - grammar: list[list[typing.Tuple[int, ...]]] - nonterminal: typing.Tuple[bool, ...] - terminal: typing.Tuple[bool, ...] - precedence: typing.Tuple[typing.Tuple[Assoc, int], ...] - - symbol_key: dict[str, int] - start_symbol: int - end_symbol: int - - config_sets_key: dict[ConfigSet, int] - successors: list[set[int]] - - - def __init__( - self, - start: str, - grammar: list[typing.Tuple[str, list[str]]], - precedence: None | dict[str, typing.Tuple[Assoc, int]] = None, - ): - """Initialize the parser generator with the specified grammar and - start symbol. - """ - - # Work out the alphabet. - alphabet = set() - for name, rule in grammar: - alphabet.add(name) - alphabet.update(symbol for symbol in rule) - - # Check to make sure they didn't use anything that will give us - # heartburn later. 
- reserved = [a for a in alphabet if a.startswith('__') or a == '$'] - if reserved: - raise ValueError( - "Can't use {symbols} in grammars, {what} reserved.".format( - symbols=' or '.join(reserved), - what="it's" if len(reserved) == 1 else "they're", - ) - ) - - alphabet.add('__start') - alphabet.add('$') - self.alphabet = list(sorted(alphabet)) - - symbol_key = { - symbol: index - for index, symbol in enumerate(self.alphabet) - } - - start_symbol = symbol_key['__start'] - end_symbol = symbol_key['$'] - - assert self.alphabet[start_symbol] == '__start' - assert self.alphabet[end_symbol] == '$' - - # Turn the incoming grammar into a dictionary, indexed by nonterminal. - # - # We count on python dictionaries retaining the insertion order, like - # it or not. - full_grammar = [list() for _ in self.alphabet] - terminal = [True for _ in self.alphabet] - assert terminal[end_symbol] - - nonterminal = [False for _ in self.alphabet] - - for name, rule in grammar: - name_symbol = symbol_key[name] - - terminal[name_symbol] = False - nonterminal[name_symbol] = True - - rules = full_grammar[name_symbol] - rules.append(tuple(symbol_key[symbol] for symbol in rule)) - - self.grammar = full_grammar - self.grammar[start_symbol].append((symbol_key[start],)) - terminal[start_symbol] = False - nonterminal[start_symbol] = True - - self.terminal = tuple(terminal) - self.nonterminal = tuple(nonterminal) - - assert self.terminal[end_symbol] - assert self.nonterminal[start_symbol] - - if precedence is None: - precedence = {} - self.precedence = tuple(precedence.get(a, (Assoc.NONE, 0)) for a in self.alphabet) - - self.symbol_key = symbol_key - self.start_symbol = start_symbol - self.end_symbol = end_symbol - - @functools.cache - def gen_closure_next(self, config: Configuration): - """Return the next set of configurations in the closure for - config. 
- - If the position for config is just before a non-terminal, then the - next set of configurations is configurations for all of the - productions for that non-terminal, with the position at the - beginning. (If the position for config is just before a terminal, - or at the end of the production, then the next set is empty.) - """ - next = config.next - if next is None: - return () - else: - return tuple( - Configuration.from_rule(next, rule) - for rule in self.grammar[next] - ) - - def gen_closure(self, seeds: typing.Iterable[Configuration]) -> ConfigSet: - """Compute the closure for the specified configs. The closure is all - of the configurations we could be in. Specifically, if the position - for a config is just before a non-terminal then we must also consider - configurations where the rule is the rule for the non-terminal and - the position is just before the beginning of the rule. - - (We have replaced a recursive version with an iterative one.) - """ - closure = set() - pending = list(seeds) - pending_next = [] - while len(pending) > 0: - for config in pending: - if config in closure: - continue - - closure.add(config) - for next_config in self.gen_closure_next(config): - pending_next.append(next_config) - - temp = pending - pending = pending_next - pending_next = temp - pending_next.clear() - - return tuple(sorted(closure)) # TODO: Why tuple? - - def gen_successor(self, config_set: typing.Iterable[Configuration], symbol: int) -> ConfigSet: - """Compute the successor state for the given config set and the - given symbol. - - The successor represents the next state of the parser after seeing - the symbol. 
- """ - seeds = tuple( - config.replace_position(config.position + 1) - for config in config_set - if config.next == symbol - ) - - closure = self.gen_closure(seeds) - return closure - - def gen_all_successors(self, config_set: typing.Iterable[Configuration]) -> list[typing.Tuple[int, ConfigSet]]: - """Return all of the non-empty successors for the given config set.""" - possible = tuple(sorted({ - config.next - for config in config_set - if config.next is not None - })) - - next = [] - for symbol in possible: - successor = self.gen_successor(config_set, symbol) - if len(successor) > 0: - next.append((symbol, successor)) - - return next - - def gen_sets(self, config_set: typing.Tuple[Configuration,...]) -> ConfigurationSetInfo: - """Generate all configuration sets starting from the provided set.""" - result = ConfigurationSetInfo() - - successors = [] - pending = [config_set] - pending_next = [] - while len(pending) > 0: - for config_set in pending: - id, is_new = result.register_config_set(config_set) - if is_new: - for symbol, successor in self.gen_all_successors(config_set): - successors.append((id,symbol,successor)) - pending_next.append(successor) - - - temp = pending - pending = pending_next - pending_next = temp - pending_next.clear() - - for id,symbol,successor in successors: - result.add_successor(id, symbol, result.config_set_key[successor]) - - return result - - def gen_all_sets(self) -> ConfigurationSetInfo: - """Generate all of the configuration sets for the grammar.""" - seeds = tuple( - Configuration.from_rule(self.start_symbol, rule) - for rule in self.grammar[self.start_symbol] - ) - initial_set = self.gen_closure(seeds) - return self.gen_sets(initial_set) - - def gen_reduce_set(self, config: Configuration) -> typing.Iterable[int]: - """Return the set of symbols that indicate we should reduce the given - configuration. 
- - In an LR0 parser, this is just the set of all terminals.""" - del(config) - return [index for index, value in enumerate(self.terminal) if value] - - def gen_table(self): - """Generate the parse table. - - The parse table is a list of states. The first state in the list is - the starting state. Each state is a dictionary that maps a symbol to an - action. Each action is a tuple. The first element of the tuple is a - string describing what to do: - - - 'shift': The second element of the tuple is the state - number. Consume the input and push that state onto the stack. - - - 'reduce': The second element is the name of the non-terminal being - reduced, and the third element is the number of states to remove - from the stack. Don't consume the input; just remove the specified - number of things from the stack, and then consult the table again, - this time using the new top-of-stack as the current state and the - name of the non-terminal to find out what to do. - - - 'goto': The second element is the state number to push onto the - stack. In the literature, these entries are treated distinctly from - the actions, but we mix them here because they never overlap with the - other actions. (These are always associated with non-terminals, and - the other actions are always associated with terminals.) - - - 'accept': Accept the result of the parse, it worked. - - Anything missing from the row indicates an error. 
- """ - config_sets = self.gen_all_sets() - builder = TableBuilder(self.alphabet, self.precedence) - - for config_set_id, config_set in enumerate(config_sets.sets): - builder.new_row(config_set) - successors = config_sets.successors[config_set_id] - - for config in config_set: - config_next = config.next - if config_next is None: - if config.name != self.start_symbol: - for a in self.gen_reduce_set(config): - builder.set_table_reduce(a, config) - else: - builder.set_table_accept(self.end_symbol, config) - - elif self.terminal[config_next]: - index = successors[config_next] - builder.set_table_shift(config_next, index, config) - - # Gotos - for symbol, index in successors.items(): - if self.nonterminal[symbol]: - builder.set_table_goto(symbol, index) - - return builder.flush(config_sets) - - -def parse(table, input, trace=False): - """Parse the input with the generated parsing table and return the - concrete syntax tree. - - The parsing table can be generated by GenerateLR0.gen_table() or by any - of the other generators below. The parsing mechanism never changes, only - the table generation mechanism. - - input is a list of tokens. Don't stick an end-of-stream marker, I'll stick - one on for you. - """ - assert '$' not in input - input = input + ['$'] - input_index = 0 - - # Our stack is a stack of tuples, where the first entry is the state number - # and the second entry is the 'value' that was generated when the state was - # pushed. 
- stack : list[typing.Tuple[int, typing.Any]] = [(0, None)] - while True: - current_state = stack[-1][0] - current_token = input[input_index] - - action = table[current_state].get(current_token, ('error',)) - if trace: - print("{stack: <20} {input: <50} {action: <5}".format( - stack=repr([s[0] for s in stack]), - input=repr(input[input_index:]), - action=repr(action) - )) - - if action[0] == 'accept': - return stack[-1][1] - - elif action[0] == 'reduce': - name = action[1] - size = action[2] - - value = (name, tuple(s[1] for s in stack[-size:])) - stack = stack[:-size] - - goto = table[stack[-1][0]].get(name, ('error',)) - assert goto[0] == 'goto' # Corrupt table? - stack.append((goto[1], value)) - - elif action[0] == 'shift': - stack.append((action[1], (current_token, ()))) - input_index += 1 - - elif action[0] == 'error': - raise ValueError( - 'Syntax error: unexpected symbol {sym}'.format( - sym=current_token, - ), - ) - - -############################################################################### -# SLR(1) -############################################################################### -def add_changed(items: set[int], item: int)->bool: - old_len = len(items) - items.add(item) - return old_len != len(items) - -def update_changed(items: set[int], other: set[int]) -> bool: - old_len = len(items) - items.update(other) - return old_len != len(items) - -@dataclasses.dataclass(frozen=True) -class FirstInfo: - firsts: list[set[int]] - is_epsilon: list[bool] - - @classmethod - def from_grammar( - cls, - grammar: list[list[typing.Tuple[int,...]]], - terminal: typing.Tuple[bool, ...], - ): - # Add all terminals to their own firsts - firsts = [] - for index, is_terminal in enumerate(terminal): - firsts.append(set()) - if is_terminal: - firsts[index].add(index) - - epsilons = [False for _ in terminal] - changed = True - while changed: - changed = False - for name, rules in enumerate(grammar): - f = firsts[name] - for rule in rules: - if len(rule) == 0: - changed = 
changed or not epsilons[name] - epsilons[name] = True - continue - - for index, symbol in enumerate(rule): - other_firsts = firsts[symbol] - changed = update_changed(f, other_firsts) or changed - - is_last = index == len(rule) - 1 - if is_last and epsilons[symbol]: - # If this is the last symbol and the last - # symbol can be empty then I can be empty - # too! :P - changed = changed or not epsilons[name] - epsilons[name] = True - - if not epsilons[symbol]: - # If we believe that there is at least one - # terminal in the first set of this - # nonterminal then I don't have to keep - # looping through the symbols in this rule. - break - - return FirstInfo(firsts=firsts, is_epsilon=epsilons) - -@dataclasses.dataclass(frozen=True) -class FollowInfo: - follows: list[set[int]] - - @classmethod - def from_grammar( - cls, - grammar: list[list[typing.Tuple[int,...]]], - terminal: typing.Tuple[bool, ...], - start_symbol: int, - end_symbol: int, - firsts: FirstInfo, - ): - follows = [set() for _ in grammar] - follows[start_symbol].add(end_symbol) - - changed = True - while changed: - changed = False - for name, rules in enumerate(grammar): - for rule in rules: - epsilon = True - prev_symbol = None - for symbol in reversed(rule): - f = follows[symbol] - if terminal[symbol]: - # This particular rule can't produce epsilon. - epsilon = False - prev_symbol = symbol - continue - - # While epsilon is still set, update the follow of - # this nonterminal with the follow of the production - # we're processing. (This also means that the follow - # of the last symbol in the production is the follow - # of the entire production, as it should be.) - if epsilon: - changed = update_changed(f, follows[name]) or changed - - # If we're not at the end of the list then the follow - # of the current symbol contains the first of the - # next symbol. 
- if prev_symbol is not None: - changed = update_changed(f, firsts.firsts[prev_symbol]) or changed - - # Now if there's no epsilon in this symbol there's no - # more epsilon in the rest of the sequence. - if not firsts.is_epsilon[symbol]: - epsilon = False - - prev_symbol = symbol - - return FollowInfo(follows=follows) - - - -class GenerateSLR1(GenerateLR0): - """Generate parse tables for SLR1 grammars. - - SLR1 parsers can recognize more than LR0 parsers, because they have a - little bit more information: instead of generating reduce actions for a - production on all possible inputs, as LR0 parsers do, they generate - reduce actions only for inputs that are in the 'follow' set of the - non-terminal. - - That means SLR1 parsers need to know how to generate 'follow(A)', which - means they need to know how to generate 'first(A)', which is most of the - code in this class. - """ - _firsts: FirstInfo - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self._firsts = FirstInfo.from_grammar(self.grammar, self.terminal) - self._follows = FollowInfo.from_grammar( - self.grammar, - self.terminal, - self.start_symbol, - self.end_symbol, - self._firsts, - ) - - def gen_first(self, symbols: typing.Iterable[int]) -> typing.Tuple[set[int], bool]: - """Return the first set for a sequence of symbols. - - Build the set by combining the first sets of the symbols from left to - right as long as epsilon remains in the first set. If we reach the end - and every symbol has had epsilon, then this set also has epsilon. - - Otherwise we can stop as soon as we get to a non-epsilon first(), and - our result does not have epsilon. - """ - result = set() - for s in symbols: - result.update(self._firsts.firsts[s]) - if not self._firsts.is_epsilon[s]: - return (result, False) - - return (result, True) - - def gen_follow(self, symbol: int) -> set[int]: - """Generate the follow set for the given nonterminal. 
- - The follow set for a nonterminal is the set of terminals that can - follow the nonterminal in a valid sentence. The resulting set never - contains epsilon and is never empty, since we should always at least - ground out at '$', which is the end-of-stream marker. - """ - return self._follows.follows[symbol] - - def gen_reduce_set(self, config: Configuration) -> typing.Iterable[int]: - """Return the set of symbols that indicate we should reduce the given - config. - - In an SLR1 parser, this is the follow set of the config nonterminal.""" - return self.gen_follow(config.name) - - -class GenerateLR1(GenerateSLR1): - """Generate parse tables for LR1, or "canonical LR" grammars. - - LR1 parsers can recognize more than SLR parsers. Like SLR parsers, they - are choosier about when they reduce. But unlike SLR parsers, they specify - the terminals on which they reduce by carrying a 'lookahead' terminal in - the configuration. The lookahead of a configuration is computed as the - closure of a configuration set is computed, so see gen_closure_next for - details. (Except for the start configuration, which has '$' as its - lookahead.) - """ - def gen_reduce_set(self, config: Configuration) -> typing.Iterable[int]: - """Return the set of symbols that indicate we should reduce the given - config. - - In an LR1 parser, this is the lookahead of the configuration.""" - return config.lookahead - - @functools.cache - def gen_closure_next(self, config: Configuration): - """Return the next set of configurations in the closure for - config. - - In LR1 parsers, we must compute the lookahead for the configurations - we're adding to the closure. The lookahead for the new configurations - is the first() of the rest of this config's production. If that - contains epsilon, then the lookahead *also* contains the lookahead we - already have. 
(This lookahead was presumably generated by the same - process, so in some sense it is a 'parent' lookahead, or a lookahead - from an upstream production in the grammar.) - - (See the documentation in GenerateLR0 for more information on how - this function fits into the whole process.) - """ - config_next = config.next - if config_next is None: - return () - else: - next = [] - for rule in self.grammar[config_next]: - lookahead, epsilon = self.gen_first(config.rest) - if epsilon: - lookahead.update(config.lookahead) - lookahead = tuple(sorted(lookahead)) - next.append(Configuration.from_rule(config_next, rule, lookahead=lookahead)) - - return tuple(sorted(next)) - - def gen_all_sets(self): - """Generate all of the configuration sets for the grammar. - - In LR1 parsers, we must remember to set the lookahead of the start - symbol to '$'. - """ - seeds = tuple( - Configuration.from_rule(self.start_symbol, rule, lookahead=(self.end_symbol,)) - for rule in self.grammar[self.start_symbol] - ) - initial_set = self.gen_closure(seeds) - return self.gen_sets(initial_set) - - -class GenerateLALR(GenerateLR1): - """Generate tables for LALR. - - LALR is smaller than LR(1) but bigger than SLR(1). It works by generating - the LR(1) configuration sets, but merging configuration sets which are - equal in everything but their lookaheads. This works in that it doesn't - generate any shift/reduce conflicts that weren't already in the LR(1) - grammar. It can, however, introduce new reduce/reduce conflicts, because - it does lose information. The advantage is that the number of parser - states is much much smaller in LALR than in LR(1). - - (Note that because we use immutable state everywhere this generator does - a lot of copying and allocation.) - """ - def merge_sets(self, config_set_a, config_set_b): - """Merge the two config sets, by keeping the item cores but merging - the lookahead sets for each item. 
- """ - assert len(config_set_a) == len(config_set_b) - merged = [] - for index, a in enumerate(config_set_a): - b = config_set_b[index] - assert a.clear_lookahead() == b.clear_lookahead() - - new_lookahead = a.lookahead + b.lookahead - new_lookahead = tuple(sorted(set(new_lookahead))) - merged.append(a.clear_lookahead()) - - return tuple(merged) - - def sets_equal(self, a, b): - a_no_la = tuple(s.clear_lookahead() for s in a) - b_no_la = tuple(s.clear_lookahead() for s in b) - return a_no_la == b_no_la - - def gen_sets(self, config_set) -> ConfigurationSetInfo: - """Recursively generate all configuration sets starting from the - provided set, and merge them with the provided set 'F'. - - The difference between this method and the one in GenerateLR0, where - this comes from, is in the part that stops recursion. In LALR we - compare for set equality *ignoring lookahead*. If we find a match, - then instead of returning F unchanged, we merge the two equal sets - and replace the set in F, returning the modified set. - """ - F = {} - successors = [] - pending = [config_set] - while len(pending) > 0: - config_set = pending.pop() - config_set_no_la = tuple(s.clear_lookahead() for s in config_set) - - existing = F.get(config_set_no_la) - if existing is not None: - F[config_set_no_la] = self.merge_sets(config_set, existing) - else: - F[config_set_no_la] = config_set - for symbol, successor in self.gen_all_successors(config_set): - successor_no_la = tuple(s.clear_lookahead() for s in successor) - successors.append((config_set_no_la, symbol, successor_no_la)) - pending.append(successor) - - # Register all the actually merged, final config sets. - result = ConfigurationSetInfo() - for config_set in F.values(): - result.register_config_set(config_set) - - # Now record all the successors that we found. Of course, the actual - # sets that wound up in the ConfigurationSetInfo don't match anything - # we found during the previous phase. 
- # - # *Fortunately* we recorded the no-lookahead keys in the successors - # so we can find the final sets, then look them up in the registered - # sets, and actually register the successor. - for config_set_no_la, symbol, successor_no_la in successors: - actual_config_set = F[config_set_no_la] - from_index = result.config_set_key[actual_config_set] - - actual_successor = F[successor_no_la] - to_index = result.config_set_key[actual_successor] - - result.add_successor(from_index, symbol, to_index) - - return result - - def set_without_lookahead(self, config_set: ConfigSet) -> ConfigSet: - return tuple(sorted(set(c.clear_lookahead() for c in config_set))) - - -############################################################################### -# Formatting -############################################################################### -def format_node(node): - """Print out an indented concrete syntax tree, from parse().""" - lines = [ - '{name}'.format(name=node[0]) - ] + [ - ' ' + line - for child in node[1] - for line in format_node(child).split('\n') - ] - return '\n'.join(lines) - - -def format_table(generator, table): - """Format a parser table so pretty.""" - def format_action(state, terminal): - action = state.get(terminal, ('error',)) - if action[0] == 'accept': - return 'accept' - elif action[0] == 'shift': - return 's' + str(action[1]) - elif action[0] == 'error': - return '' - elif action[0] == 'reduce': - return 'r' + str(action[1]) - - terminals = list(sorted( - generator.alphabet[i] - for i,v in enumerate(generator.terminal) - if v - )) - nonterminals = list(sorted( - generator.alphabet[i] - for i,v in enumerate(generator.nonterminal) - if v - )) - header = " | {terms} | {nts}".format( - terms=' '.join( - '{0: <6}'.format(terminal) - for terminal in terminals - ), - nts=' '.join( - '{0: <5}'.format(nt) - for nt in nonterminals - ), - ) - - lines = [ - header, - '-' * len(header), - ] + [ - "{index: <3} | {actions} | {gotos}".format( - index=i, - actions=' 
'.join( - '{0: <6}'.format(format_action(row, terminal)) - for terminal in terminals - ), - gotos=' '.join( - '{0: <5}'.format(row.get(nt, ('error', ''))[1]) - for nt in nonterminals - ), - ) - for i, row in enumerate(table) - ] - return '\n'.join(lines) - - -############################################################################### -# Examples -############################################################################### -def examples(): - def dump_grammar(grammar): - for name, symbols in grammar: - print(f"{name} -> {symbols}") - print() - - # OK, this is a very simple LR0 grammar. - print("grammar_simple:") - grammar_simple = [ - ('E', ['E', '+', 'T']), - ('E', ['T']), - ('T', ['(', 'E', ')']), - ('T', ['id']), - ] - - gen = GenerateLR0('E', grammar_simple) - table = gen.gen_table() - print(format_table(gen, table)) - tree = parse(table, ['id', '+', '(', 'id', ')']) - print(format_node(tree) + "\n") - print() - - # This one doesn't work with LR0, though, it has a shift/reduce conflict. - print("grammar_lr0_shift_reduce (LR0):") - grammar_lr0_shift_reduce = grammar_simple + [ - ('T', ['id', '[', 'E', ']']), - ] - try: - gen = GenerateLR0('E', grammar_lr0_shift_reduce) - table = gen.gen_table() - assert False - except ValueError as e: - print(e) - print() - - # Nor does this: it has a reduce/reduce conflict. - print("grammar_lr0_reduce_reduce (LR0):") - grammar_lr0_reduce_reduce = grammar_simple + [ - ('E', ['V', '=', 'E']), - ('V', ['id']), - ] - try: - gen = GenerateLR0('E', grammar_lr0_reduce_reduce) - table = gen.gen_table() - assert False - except ValueError as e: - print(e) - print() - - # Nullable symbols just don't work with constructs like this, because you can't - # look ahead to figure out if you should reduce an empty 'F' or not. 
- print("grammar_nullable (LR0):") - grammar_nullable = [ - ('E', ['F', 'boop']), - ('F', ['beep']), - ('F', []), - ] - try: - gen = GenerateLR0('E', grammar_nullable) - table = gen.gen_table() - assert False - except ValueError as e: - print(e) - print() - - print("grammar_lr0_shift_reduce (SLR1):") - dump_grammar(grammar_lr0_shift_reduce) - gen = GenerateSLR1('E', grammar_lr0_shift_reduce) - first, epsilon=gen.gen_first((gen.symbol_key['E'],)) - print(f"First('E'): {str([gen.alphabet[f] for f in first])} (epsilon={epsilon})") - print(f"Follow('E'): {str([gen.alphabet[f] for f in gen.gen_follow(gen.symbol_key['E'])])}") - table = gen.gen_table() - print(format_table(gen, table)) - tree = parse(table, ['id', '+', '(', 'id', '[', 'id', ']', ')'], trace=True) - print(format_node(tree) + "\n") - print() - - # SLR1 can't handle this. - print("grammar_aho_ullman_1 (SLR1):") - grammar_aho_ullman_1 = [ - ('S', ['L', '=', 'R']), - ('S', ['R']), - ('L', ['*', 'R']), - ('L', ['id']), - ('R', ['L']), - ] - try: - gen = GenerateSLR1('S', grammar_aho_ullman_1) - table = gen.gen_table() - assert False - except ValueError as e: - print(e) - print() - - # Here's an example with a full LR1 grammar, though. - print("grammar_aho_ullman_2 (LR1):") - grammar_aho_ullman_2 = [ - ('S', ['X', 'X']), - ('X', ['a', 'X']), - ('X', ['b']), - ] - gen = GenerateLR1('S', grammar_aho_ullman_2) - table = gen.gen_table() - print(format_table(gen, table)) - parse(table, ['b', 'a', 'a', 'b'], trace=True) - print() - - # What happens if we do LALR to it? - print("grammar_aho_ullman_2 (LALR):") - gen = GenerateLALR('S', grammar_aho_ullman_2) - table = gen.gen_table() - print(format_table(gen, table)) - print() - - # A fun LALAR grammar. 
- print("grammar_lalr:") - grammar_lalr = [ - ('S', ['V', 'E']), - - ('E', ['F']), - ('E', ['E', '+', 'F']), - - ('F', ['V']), - ('F', ['int']), - ('F', ['(', 'E', ')']), - - ('V', ['id']), - ] - gen = GenerateLALR('S', grammar_lalr) - table = gen.gen_table() - print(format_table(gen, table)) - print() - -if __name__=="__main__": - examples() diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..7cf2884 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,13 @@ +[project] +name = "lrparsers" +description = "a small LR parser generator library" +authors = [ + {name = "John Doty", email = "john@d0ty.me"}, +] +classifiers = [ + "Private :: Do Not Upload", # Probably. + "License :: OSI Approved :: MIT License", +] + +[tool.black] +line-length=100 \ No newline at end of file