diff --git a/.gitignore b/.gitignore deleted file mode 100644 index ba0430d..0000000 --- a/.gitignore +++ /dev/null @@ -1 +0,0 @@ -__pycache__/ \ No newline at end of file diff --git a/LICENSE.md b/LICENSE.md index 50cd16d..9cecc1d 100644 --- a/LICENSE.md +++ b/LICENSE.md @@ -1,21 +1,674 @@ -MIT License + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 -Copyright (c) 2024 John Doty + Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/> + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: + Preamble -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. + The GNU General Public License is a free, copyleft license for +software and other kinds of works. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works.
By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. 
+ + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. 
+ + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. 
+ + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. 
The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. 
+ + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. 
+ + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. 
This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. 
For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. 
Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. 
+ + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. 
Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. 
+ + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. 
+ + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. 
The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + {one line to give the program's name and a brief idea of what it does.} + Copyright (C) {year} {name of author} + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + {project} Copyright (C) {year} {fullname} + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +<http://www.gnu.org/licenses/>. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +<http://www.gnu.org/philosophy/why-not-lgpl.html>.
diff --git a/README.md b/README.md index b449b8e..80c7dec 100644 --- a/README.md +++ b/README.md @@ -1,126 +1,18 @@ # A collection of LR parser generators, from LR0 through LALR. -This is a small helper library to generate LR parser tables. +One day I read a tweet, asking for a tool which accepted a grammar and an +input file and which then produced simple parsed output, without any kind of +in-between. (There was other ranty stuff about how none of the existing tools +really worked, but that was beside the point.) -The primary inspiration for this library is tree-sitter, which also generates -LR parsers for grammars written in a turing-complete language. Like that, we -write grammars in a language, only we do it in Python instead of JavaScript. +Upon reading the tweet, it occurred to me that I didn't know how LR parsers +worked and how they were generated, except in the broadest of terms. Thus, I +set about writing this, learning as I went. -Why Python? Because Python 3 is widely pre-installed on MacOS and Linux. This -library requires nothing more than the basic standard library, and not even a -new version of it. Therefore, it turns out to be a pretty light dependency for -a rust or C++ or something kind of project. (Tree-sitter, on the other hand, -requires node, which is a far less stable and available runtime in 2024.) - -The parser tables can really be used to power anything. I prefer to make -concrete syntax trees (again, see tree-sitter), and there is no facility at all -for actions or custom ASTs or whatnot. Any such processing needs to be done by -the thing that processes the tables. - -## Making Grammars - -To get started, create a grammar that derives from the `Grammar` class. Create -one method per nonterminal, decorated with the `rule` decorator.
Here's an -example: - - PLUS = Token('+') - LPAREN = Token('(') - RPAREN = Token(')') - ID = Token('id') - - class SimpleGrammar(Grammar): - @rule - def expression(self): - return seq(self.expression, PLUS, self.term) | self.term - - @rule - def term(self): - return seq(LPAREN, self.expression, RPAREN) | ID - - -## Using grammars - -TODO - -## Representation Choices - -The SimpleGrammar class might seem a little verbose compared to a dense -structure like: - - grammar_simple = [ - ('E', ['E', '+', 'T']), - ('E', ['T']), - ('T', ['(', 'E', ')']), - ('T', ['id']), - ] - -or - - grammar_simple = { - 'E': [ - ['E', '+', 'T'], - ['T'], - ], - 'T': [ - ['(', 'E', ')'], - ['id'], - ], - } - - -The advantage that the class has over a table like this is that you get to have -all of your Python tools help you make sure your grammar is good, if you want -them. e.g., if you're working with an LSP or something, the members give you -autocomplete and jump-to-definition and possibly even type-checking. - -At the very least, if you mis-type the name of a nonterminal, or forget to -implement it, we will immediately raise an error that *INCLUDES THE LOCATION IN -THE SOURCE WHERE THE ERROR WAS MADE.* With tables, we can tell you that you -made a mistake but it's up to you to figure out where you did it. - -### Aside: What about a custom DSL/EBNF like thing? - -Yeah, OK, there's a rich history of writing your grammar in a domain-specific -language. YACC did it, ANTLR does it, GRMTools.... just about everybody except -Tree-Sitter does this. - -But look, I've got several reasons for not doing it. - -First, I'm lazy, and don't want to write yet another parser for my parser. What -tools should I use to write my parser generator parser? I guess I don't have my -parser generator parser yet, so probably a hand-written top down parser? Some -other python parser generator? Ugh! 
- -As an add-on to that, if I make my own format then I need to make tooling for -*that* too: syntax highlighters, jump to definition, the works. Yuck. An -existing language, and a format that builds on an existing language, gets me the -tooling that comes along with that language. If you can leverage that -effictively (and I think I have) then you start way ahead in terms of tooling. - -Second, this whole thing is supposed to be easy to include in an existing -project, and adding a custom compiler doesn't seem to be that. Adding two python -files seems to be about the right speed. - -Thirdly, and this is just hypothetical, it's probably pretty easy to write your -own tooling around a grammar if it's already in Python. If you want to make -railroad diagrams or EBNF pictures or whatever, all the productions are already -right there in data structures for you to process. I've tried to keep them -accessible and at least somewhat easy to work with. There's nothing that says a -DSL-based system *has* to produce unusable intermediate data- certainly there -are some tools that *try*- but with this approach the accessibility and the -ergonomics of the tool go hand in hand. - -## Some History - -The first version of this code was written as an idle exercise to learn how LR -parser table generation even worked. It was... very simple, fairly easy to -follow, and just *incredibly* slow. Like, mind-bogglingly slow. Unusably slow -for anything but the most trivial grammar. - -As a result, when I decided I wanted to use it for a larger grammar, I found that -I just couldn't. So this has been hacked and significantly improved from that -version, now capable of building tables for nontrivial grammars. It could still -be a lot faster, but it meets my needs for now. +This code is not written to be fast, or even efficient, although it runs its +test cases fast enough. 
It was instead written to be easy to follow along +with, so that when I forget how all this works I can come back to the code +and read along and learn all over again. (BTW, the notes I read to learn how all this works are at http://dragonbook.stanford.edu/lecture-notes/Stanford-CS143/. Specifically, @@ -128,5 +20,7 @@ I started with handout 8, 'Bottom-up-parsing', and went from there. (I did eventually have to backtrack a little into handout 7, since that's where First() and Follow() are covered.) +Enjoy! + doty -May 2024 +2016-12-09 diff --git a/grammar.py b/grammar.py deleted file mode 100644 index c37405f..0000000 --- a/grammar.py +++ /dev/null @@ -1,389 +0,0 @@ -# This is an example grammar. -from parser import Assoc, Grammar, Nothing, Token, rule, seq - -ARROW = Token("Arrow") -AS = Token("As") -BAR = Token("Bar") -CLASS = Token("Class") -COLON = Token("Colon") -ELSE = Token("Else") -FOR = Token("For") -FUN = Token("Fun") -IDENTIFIER = Token("Identifier") -IF = Token("If") -IMPORT = Token("Import") -IN = Token("In") -LCURLY = Token("LeftBrace") -LET = Token("Let") -RCURLY = Token("RightBrace") -RETURN = Token("Return") -SEMICOLON = Token("Semicolon") -STRING = Token("String") -WHILE = Token("While") -EQUAL = Token("Equal") -LPAREN = Token("LeftParen") -RPAREN = Token("RightParen") -COMMA = Token("Comma") -SELF = Token("Selff") -OR = Token("Or") -IS = Token("Is") -AND = Token("And") -EQUALEQUAL = Token("EqualEqual") -BANGEQUAL = Token("BangEqual") -LESS = Token("Less") -GREATER = Token("Greater") -LESSEQUAL = Token("LessEqual") -GREATEREQUAL = Token("GreaterEqual") -PLUS = Token("Plus") -MINUS = Token("Minus") -STAR = Token("Star") -SLASH = Token("Slash") -NUMBER = Token("Number") -TRUE = Token("True") -FALSE = Token("False") -BANG = Token("Bang") -DOT = Token("Dot") -MATCH = Token("Match") -EXPORT = Token("Export") -UNDERSCORE = Token("Underscore") -NEW = Token("New") -LSQUARE = Token("LeftBracket") -RSQUARE = Token("RightBracket") - - -class 
FineGrammar(Grammar): - def __init__(self): - super().__init__( - precedence=[ - (Assoc.RIGHT, [EQUAL]), - (Assoc.LEFT, [OR]), - (Assoc.LEFT, [IS]), - (Assoc.LEFT, [AND]), - (Assoc.LEFT, [EQUALEQUAL, BANGEQUAL]), - (Assoc.LEFT, [LESS, GREATER, GREATEREQUAL, LESSEQUAL]), - (Assoc.LEFT, [PLUS, MINUS]), - (Assoc.LEFT, [STAR, SLASH]), - (Assoc.LEFT, [self.primary_expression]), - (Assoc.LEFT, [LPAREN]), - (Assoc.LEFT, [DOT]), - # - # If there's a confusion about whether to make an IF - # statement or an expression, prefer the statement. - # - (Assoc.NONE, [self.if_statement]), - ] - ) - - @rule - def file(self): - return self.file_statement_list - - @rule - def file_statement_list(self): - return self.file_statement | (self.file_statement_list + self.file_statement) - - @rule - def file_statement(self): - return ( - self.import_statement | self.class_declaration | self.export_statement | self.statement - ) - - @rule - def import_statement(self): - return seq(IMPORT, STRING, AS, IDENTIFIER, SEMICOLON) - - @rule - def class_declaration(self): - return seq(CLASS, IDENTIFIER, self.class_body) - - @rule - def class_body(self): - return seq(LCURLY, RCURLY) | seq(LCURLY, self.class_members, RCURLY) - - @rule - def class_members(self): - return self.class_member | seq(self.class_members, self.class_member) - - @rule - def class_member(self): - return self.field_declaration | self.function_declaration - - @rule - def field_declaration(self): - return seq(IDENTIFIER, COLON, self.type_expression, SEMICOLON) - - # Types - @rule - def type_expression(self): - return self.alternate_type | self.type_identifier - - @rule - def alternate_type(self): - return seq(self.type_expression, BAR, self.type_identifier) - - @rule - def type_identifier(self): - return IDENTIFIER - - @rule - def export_statement(self): - return ( - seq(EXPORT, self.class_declaration) - | seq(EXPORT, self.function_declaration) - | seq(EXPORT, self.let_statement) - | seq(EXPORT, self.export_list, SEMICOLON) - ) - - 
@rule - def export_list(self): - return Nothing | IDENTIFIER | seq(IDENTIFIER, COMMA, self.export_list) - - # Functions - @rule - def function_declaration(self): - return seq(FUN, IDENTIFIER, self.function_parameters, self.block) | seq( - FUN, IDENTIFIER, self.function_parameters, ARROW, self.type_expression, self.block - ) - - @rule - def function_parameters(self): - return ( - seq(LPAREN, RPAREN) - | seq(LPAREN, self.first_parameter, RPAREN) - | seq(LPAREN, self.first_parameter, COMMA, self.parameter_list, RPAREN) - ) - - @rule - def first_parameter(self): - return SELF | self.parameter - - @rule - def parameter_list(self): - return Nothing | self.parameter | seq(self.parameter, COMMA, self.parameter_list) - - @rule - def parameter(self): - return seq(IDENTIFIER, COLON, self.type_expression) - - # Block - @rule - def block(self): - return ( - seq(LCURLY, RCURLY) - | seq(LCURLY, self.statement_list, RCURLY) - | seq(LCURLY, self.statement_list, self.expression, RCURLY) - ) - - @rule - def statement_list(self): - return self.statement | seq(self.statement_list, self.statement) - - @rule - def statement(self): - return ( - self.function_declaration - | self.let_statement - | self.return_statement - | self.for_statement - | self.if_statement - | self.while_statement - | self.expression_statement - ) - - @rule - def let_statement(self): - return seq(LET, IDENTIFIER, EQUAL, self.expression, SEMICOLON) - - @rule - def return_statement(self): - return seq(RETURN, self.expression, SEMICOLON) - - @rule - def for_statement(self): - return seq(FOR, self.iterator_variable, IN, self.expression, self.block) - - @rule - def iterator_variable(self): - return IDENTIFIER - - @rule - def if_statement(self): - return self.conditional_expression - - @rule - def while_statement(self): - return seq(WHILE, self.expression, self.block) - - @rule - def expression_statement(self): - return seq(self.expression, SEMICOLON) - - # Expressions - @rule - def expression(self): - return 
self.assignment_expression - - @rule - def assignment_expression(self): - return seq(self.or_expression, EQUAL, self.assignment_expression) | self.or_expression - - @rule - def or_expression(self): - return seq(self.or_expression, OR, self.is_expression) | self.is_expression - - @rule - def is_expression(self): - return seq(self.is_expression, IS, self.pattern) | self.and_expression - - @rule - def and_expression(self): - return seq(self.and_expression, AND, self.equality_expression) | self.equality_expression - - @rule - def equality_expression(self): - return ( - seq(self.equality_expression, EQUALEQUAL, self.relation_expression) - | seq(self.equality_expression, BANGEQUAL, self.relation_expression) - | self.relation_expression - ) - - @rule - def relation_expression(self): - return ( - seq(self.relation_expression, LESS, self.additive_expression) - | seq(self.relation_expression, LESSEQUAL, self.additive_expression) - | seq(self.relation_expression, GREATER, self.additive_expression) - | seq(self.relation_expression, GREATEREQUAL, self.additive_expression) - ) - - @rule - def additive_expression(self): - return ( - seq(self.additive_expression, PLUS, self.multiplication_expression) - | seq(self.additive_expression, MINUS, self.multiplication_expression) - | self.multiplication_expression - ) - - @rule - def multiplication_expression(self): - return ( - seq(self.multiplication_expression, STAR, self.primary_expression) - | seq(self.multiplication_expression, SLASH, self.primary_expression) - | self.primary_expression - ) - - @rule - def primary_expression(self): - return ( - IDENTIFIER - | SELF - | NUMBER - | STRING - | TRUE - | FALSE - | seq(BANG, self.primary_expression) - | seq(MINUS, self.primary_expression) - | self.block - | self.conditional_expression - | self.list_constructor_expression - | self.object_constructor_expression - | self.match_expression - | seq(self.primary_expression, LPAREN, self.expression_list, RPAREN) - | seq(self.primary_expression, 
DOT, IDENTIFIER) - | seq(LPAREN, self.expression, RPAREN) - ) - - @rule - def conditional_expression(self): - return ( - seq(IF, self.expression, self.block) - | seq(IF, self.expression, self.block, ELSE, self.conditional_expression) - | seq(IF, self.expression, self.block, ELSE, self.block) - ) - - @rule - def list_constructor_expression(self): - return seq(LSQUARE, RSQUARE) | seq(LSQUARE, self.expression_list, RSQUARE) - - @rule - def expression_list(self): - return ( - self.expression - | seq(self.expression, COMMA) - | seq(self.expression, COMMA, self.expression_list) - ) - - @rule - def match_expression(self): - return seq(MATCH, self.match_body) - - @rule - def match_body(self): - return seq(LCURLY, RCURLY) | seq(LCURLY, self.match_arms, RCURLY) - - @rule - def match_arms(self): - return ( - self.match_arm - | seq(self.match_arm, COMMA) - | seq(self.match_arm, COMMA, self.match_arms) - ) - - @rule - def match_arm(self): - return seq(self.pattern, ARROW, self.expression) - - @rule - def pattern(self): - return ( - seq(self.variable_binding, self.pattern_core, AND, self.and_expression) - | seq(self.variable_binding, self.pattern_core) - | seq(self.pattern_core, AND, self.and_expression) - | self.pattern_core - ) - - @rule - def pattern_core(self): - return self.type_expression | self.wildcard_pattern - - @rule - def wildcard_pattern(self): - return UNDERSCORE - - @rule - def variable_binding(self): - return seq(IDENTIFIER, COLON) - - @rule - def object_constructor_expression(self): - return seq(NEW, self.type_identifier, self.field_list) - - @rule - def field_list(self): - return seq(LCURLY, RCURLY) | seq(LCURLY, self.field_values, RCURLY) - - @rule - def field_values(self): - return ( - self.field_value - | seq(self.field_value, COMMA) - | seq(self.field_value, COMMA, self.field_values) - ) - - @rule - def field_value(self): - return IDENTIFIER | seq(IDENTIFIER, COLON, self.expression) - - -grammar = FineGrammar() -table = grammar.build_table(start="file") - 
-print(f"{len(table)} states") - -average_entries = sum(len(row) for row in table) / len(table) -max_entries = max(len(row) for row in table) -print(f"{average_entries} average, {max_entries} max") - -# print(parser_faster.format_table(gen, table)) -# print() -# tree = parse(table, ["id", "+", "(", "id", "[", "id", "]", ")"]) diff --git a/historical/parser.py b/historical/parser.py deleted file mode 100644 index 17101bd..0000000 --- a/historical/parser.py +++ /dev/null @@ -1,853 +0,0 @@ -"""A collection of LR parser generators, from LR0 through LALR. - -One day I read a tweet, asking for a tool which accepted a grammar and an -input file and which then produced simple parsed output, without any kind of -in-between. (There was other ranty stuff about how none of the existing tools -really worked, but that was beside the point.) - -Upon reading the tweet, it occured to me that I didn't know how LR parsers -worked and how they were generated, except in the broadest of terms. Thus, I -set about writing this, learning as I went. - -This code is not written to be fast, or even efficient, although it runs its -test cases fast enough. It was instead written to be easy to follow along -with, so that when I forget how all this works I can come back to the code -and read along and learn all over again. - -(BTW, the notes I read to learn how all this works are at -http://dragonbook.stanford.edu/lecture-notes/Stanford-CS143/. Specifically, -I started with handout 8, 'Bottom-up-parsing', and went from there. (I did -eventually have to backtrack a little into handout 7, since that's where -First() and Follow() are covered.) - -Enjoy! - -doty -2016-12-09 -""" - -from collections import namedtuple - - -############################################################################### -# LR0 -# -# We start with LR0 parsers, because they form the basis of everything else. 
-############################################################################### -class Configuration(namedtuple("Configuration", ["name", "symbols", "position", "lookahead"])): - """A rule being tracked in a state. - - (Note: technically, lookahead isn't used until we get to LR(1) parsers, - but if left at its default it's harmless. Ignore it until you get to - the part about LR(1).) - """ - - __slots__ = () - - @classmethod - def from_rule(cls, rule, lookahead=()): - return Configuration( - name=rule[0], - symbols=rule[1], - position=0, - lookahead=lookahead, - ) - - @property - def at_end(self): - return self.position == len(self.symbols) - - @property - def next(self): - return self.symbols[self.position] if not self.at_end else None - - @property - def rest(self): - return self.symbols[(self.position + 1) :] - - def at_symbol(self, symbol): - return self.next == symbol - - def replace(self, **kwargs): - return self._replace(**kwargs) - - def __str__(self): - la = ", " + str(self.lookahead) if self.lookahead != () else "" - return "{name} -> {bits}{lookahead}".format( - name=self.name, - bits=" ".join( - ["* " + sym if i == self.position else sym for i, sym in enumerate(self.symbols)] - ) - + (" *" if self.at_end else ""), - lookahead=la, - ) - - -class GenerateLR0(object): - """Generate parser tables for an LR0 parser. - - The input grammars are of the form: - - grammar_simple = [ - ('E', ['E', '+', 'T']), - ('E', ['T']), - ('T', ['(', 'E', ')']), - ('T', ['id']), - ] - - Which is to say, they are a list of productions. Each production is a - tuple where the first element of the tuple is the name of the - non-terminal being added, and the second elment of the tuple is the - list of terminals and non-terminals that make up the production. - - There is currently no support for custom actions or alternation or - anything like that. If you want alternations that you'll have to lower - the grammar by hand into the simpler form first. 
- - Don't name anything with double-underscores; those are reserved for - the generator. Don't add '$' either, as it is reserved to mean - end-of-stream. Use an empty list to indicate nullability, that is: - - ('O', []), - - means that O can be matched with nothing. - - Implementation notes: - - This is implemented in the dumbest way possible, in order to be the - most understandable it can be. I built this to learn, and I want to - make sure I can keep learning with it. - - - We tend to use tuples everywhere. This is because tuples can be - compared for equality and put into tables and all that jazz. They might - be a little bit slower in places but like I said, this is for - learning. (Also, if we need this to run faster we can probably go a - long way by memoizing results, which is much easier if we have tuples - everywhere.) - """ - - def __init__(self, start, grammar): - """Initialize the parser generator with the specified grammar and - start symbol. - """ - # We always store the "augmented" grammar, which contains an initial - # production for the start state. grammar[0] is always the start - # rule, and in the set of states and table and whatever the first - # element is always the starting state/position. - self.grammar = [("__start", [start])] + grammar - self.nonterminals = {rule[0] for rule in grammar} - self.terminals = { - sym for name, symbols in grammar for sym in symbols if sym not in self.nonterminals - } - self.alphabet = self.terminals | self.nonterminals - - # Check to make sure they didn't use anything that will give us - # heartburn later. 
- reserved = [a for a in self.alphabet if a.startswith("__") or a == "$"] - if reserved: - raise ValueError( - "Can't use {symbols} in grammars, {what} reserved.".format( - symbols=" or ".join(reserved), - what="it's" if len(reserved) == 1 else "they're", - ) - ) - - self.terminals.add("$") - self.alphabet.add("$") - - def gen_closure_next(self, config): - """Return the next set of configurations in the closure for - config. - - If the position for config is just before a non-terminal, then the - next set of configurations is configurations for all of the - productions for that non-terminal, with the position at the - beginning. (If the position for config is just before a terminal, - or at the end of the production, then the next set is empty.) - """ - if config.at_end: - return () - else: - return tuple( - Configuration.from_rule(rule) for rule in self.grammar if rule[0] == config.next - ) - - def gen_closure(self, config, closure): - """Compute the closure for the specified config and unify it with the - existing closure. - - If the provided config is already in the closure then nothing is - done. (We assume that the closure of the config is *also* already in - the closure.) - """ - if config in closure: - return closure - else: - new_closure = tuple(closure) + (config,) - for next_config in self.gen_closure_next(config): - new_closure = self.gen_closure(next_config, new_closure) - return new_closure - - def gen_successor(self, config_set, symbol): - """Compute the successor state for the given config set and the - given symbol. - - The successor represents the next state of the parser after seeing - the symbol. 
- """ - seeds = [ - config.replace(position=config.position + 1) - for config in config_set - if config.at_symbol(symbol) - ] - - closure = () - for seed in seeds: - closure = self.gen_closure(seed, closure) - - return closure - - def gen_all_successors(self, config_set): - """Return all of the non-empty successors for the given config set.""" - next = [] - for symbol in self.alphabet: - successor = self.gen_successor(config_set, symbol) - if len(successor) > 0: - next.append(successor) - - return tuple(next) - - def gen_sets(self, config_set, F): - """Recursively generate all configuration sets starting from the - provided set, and merge them with the provided set 'F'. - """ - if config_set in F: - return F - else: - new_F = F + (config_set,) - for successor in self.gen_all_successors(config_set): - new_F = self.gen_sets(successor, new_F) - - return new_F - - def gen_all_sets(self): - """Generate all of the configuration sets for the grammar.""" - initial_set = self.gen_closure( - Configuration.from_rule(self.grammar[0]), - (), - ) - return self.gen_sets(initial_set, ()) - - def find_set_index(self, sets, set): - """Find the specified set in the set of sets, and return the - index, or None if it is not found. - """ - for i, s in enumerate(sets): - if s == set: - return i - return None - - def gen_reduce_set(self, config): - """Return the set of symbols that indicate we should reduce the given - configuration. - - In an LR0 parser, this is just the set of all terminals.""" - return self.terminals - - def gen_table(self): - """Generate the parse table. - - The parse table is a list of states. The first state in the list is - the starting state. Each state is a dictionary that maps a symbol to an - action. Each action is a tuple. The first element of the tuple is a - string describing what to do: - - - 'shift': The second element of the tuple is the state - number. Consume the input and push that state onto the stack. 
- - - 'reduce': The second element is the name of the non-terminal being - reduced, and the third element is the number of states to remove - from the stack. Don't consume the input; just remove the specified - number of things from the stack, and then consult the table again, - this time using the new top-of-stack as the current state and the - name of the non-terminal to find out what to do. - - - 'goto': The second element is the state number to push onto the - stack. In the literature, these entries are treated distinctly from - the actions, but we mix them here because they never overlap with the - other actions. (These are always associated with non-terminals, and - the other actions are always associated with terminals.) - - - 'accept': Accept the result of the parse, it worked. - - Anything missing from the row indicates an error. - """ - action_table = [] - config_sets = self.gen_all_sets() - for config_set in config_sets: - actions = {} - - # Actions - for config in config_set: - if config.at_end: - if config.name != "__start": - for a in self.gen_reduce_set(config): - self.set_table_action( - actions, - a, - ("reduce", config.name, len(config.symbols)), - config, - ) - else: - self.set_table_action( - actions, - "$", - ("accept",), - config, - ) - - else: - if config.next in self.terminals: - successor = self.gen_successor(config_set, config.next) - index = self.find_set_index(config_sets, successor) - self.set_table_action( - actions, - config.next, - ("shift", index), - config, - ) - - # Gotos - for symbol in self.nonterminals: - successor = self.gen_successor(config_set, symbol) - index = self.find_set_index(config_sets, successor) - if index is not None: - self.set_table_action( - actions, - symbol, - ("goto", index), - None, - ) - - # set_table_action stores the configs that generated the actions in - # the table, for diagnostic purposes. This filters them out again - # so that the parser has something clean to work with. 
- actions = {k: self.get_table_action(actions, k) for k in actions} - action_table.append(actions) - - return action_table - - def set_table_action(self, row, symbol, action, config): - """Set the action for 'symbol' in the table row to 'action'. - - This is destructive; it changes the table. It raises an error if - there is already an action for the symbol in the row. - """ - existing, existing_config = row.get(symbol, (None, None)) - if existing is not None and existing != action: - config_old = str(existing_config) - config_new = str(config) - max_len = max(len(config_old), len(config_new)) + 1 - error = ( - "Conflicting actions for token '{symbol}':\n" - " {config_old: <{max_len}}: {old}\n" - " {config_new: <{max_len}}: {new}\n".format( - config_old=config_old, - config_new=config_new, - max_len=max_len, - old=existing, - new=action, - symbol=symbol, - ) - ) - raise ValueError(error) - row[symbol] = (action, config) - - def get_table_action(self, row, symbol): - return row[symbol][0] - - -def parse(table, input, trace=False): - """Parse the input with the generated parsing table and return the - concrete syntax tree. - - The parsing table can be generated by GenerateLR0.gen_table() or by any - of the other generators below. The parsing mechanism never changes, only - the table generation mechanism. - - input is a list of tokens. Don't stick an end-of-stream marker, I'll stick - one on for you. - """ - assert "$" not in input - input = input + ["$"] - input_index = 0 - - # Our stack is a stack of tuples, where the first entry is the state number - # and the second entry is the 'value' that was generated when the state was - # pushed. 
- stack = [(0, None)] - while True: - current_state = stack[-1][0] - current_token = input[input_index] - - action = table[current_state].get(current_token, ("error",)) - if trace: - print( - "{stack: <20} {input: <50} {action: <5}".format( - stack=repr([s[0] for s in stack]), - input=repr(input[input_index:]), - action=repr(action), - ) - ) - - if action[0] == "accept": - return stack[-1][1] - - elif action[0] == "reduce": - name = action[1] - size = action[2] - - value = (name, tuple(s[1] for s in stack[-size:])) - stack = stack[:-size] - - goto = table[stack[-1][0]].get(name, ("error",)) - assert goto[0] == "goto" # Corrupt table? - stack.append((goto[1], value)) - - elif action[0] == "shift": - stack.append((action[1], (current_token, ()))) - input_index += 1 - - elif action[0] == "error": - raise ValueError( - "Syntax error: unexpected symbol {sym}".format( - sym=current_token, - ), - ) - - -############################################################################### -# SLR(1) -############################################################################### -class GenerateSLR1(GenerateLR0): - """Generate parse tables for SLR1 grammars. - - SLR1 parsers can recognize more than LR0 parsers, because they have a - little bit more information: instead of generating reduce actions for a - production on all possible inputs, as LR0 parsers do, they generate - reduce actions only for inputs that are in the 'follow' set of the - non-terminal. - - That means SLR1 parsers need to know how to generate 'follow(A)', which - means they need to know how to generate 'first(A)', which is most of the - code in this class. - """ - - def gen_first_symbol(self, symbol, visited): - """Compute the first set for a single symbol. - - If a symbol can be empty, then the set contains epsilon, which we - represent as python's `None`. - - The first set is the set of tokens that can appear as the first token - for a given symbol. 
(Obviously, if the symbol is itself a token, then - this is trivial.) - - 'visited' is a set of already visited symbols, to stop infinite - recursion on left-recursive grammars. That means that sometimes this - function can return an empty tuple. Don't confuse that with a tuple - containing epsilon: that's a tuple containing `None`, not an empty - tuple. - """ - if symbol in self.terminals: - return (symbol,) - elif symbol in visited: - return () - else: - assert symbol in self.nonterminals - visited.add(symbol) - - # All the firsts from all the productions. - firsts = [ - self.gen_first(rule[1], visited) for rule in self.grammar if rule[0] == symbol - ] - - result = () - for fs in firsts: - result = result + tuple(f for f in fs if f not in result) - - return tuple(sorted(result)) - - def gen_first(self, symbols, visited=None): - """Compute the first set for a sequence of symbols. - - The first set is the set of tokens that can appear as the first token - for this sequence of symbols. The interesting wrinkle in computing the - first set for a sequence of symbols is that we keep computing the first - sets so long as epsilon appears in the set. i.e., if we are computing - for ['A', 'B', 'C'] and the first set of 'A' contains epsilon, then the - first set for the *sequence* also contains the first set of ['B', 'C'], - since 'A' could be missing entirely. - - An epsilon in the result is indicated by 'None'. There will always be - at least one element in the result. - - The 'visited' parameter, if not None, is a set of symbols that are - already in the process of being evaluated, to deal with left-recursive - grammars. (See gen_first_symbol for more.) - """ - if len(symbols) == 0: - return (None,) # Epsilon. 
- else: - if visited is None: - visited = set() - result = self.gen_first_symbol(symbols[0], visited) - if None in result: - result = tuple(s for s in result if s is not None) - result = result + self.gen_first(symbols[1:], visited) - result = tuple(sorted(set(result))) - return result - - def gen_follow(self, symbol, visited=None): - """Generate the follow set for the given nonterminal. - - The follow set for a nonterminal is the set of terminals that can - follow the nonterminal in a valid sentence. The resulting set never - contains epsilon and is never empty, since we should always at least - ground out at '$', which is the end-of-stream marker. - """ - if symbol == "__start": - return tuple("$") - - assert symbol in self.nonterminals - - # Deal with left-recursion. - if visited is None: - visited = set() - if symbol in visited: - return () - visited.add(symbol) - - follow = () - for production in self.grammar: - for index, prod_symbol in enumerate(production[1]): - if prod_symbol != symbol: - continue - - first = self.gen_first(production[1][index + 1 :]) - follow = follow + tuple(f for f in first if f is not None) - if None in first: - follow = follow + self.gen_follow(production[0], visited) - - assert None not in follow # Should always ground out at __start - return follow - - def gen_reduce_set(self, config): - """Return the set of symbols that indicate we should reduce the given - config. - - In an SLR1 parser, this is the follow set of the config nonterminal.""" - return self.gen_follow(config.name) - - -class GenerateLR1(GenerateSLR1): - """Generate parse tables for LR1, or "canonical LR" grammars. - - LR1 parsers can recognize more than SLR parsers. Like SLR parsers, they - are choosier about when they reduce. But unlike SLR parsers, they specify - the terminals on which they reduce by carrying a 'lookahead' terminal in - the configuration. 
The lookahead of a configuration is computed as the - closure of a configuration set is computed, so see gen_closure_next for - details. (Except for the start configuration, which has '$' as its - lookahead.) - """ - - def gen_reduce_set(self, config): - """Return the set of symbols that indicate we should reduce the given - config. - - In an LR1 parser, this is the lookahead of the configuration.""" - return config.lookahead - - def gen_closure_next(self, config): - """Return the next set of configurations in the closure for - config. - - In LR1 parsers, we must compute the lookahead for the configurations - we're adding to the closure. The lookahead for the new configurations - is the first() of the rest of this config's production. If that - contains epsilon, then the lookahead *also* contains the lookahead we - already have. (This lookahead was presumably generated by the same - process, so in some sense it is a 'parent' lookahead, or a lookahead - from an upstream production in the grammar.) - - (See the documentation in GenerateLR0 for more information on how - this function fits into the whole process.) - """ - if config.at_end: - return () - else: - next = [] - for rule in self.grammar: - if rule[0] != config.next: - continue - - # N.B.: We can't just append config.lookahead to config.rest - # and compute first(), because lookahead is a *set*. So - # in this case we just say if 'first' contains epsilon, - # then we need to remove the epsilon and union with the - # existing lookahead. - lookahead = self.gen_first(config.rest) - if None in lookahead: - lookahead = tuple(l for l in lookahead if l is not None) - lookahead = lookahead + config.lookahead - lookahead = tuple(sorted(set(lookahead))) - next.append(Configuration.from_rule(rule, lookahead=lookahead)) - - return tuple(next) - - def gen_all_sets(self): - """Generate all of the configuration sets for the grammar. - - In LR1 parsers, we must remember to set the lookahead of the start - symbol to '$'. 
- """ - initial_set = self.gen_closure( - Configuration.from_rule(self.grammar[0], lookahead=("$",)), - (), - ) - return self.gen_sets(initial_set, ()) - - -class GenerateLALR(GenerateLR1): - """Generate tables for LALR. - - LALR is smaller than LR(1) but bigger than SLR(1). It works by generating - the LR(1) configuration sets, but merging configuration sets which are - equal in everything but their lookaheads. This works in that it doesn't - generate any shift/reduce conflicts that weren't already in the LR(1) - grammar. It can, however, introduce new reduce/reduce conflicts, because - it does lose information. The advantage is that the number of parser - states is much much smaller in LALR than in LR(1). - - (Note that because we use immutable state everywhere this generator does - a lot of copying and allocation.) - """ - - def merge_sets(self, config_set_a, config_set_b): - """Merge the two config sets, by keeping the item cores but merging - the lookahead sets for each item. - """ - assert len(config_set_a) == len(config_set_b) - merged = [] - for index, a in enumerate(config_set_a): - b = config_set_b[index] - assert a.replace(lookahead=()) == b.replace(lookahead=()) - - new_lookahead = a.lookahead + b.lookahead - new_lookahead = tuple(sorted(set(new_lookahead))) - merged.append(a.replace(lookahead=new_lookahead)) - - return tuple(merged) - - def sets_equal(self, a, b): - a_no_la = tuple(s.replace(lookahead=()) for s in a) - b_no_la = tuple(s.replace(lookahead=()) for s in b) - return a_no_la == b_no_la - - def gen_sets(self, config_set, F): - """Recursively generate all configuration sets starting from the - provided set, and merge them with the provided set 'F'. - - The difference between this method and the one in GenerateLR0, where - this comes from, is in the part that stops recursion. In LALR we - compare for set equality *ignoring lookahead*. 
If we find a match, - then instead of returning F unchanged, we merge the two equal sets - and replace the set in F, returning the modified set. - """ - config_set_no_la = tuple(s.replace(lookahead=()) for s in config_set) - for index, existing in enumerate(F): - existing_no_la = tuple(s.replace(lookahead=()) for s in existing) - if config_set_no_la == existing_no_la: - merged_set = self.merge_sets(config_set, existing) - return F[:index] + (merged_set,) + F[index + 1 :] - - # No merge candidate found, proceed. - new_F = F + (config_set,) - for successor in self.gen_all_successors(config_set): - new_F = self.gen_sets(successor, new_F) - - return new_F - - def find_set_index(self, sets, set): - """Find the specified set in the set of sets, and return the - index, or None if it is not found. - """ - for i, s in enumerate(sets): - if self.sets_equal(s, set): - return i - return None - - -############################################################################### -# Formatting -############################################################################### -def format_node(node): - """Print out an indented concrete syntax tree, from parse().""" - lines = ["{name}".format(name=node[0])] + [ - " " + line for child in node[1] for line in format_node(child).split("\n") - ] - return "\n".join(lines) - - -def format_table(generator, table): - """Format a parser table so pretty.""" - - def format_action(state, terminal): - action = state.get(terminal, ("error",)) - if action[0] == "accept": - return "accept" - elif action[0] == "shift": - return "s" + str(action[1]) - elif action[0] == "error": - return "" - elif action[0] == "reduce": - return "r" + str(action[1]) - - header = " | {terms} | {nts}".format( - terms=" ".join("{0: <6}".format(terminal) for terminal in sorted(generator.terminals)), - nts=" ".join("{0: <5}".format(nt) for nt in sorted(generator.nonterminals)), - ) - - lines = [ - header, - "-" * len(header), - ] + [ - "{index: <3} | {actions} | 
{gotos}".format( - index=i, - actions=" ".join( - "{0: <6}".format(format_action(row, terminal)) - for terminal in sorted(generator.terminals) - ), - gotos=" ".join( - "{0: <5}".format(row.get(nt, ("error", ""))[1]) - for nt in sorted(generator.nonterminals) - ), - ) - for i, row in enumerate(table) - ] - return "\n".join(lines) - - -############################################################################### -# Examples -############################################################################### -# OK, this is a very simple LR0 grammar. -grammar_simple = [ - ("E", ["E", "+", "T"]), - ("E", ["T"]), - ("T", ["(", "E", ")"]), - ("T", ["id"]), -] - -gen = GenerateLR0("E", grammar_simple) -table = gen.gen_table() -tree = parse(table, ["id", "+", "(", "id", ")"]) -print(format_node(tree) + "\n") -print() - -# This one doesn't work with LR0, though, it has a shift/reduce conflict. -grammar_lr0_shift_reduce = grammar_simple + [ - ("T", ["id", "[", "E", "]"]), -] -try: - gen = GenerateLR0("E", grammar_lr0_shift_reduce) - table = gen.gen_table() - assert False -except ValueError as e: - print(e) -print() - -# Nor does this: it has a reduce/reduce conflict. -grammar_lr0_reduce_reduce = grammar_simple + [ - ("E", ["V", "=", "E"]), - ("V", ["id"]), -] -try: - gen = GenerateLR0("E", grammar_lr0_reduce_reduce) - table = gen.gen_table() - assert False -except ValueError as e: - print(e) -print() - -# Nullable symbols just don't work with constructs like this, because you can't -# look ahead to figure out if you should reduce an empty 'F' or not. 
-grammar_nullable = [ - ("E", ["F", "boop"]), - ("F", ["beep"]), - ("F", []), -] -try: - gen = GenerateLR0("E", grammar_nullable) - table = gen.gen_table() - assert False -except ValueError as e: - print(e) - -gen = GenerateSLR1("E", grammar_lr0_shift_reduce) -print("First: {first}".format(first=str(gen.gen_first(["E"])))) -print("Follow: {follow}".format(follow=str(gen.gen_follow("E")))) -table = gen.gen_table() -print(format_table(gen, table)) -tree = parse(table, ["id", "+", "(", "id", "[", "id", "]", ")"]) -print(format_node(tree) + "\n") -print() - -# SLR1 can't handle this. -grammar_aho_ullman_1 = [ - ("S", ["L", "=", "R"]), - ("S", ["R"]), - ("L", ["*", "R"]), - ("L", ["id"]), - ("R", ["L"]), -] -try: - gen = GenerateSLR1("S", grammar_aho_ullman_1) - table = gen.gen_table() - assert False -except ValueError as e: - print(e) -print() - -# Here's an example with a full LR1 grammar, though. -grammar_aho_ullman_2 = [ - ("S", ["X", "X"]), - ("X", ["a", "X"]), - ("X", ["b"]), -] -gen = GenerateLR1("S", grammar_aho_ullman_2) -table = gen.gen_table() -print(format_table(gen, table)) -parse(table, ["b", "a", "a", "b"], trace=True) -print() - -# What happens if we do LALR to it? -gen = GenerateLALR("S", grammar_aho_ullman_2) -table = gen.gen_table() -print(format_table(gen, table)) -print() - -# A fun LALAR grammar. -grammar_lalr = [ - ("S", ["V", "E"]), - ("E", ["F"]), - ("E", ["E", "+", "F"]), - ("F", ["V"]), - ("F", ["int"]), - ("F", ["(", "E", ")"]), - ("V", ["id"]), -] -gen = GenerateLALR("S", grammar_lalr) -table = gen.gen_table() -print(format_table(gen, table)) -print() diff --git a/parser.py b/parser.py index 8091fb7..656ef09 100644 --- a/parser.py +++ b/parser.py @@ -1,124 +1,18 @@ -"""This is a small helper library to generate LR parser tables. +"""A collection of LR parser generators, from LR0 through LALR. -The primary inspiration for this library is tree-sitter, which also generates -LR parsers for grammars written in a turing-complete language. 
Like that, we -write grammars in a language, only we do it in Python instead of JavaScript. +One day I read a tweet, asking for a tool which accepted a grammar and an +input file and which then produced simple parsed output, without any kind of +in-between. (There was other ranty stuff about how none of the existing tools +really worked, but that was beside the point.) -Why Python? Because Python 3 is widely pre-installed on MacOS and Linux. This -library requires nothing more than the basic standard library, and not even a -new version of it. Therefore, it turns out to be a pretty light dependency for -a rust or C++ or something kind of project. (Tree-sitter, on the other hand, -requires node, which is a far less stable and available runtime in 2024.) +Upon reading the tweet, it occured to me that I didn't know how LR parsers +worked and how they were generated, except in the broadest of terms. Thus, I +set about writing this, learning as I went. -The parser tables can really be used to power anything. I prefer to make -concrete syntax trees (again, see tree-sitter), and there is no facility at all -for actions or custom ASTs or whatnot. Any such processing needs to be done by -the thing that processes the tables. - -## Making Grammars - -To get started, create a grammar that derives from the `Grammar` class. Create -one method per nonterminal, decorated with the `rule` decorator. 
Here's an -example: - - PLUS = Token('+') - LPAREN = Token('(') - RPAREN = Token(')') - ID = Token('id') - - class SimpleGrammar(Grammar): - @rule - def expression(self): - return seq(self.expression, PLUS, self.term) | self.term - - @rule - def term(self): - return seq(LPAREN, self.expression, RPAREN) | ID - - -## Using grammars - -TODO - -## Representation Choices - -The SimpleGrammar class might seem a little verbose compared to a dense -structure like: - - grammar_simple = [ - ('E', ['E', '+', 'T']), - ('E', ['T']), - ('T', ['(', 'E', ')']), - ('T', ['id']), - ] - -or - - grammar_simple = { - 'E': [ - ['E', '+', 'T'], - ['T'], - ], - 'T': [ - ['(', 'E', ')'], - ['id'], - ], - } - - -The advantage that the class has over a table like this is that you get to have -all of your Python tools help you make sure your grammar is good, if you want -them. e.g., if you're working with an LSP or something, the members give you -autocomplete and jump-to-definition and possibly even type-checking. - -At the very least, if you mis-type the name of a nonterminal, or forget to -implement it, we will immediately raise an error that *INCLUDES THE LOCATION IN -THE SOURCE WHERE THE ERROR WAS MADE.* With tables, we can tell you that you -made a mistake but it's up to you to figure out where you did it. - -### Aside: What about a custom DSL/EBNF like thing? - -Yeah, OK, there's a rich history of writing your grammar in a domain-specific -language. YACC did it, ANTLR does it, GRMTools.... just about everybody except -Tree-Sitter does this. - -But look, I've got several reasons for not doing it. - -First, I'm lazy, and don't want to write yet another parser for my parser. What -tools should I use to write my parser generator parser? I guess I don't have my -parser generator parser yet, so probably a hand-written top down parser? Some -other python parser generator? Ugh! 
- -As an add-on to that, if I make my own format then I need to make tooling for -*that* too: syntax highlighters, jump to definition, the works. Yuck. An -existing language, and a format that builds on an existing language, gets me the -tooling that comes along with that language. If you can leverage that -effictively (and I think I have) then you start way ahead in terms of tooling. - -Second, this whole thing is supposed to be easy to include in an existing -project, and adding a custom compiler doesn't seem to be that. Adding two python -files seems to be about the right speed. - -Thirdly, and this is just hypothetical, it's probably pretty easy to write your -own tooling around a grammar if it's already in Python. If you want to make -railroad diagrams or EBNF pictures or whatever, all the productions are already -right there in data structures for you to process. I've tried to keep them -accessible and at least somewhat easy to work with. There's nothing that says a -DSL-based system *has* to produce unusable intermediate data- certainly there -are some tools that *try*- but with this approach the accessibility and the -ergonomics of the tool go hand in hand. - -## Some History - -The first version of this code was written as an idle exercise to learn how LR -parser table generation even worked. It was... very simple, fairly easy to -follow, and just *incredibly* slow. Like, mind-bogglingly slow. Unusably slow -for anything but the most trivial grammar. - -As a result, when I decided I wanted to use it for a larger grammar, I found that -I just couldn't. So this has been hacked and significantly improved from that -version, now capable of building tables for nontrivial grammars. It could still -be a lot faster, but it meets my needs for now. +This code is not written to be fast, or even efficient, although it runs its +test cases fast enough. 
It was instead written to be easy to follow along +with, so that when I forget how all this works I can come back to the code +and read along and learn all over again. (BTW, the notes I read to learn how all this works are at http://dragonbook.stanford.edu/lecture-notes/Stanford-CS143/. Specifically, @@ -126,17 +20,12 @@ I started with handout 8, 'Bottom-up-parsing', and went from there. (I did eventually have to backtrack a little into handout 7, since that's where First() and Follow() are covered.) -May 2024 -""" +Enjoy! -import abc -import collections -import dataclasses -import enum -import functools -import inspect -import sys -import typing +doty +2016-12-09 +""" +from collections import namedtuple ############################################################################### @@ -144,624 +33,132 @@ import typing # # We start with LR0 parsers, because they form the basis of everything else. ############################################################################### -class Configuration: - """A rule being tracked in a state. That is, a specific position within a - specific rule, with an associated lookahead state. - - We make a *lot* of these and we need/want to pre-cache a ton of things we - ask about so we need to override __init__, otherwise it's immutable and - fixed and doesn't have a dict to save space. - - It also supports hashing and equality and comparison, so it can be sorted - and whatnot. This really is the workhorse data structure of the whole thing. - If you can improve this you can improve the performance of everything probably. +class Configuration( + namedtuple('Configuration', ['name', 'symbols', 'position', 'lookahead']) +): + """A rule being tracked in a state. (Note: technically, lookahead isn't used until we get to LR(1) parsers, but if left at its default it's harmless. Ignore it until you get to the part about LR(1).) 
""" - - __slots__ = ( - "name", - "symbols", - "position", - "lookahead", - "next", - "at_end", - "_vals", - "_hash", - ) - - name: int - symbols: typing.Tuple[int, ...] - position: int - lookahead: typing.Tuple[int, ...] - next: int | None - at_end: bool - - _vals: typing.Tuple - _hash: int - - def __init__(self, name, symbols, position, lookahead) -> None: - self.name = name - self.symbols = symbols - self.position = position - self.lookahead = lookahead - - at_end = position == len(symbols) - self.at_end = at_end - self.next = symbols[position] if not at_end else None - - self._vals = (name, symbols, position, lookahead) - self._hash = hash(self._vals) + __slots__ = () @classmethod - def from_rule(cls, name: int, symbols: typing.Tuple[int, ...], lookahead=()): + def from_rule(cls, rule, lookahead=()): return Configuration( - name=name, - symbols=symbols, + name=rule[0], + symbols=rule[1], position=0, lookahead=lookahead, ) - def __hash__(self) -> int: - return self._hash + @property + def at_end(self): + return self.position == len(self.symbols) - def __eq__(self, value: object, /) -> bool: - if value is self: - return True - if not isinstance(value, Configuration): - return NotImplemented - - return ( - value._hash == self._hash - and value.name == self.name - and value.position == self.position - and value.symbols == self.symbols - and value.lookahead == self.lookahead - ) - - def __lt__(self, value) -> bool: - if not isinstance(value, Configuration): - return NotImplemented - return self._vals < value._vals - - def __gt__(self, value) -> bool: - if not isinstance(value, Configuration): - return NotImplemented - return self._vals > value._vals - - def __le__(self, value) -> bool: - if not isinstance(value, Configuration): - return NotImplemented - return self._vals <= value._vals - - def __ge__(self, value) -> bool: - if not isinstance(value, Configuration): - return NotImplemented - return self._vals >= value._vals - - def replace_position(self, 
new_position): - return Configuration( - name=self.name, - symbols=self.symbols, - position=new_position, - lookahead=self.lookahead, - ) - - def clear_lookahead(self): - return Configuration( - name=self.name, - symbols=self.symbols, - position=self.position, - lookahead=(), - ) + @property + def next(self): + return self.symbols[self.position] if not self.at_end else None @property def rest(self): - return self.symbols[(self.position + 1) :] + return self.symbols[(self.position+1):] - def format(self, alphabet: list[str]) -> str: - la = ", " + str(tuple(alphabet[i] for i in self.lookahead)) if self.lookahead != () else "" + def at_symbol(self, symbol): + return self.next == symbol + + def replace(self, **kwargs): + return self._replace(**kwargs) + + def __str__(self): + la = ", " + str(self.lookahead) if self.lookahead != () else "" return "{name} -> {bits}{lookahead}".format( - name=alphabet[self.name], - bits=" ".join( - [ - "* " + alphabet[sym] if i == self.position else alphabet[sym] - for i, sym in enumerate(self.symbols) - ] - ) - + (" *" if self.at_end else ""), + name=self.name, + bits=' '.join([ + '* ' + sym if i == self.position else sym + for i, sym in enumerate(self.symbols) + ]) + (' *' if self.at_end else ''), lookahead=la, ) -ConfigSet = typing.Tuple[Configuration, ...] - - -class ConfigurationSetInfo: - """When we build a grammar into a table, the first thing we need to do is - generate all the configuration sets and their successors. - - (A configuration set is what it sounds like: an unordered set of - Configuration structures. But we use Tuple because it's hashable and - immutable and small and we order the Tuples so that we get repeatable - results.) - - *This* is structure that tracks the result of that computation. - - (Different generators vary in the details of how they generate this - structure, but they all compute this information.) 
- """ - - config_set_key: dict[ConfigSet, int] # Map a ConfigSet into am index - sets: list[ConfigSet] # Map the index back into a set - - # All the sucessors for all of the sets. `successors[i]` is the mapping - # from grammar symbol to the index of the set you get by processing that - # symbol. - successors: list[dict[int, int]] - - def __init__(self): - self.config_set_key = {} - self.sets = [] - self.successors = [] - - def register_config_set(self, c: ConfigSet) -> typing.Tuple[int, bool]: - """Potentially add a new config set to the set of sets. Returns the - canonical ID of the set within this structure, along with a boolean - indicating whether the set was just added or not. - - (You can use this integer to get the set back, if you need it, and - also access the successors table.) - """ - existing = self.config_set_key.get(c) - if existing is not None: - return existing, False - - index = len(self.sets) - self.sets.append(c) - self.successors.append({}) - self.config_set_key[c] = index - return index, True - - def add_successor(self, c_id: int, symbol: int, successor: int): - """Register sucessor(`c_id`, `symbol`) -> `successor`, where c_id - is the id of the set in this structure, and symbol is the id of a - symbol in the alphabet of the grammar. - """ - self.successors[c_id][symbol] = successor - - def find_path_to_set(self, target_set: ConfigSet) -> list[int]: - """Trace the path of grammar symbols from the first set (which always - set 0) to the target set. This is useful in conflict reporting, - because we'll be *at* a ConfigSet and want to show the grammar symbols - that get us to where we found the conflict. - - The return value is a list of grammar symbols to get to the specified - ConfigSet. - - This function raises KeyError if no path is found. 
- """ - target_index = self.config_set_key[target_set] - visited = set() - - queue: collections.deque = collections.deque() - queue.appendleft((0, [])) - while len(queue) > 0: - set_index, path = queue.pop() - if set_index == target_index: - return path - - if set_index in visited: - continue - visited.add(set_index) - - for symbol, successor in self.successors[set_index].items(): - queue.appendleft((successor, path + [symbol])) - - raise KeyError("Unable to find a path to the target set!") - - -class Assoc(enum.Enum): - """Associativity of a rule.""" - - NONE = 0 - LEFT = 1 - RIGHT = 2 - - -class ErrorCollection: - """A collection of errors. The errors are grouped by config set and alphabet - symbol, so that we can group the error strings appropriately when we format - the error. - """ - - errors: dict[ConfigSet, dict[int, dict[Configuration, typing.Tuple]]] - - def __init__(self): - self.errors = {} - - def any(self) -> bool: - """Return True if there are any errors in this collection.""" - return len(self.errors) > 0 - - def add_error( - self, - config_set: ConfigSet, - symbol: int, - config: Configuration, - action: typing.Tuple, - ): - """Add an error to the collection. - - config_set is the set with the error. - symbol is the symbol we saw when we saw the error. - config is the configuration that we were in when we saw the error. - action is what we were trying to do. - - (This all makes more sense from inside the TableBuilder.) - """ - set_errors = self.errors.get(config_set) - if set_errors is None: - set_errors = {} - self.errors[config_set] = set_errors - - symbol_errors = set_errors.get(symbol) - if symbol_errors is None: - symbol_errors = {} - set_errors[symbol] = symbol_errors - - symbol_errors[config] = action - - def format( - self, - alphabet: list[str], - all_sets: ConfigurationSetInfo, - ) -> str | None: - """Format all the errors into a string, or return None if there are no - errors. 
- - We need the alphabet to turn all these integers into something human - readable, and all the sets to trace a path to where the errors were - encountered. - """ - if len(self.errors) is None: - return None - - errors = [] - for config_set, set_errors in self.errors.items(): - path = all_sets.find_path_to_set(config_set) - path_str = " ".join(alphabet[s] for s in path) - - for symbol, symbol_errors in set_errors.items(): - lines = [] - lines.append( - f"When we have parsed '{path_str}' and see '{alphabet[symbol]}' we don't know whether:" - ) - for config, action in symbol_errors.items(): - name = alphabet[config.name] - rule = " ".join( - f"{'* ' if config.position == i else ''}{alphabet[s]}" - for i, s in enumerate(config.symbols) - ) - if config.next is None: - rule += " *" - - if action[0] == "reduce": - action_str = f"pop {action[2]} values off the stack and make a {action[1]}" - elif action[0] == "shift": - action_str = "consume the token and keep going" - elif action[0] == "accept": - action_str = "accept the parse" - else: - assert action[0] == "goto", f"Unknown action {action[0]}" - raise Exception("Shouldn't conflict on goto ever") - - lines.append( - f" - We are in the rule `{name}: {rule}` and we should {action_str}" - ) - - errors.append("\n".join(lines)) - - return "\n\n".join(errors) - - -class TableBuilder(object): - """A helper object to assemble actions into build parse tables. - - This is a builder type thing: call `new_row` at the start of - each row, then `flush` when you're done with the last row. - """ - - errors: ErrorCollection - table: list[dict[str, typing.Tuple]] - alphabet: list[str] - precedence: typing.Tuple[typing.Tuple[Assoc, int], ...] 
- row: None | list[typing.Tuple[None | typing.Tuple, None | Configuration]] - - def __init__( - self, - alphabet: list[str], - precedence: typing.Tuple[typing.Tuple[Assoc, int], ...], - ): - self.errors = ErrorCollection() - self.table = [] - self.alphabet = alphabet - self.precedence = precedence - self.row = None - - def flush(self, all_sets: ConfigurationSetInfo) -> list[dict[str, typing.Tuple]]: - """Finish building the table and return it. - - Raises ValueError if there were any conflicts during construction. - """ - self._flush_row() - if self.errors.any(): - errors = self.errors.format(self.alphabet, all_sets) - raise ValueError(f"Errors building the table:\n\n{errors}") - return self.table - - def new_row(self, config_set: ConfigSet): - """Start a new row, processing the given config set. Call this before - doing anything else. - """ - self._flush_row() - self.row = [(None, None) for _ in self.alphabet] - self.current_config_set = config_set - - def _flush_row(self): - if self.row: - actions = {self.alphabet[k]: v[0] for k, v in enumerate(self.row) if v[0] is not None} - self.table.append(actions) - - def set_table_reduce(self, symbol: int, config: Configuration): - """Mark a reduce of the given configuration for the given symbol in the - current row. - """ - action = ("reduce", self.alphabet[config.name], len(config.symbols)) - self._set_table_action(symbol, action, config) - - def set_table_accept(self, symbol: int, config: Configuration): - """Mark a accept of the given configuration for the given symbol in the - current row. - """ - action = ("accept",) - self._set_table_action(symbol, action, config) - - def set_table_shift(self, symbol: int, index: int, config: Configuration): - """Mark a shift in the current row of the given given symbol to the - given index. The configuration here provides debugging informtion for - conflicts. 
- """ - action = ("shift", index) - self._set_table_action(symbol, action, config) - - def set_table_goto(self, symbol: int, index: int): - """Set the goto for the given nonterminal symbol in the current row.""" - action = ("goto", index) - self._set_table_action(symbol, action, None) - - def _action_precedence(self, symbol: int, action: typing.Tuple, config: Configuration): - if action[0] == "shift": - return self.precedence[symbol] - else: - return self.precedence[config.name] - - def _set_table_action(self, symbol_id: int, action: typing.Tuple, config: Configuration | None): - """Set the action for 'symbol' in the table row to 'action'. - - This is destructive; it changes the table. It records an error if - there is already an action for the symbol in the row. - """ - assert isinstance(symbol_id, int) - - assert self.row is not None - existing, existing_config = self.row[symbol_id] - if existing is not None and existing != action: - assert existing_config is not None - assert config is not None - - existing_assoc, existing_prec = self._action_precedence( - symbol_id, existing, existing_config - ) - new_assoc, new_prec = self._action_precedence(symbol_id, action, config) - - if existing_prec > new_prec: - # Precedence of the action in the table already wins, do nothing. - return - - elif existing_prec == new_prec: - # It's an actual conflict, use associativity if we can. - # If there's a conflict in associativity then it's a real conflict! 
- assoc = Assoc.NONE - if existing_assoc == Assoc.NONE: - assoc = new_assoc - elif new_assoc == Assoc.NONE: - assoc = existing_assoc - elif new_assoc == existing_assoc: - assoc = new_assoc - - resolved = False - if assoc == Assoc.LEFT: - # Prefer reduce over shift - if action[0] == "shift" and existing[0] == "reduce": - action = existing - resolved = True - elif action[0] == "reduce" and existing[0] == "shift": - resolved = True - - elif assoc == Assoc.RIGHT: - # Prefer shift over reduce - if action[0] == "shift" and existing[0] == "reduce": - resolved = True - elif action[0] == "reduce" and existing[0] == "shift": - action = existing - resolved = True - - if not resolved: - # Record the conflicts. - self.errors.add_error( - self.current_config_set, symbol_id, existing_config, existing - ) - self.errors.add_error(self.current_config_set, symbol_id, config, action) - - else: - # Precedence of the new action is greater than the existing - # action, just allow the overwrite with no change. - pass - - self.row[symbol_id] = (action, config) - - class GenerateLR0(object): - """Generate parser tables for an LR0 parser.""" + """Generate parser tables for an LR0 parser. - # Internally we use integers as symbols, not strings. Mostly this is fine, - # but when we need to map back from integer to string we index this list. - alphabet: list[str] + The input grammars are of the form: - # The grammar we work with. The outer list is indexed by grammar symbol, - # terminal *and* non-terminal. The inner list is the list of productions - # for the given nonterminal symbol. (If you have a terminal `t` and look it - # up you'll just get an empty list.) - grammar: list[list[typing.Tuple[int, ...]]] + grammar_simple = [ + ('E', ['E', '+', 'T']), + ('E', ['T']), + ('T', ['(', 'E', ')']), + ('T', ['id']), + ] - # nonterminal[i] is True if alphabet[i] is a nonterminal. - nonterminal: typing.Tuple[bool, ...] - # The complement of nonterminal. 
terminal[i] is True if alphabet[i] is a - # terminal. - terminal: typing.Tuple[bool, ...] + Which is to say, they are a list of productions. Each production is a + tuple where the first element of the tuple is the name of the + non-terminal being added, and the second elment of the tuple is the + list of terminals and non-terminals that make up the production. - # The precedence of every symbol. If no precedence was explicitly provided - # for a symbol, then its entry in this tuple will be (NONE, 0). - precedence: typing.Tuple[typing.Tuple[Assoc, int], ...] + There is currently no support for custom actions or alternation or + anything like that. If you want alternations that you'll have to lower + the grammar by hand into the simpler form first. - # The lookup that maps a particular symbol to an integer. (Only really used - # for debugging.) - symbol_key: dict[str, int] - # The start symbol of the grammar. - start_symbol: int - # The end symbol of the grammar. - end_symbol: int + Don't name anything with double-underscores; those are reserved for + the generator. Don't add '$' either, as it is reserved to mean + end-of-stream. Use an empty list to indicate nullability, that is: - config_sets_key: dict[ConfigSet, int] - successors: list[set[int]] + ('O', []), - def __init__( - self, - start: str, - grammar: list[typing.Tuple[str, list[str]]], - precedence: None | dict[str, typing.Tuple[Assoc, int]] = None, - ): + means that O can be matched with nothing. + + Implementation notes: + - This is implemented in the dumbest way possible, in order to be the + most understandable it can be. I built this to learn, and I want to + make sure I can keep learning with it. + + - We tend to use tuples everywhere. This is because tuples can be + compared for equality and put into tables and all that jazz. They might + be a little bit slower in places but like I said, this is for + learning. 
(Also, if we need this to run faster we can probably go a + long way by memoizing results, which is much easier if we have tuples + everywhere.) + """ + def __init__(self, start, grammar): """Initialize the parser generator with the specified grammar and start symbol. - - The input grammars are of the form: - - grammar_simple = [ - ('E', ['E', '+', 'T']), - ('E', ['T']), - ('T', ['(', 'E', ')']), - ('T', ['id']), - ] - - Which is to say, they are a list of productions. Each production is a - tuple where the first element of the tuple is the name of the - non-terminal being added, and the second elment of the tuple is the - list of terminals and non-terminals that make up the production. - - There is currently no support for custom actions or alternation or - anything like that. If you want alternations that you'll have to lower - the grammar by hand into the simpler form first. - - Don't name anything with double-underscores; those are reserved for - the generator. Don't add '$' either, as it is reserved to mean - end-of-stream. Use an empty list to indicate nullability, that is: - - ('O', []), - - means that O can be matched with nothing. - - This isn't a *great* way to author these things, but it is very simple - and flexible. You probably don't want to author this on your own; see - the Grammar class for a high-level API. - - The precedence dictionary, if provided, maps a given symbol to an - associativity and a precedence. Any symbol not in the dictionary is - presumed to have an associativity of NONE and a precedence of zero. """ - - # Work out the alphabet. - alphabet = set() - for name, rule in grammar: - alphabet.add(name) - alphabet.update(symbol for symbol in rule) + # We always store the "augmented" grammar, which contains an initial + # production for the start state. grammar[0] is always the start + # rule, and in the set of states and table and whatever the first + # element is always the starting state/position. 
+ self.grammar = [('__start', [start])] + grammar + self.nonterminals = {rule[0] for rule in grammar} + self.terminals = { + sym + for name, symbols in grammar + for sym in symbols + if sym not in self.nonterminals + } + self.alphabet = self.terminals | self.nonterminals # Check to make sure they didn't use anything that will give us # heartburn later. - reserved = [a for a in alphabet if a.startswith("__") or a == "$"] + reserved = [a for a in self.alphabet if a.startswith('__') or a == '$'] if reserved: raise ValueError( "Can't use {symbols} in grammars, {what} reserved.".format( - symbols=" or ".join(reserved), + symbols=' or '.join(reserved), what="it's" if len(reserved) == 1 else "they're", ) ) - alphabet.add("__start") - alphabet.add("$") - self.alphabet = list(sorted(alphabet)) + self.terminals.add('$') + self.alphabet.add('$') - symbol_key = {symbol: index for index, symbol in enumerate(self.alphabet)} - - start_symbol = symbol_key["__start"] - end_symbol = symbol_key["$"] - - assert self.alphabet[start_symbol] == "__start" - assert self.alphabet[end_symbol] == "$" - - # Turn the incoming grammar into a dictionary, indexed by nonterminal. - # - # We count on python dictionaries retaining the insertion order, like - # it or not. 
- full_grammar: list[list] = [list() for _ in self.alphabet] - terminal: list[bool] = [True for _ in self.alphabet] - assert terminal[end_symbol] - - nonterminal = [False for _ in self.alphabet] - - for name, rule in grammar: - name_symbol = symbol_key[name] - - terminal[name_symbol] = False - nonterminal[name_symbol] = True - - rules = full_grammar[name_symbol] - rules.append(tuple(symbol_key[symbol] for symbol in rule)) - - self.grammar = full_grammar - self.grammar[start_symbol].append((symbol_key[start],)) - terminal[start_symbol] = False - nonterminal[start_symbol] = True - - self.terminal = tuple(terminal) - self.nonterminal = tuple(nonterminal) - - assert self.terminal[end_symbol] - assert self.nonterminal[start_symbol] - - if precedence is None: - precedence = {} - self.precedence = tuple(precedence.get(a, (Assoc.NONE, 0)) for a in self.alphabet) - - self.symbol_key = symbol_key - self.start_symbol = start_symbol - self.end_symbol = end_symbol - - @functools.cache - def gen_closure_next(self, config: Configuration): - """Return the next set of configurations in the closure for config. + def gen_closure_next(self, config): + """Return the next set of configurations in the closure for + config. If the position for config is just before a non-terminal, then the next set of configurations is configurations for all of the @@ -769,117 +166,96 @@ class GenerateLR0(object): beginning. (If the position for config is just before a terminal, or at the end of the production, then the next set is empty.) """ - next = config.next - if next is None: + if config.at_end: return () else: - return tuple(Configuration.from_rule(next, rule) for rule in self.grammar[next]) + return tuple( + Configuration.from_rule(rule) + for rule in self.grammar + if rule[0] == config.next + ) - def gen_closure(self, seeds: typing.Iterable[Configuration]) -> ConfigSet: - """Compute the closure for the specified configs. The closure is all - of the configurations we could be in. 
Specifically, if the position - for a config is just before a non-terminal then we must also consider - configurations where the rule is the rule for the non-terminal and - the position is just before the beginning of the rule. + def gen_closure(self, config, closure): + """Compute the closure for the specified config and unify it with the + existing closure. - (We have replaced a recursive version with an iterative one.) + If the provided config is already in the closure then nothing is + done. (We assume that the closure of the config is *also* already in + the closure.) """ - closure = set() - pending = list(seeds) - pending_next = [] - while len(pending) > 0: - for config in pending: - if config in closure: - continue + if config in closure: + return closure + else: + new_closure = tuple(closure) + (config,) + for next_config in self.gen_closure_next(config): + new_closure = self.gen_closure(next_config, new_closure) + return new_closure - closure.add(config) - for next_config in self.gen_closure_next(config): - pending_next.append(next_config) - - temp = pending - pending = pending_next - pending_next = temp - pending_next.clear() - - return tuple(sorted(closure)) # TODO: Why tuple? - - def gen_successor(self, config_set: typing.Iterable[Configuration], symbol: int) -> ConfigSet: + def gen_successor(self, config_set, symbol): """Compute the successor state for the given config set and the given symbol. The successor represents the next state of the parser after seeing the symbol. 
""" - seeds = tuple( - config.replace_position(config.position + 1) + seeds = [ + config.replace(position=config.position + 1) for config in config_set - if config.next == symbol - ) + if config.at_symbol(symbol) + ] + + closure = () + for seed in seeds: + closure = self.gen_closure(seed, closure) - closure = self.gen_closure(seeds) return closure - def gen_all_successors( - self, config_set: typing.Iterable[Configuration] - ) -> list[typing.Tuple[int, ConfigSet]]: - """Return all of the non-empty successors for the given config set. - - (That is, given the config set, pretend we see all the symbols we - could possibly see, and figure out which configs sets we get from - those symbols. Those are the successors of this set.) - """ - possible = tuple(sorted({config.next for config in config_set if config.next is not None})) - + def gen_all_successors(self, config_set): + """Return all of the non-empty successors for the given config set.""" next = [] - for symbol in possible: + for symbol in self.alphabet: successor = self.gen_successor(config_set, symbol) if len(successor) > 0: - next.append((symbol, successor)) + next.append(successor) - return next + return tuple(next) - def gen_sets(self, config_set: typing.Tuple[Configuration, ...]) -> ConfigurationSetInfo: - """Generate all configuration sets starting from the provided set.""" - result = ConfigurationSetInfo() + def gen_sets(self, config_set, F): + """Recursively generate all configuration sets starting from the + provided set, and merge them with the provided set 'F'. 
+ """ + if config_set in F: + return F + else: + new_F = F + (config_set,) + for successor in self.gen_all_successors(config_set): + new_F = self.gen_sets(successor, new_F) - successors = [] - pending = [config_set] - pending_next = [] - while len(pending) > 0: - for config_set in pending: - id, is_new = result.register_config_set(config_set) - if is_new: - for symbol, successor in self.gen_all_successors(config_set): - successors.append((id, symbol, successor)) - pending_next.append(successor) + return new_F - temp = pending - pending = pending_next - pending_next = temp - pending_next.clear() - - for id, symbol, successor in successors: - result.add_successor(id, symbol, result.config_set_key[successor]) - - return result - - def gen_all_sets(self) -> ConfigurationSetInfo: + def gen_all_sets(self): """Generate all of the configuration sets for the grammar.""" - seeds = tuple( - Configuration.from_rule(self.start_symbol, rule) - for rule in self.grammar[self.start_symbol] + initial_set = self.gen_closure( + Configuration.from_rule(self.grammar[0]), + (), ) - initial_set = self.gen_closure(seeds) - return self.gen_sets(initial_set) + return self.gen_sets(initial_set, ()) - def gen_reduce_set(self, config: Configuration) -> typing.Iterable[int]: + def find_set_index(self, sets, set): + """Find the specified set in the set of sets, and return the + index, or None if it is not found. + """ + for i, s in enumerate(sets): + if s == set: + return i + return None + + def gen_reduce_set(self, config): """Return the set of symbols that indicate we should reduce the given configuration. - In an LR0 parser, this is just the set of all terminals. - """ - del config - return [index for index, value in enumerate(self.terminal) if value] + In an LR0 parser, this is just the set of all terminals.""" + return self.terminals def gen_table(self): """Generate the parse table. @@ -909,32 +285,89 @@ class GenerateLR0(object): Anything missing from the row indicates an error. 
""" + action_table = [] config_sets = self.gen_all_sets() - builder = TableBuilder(self.alphabet, self.precedence) - - for config_set_id, config_set in enumerate(config_sets.sets): - builder.new_row(config_set) - successors = config_sets.successors[config_set_id] + for config_set in config_sets: + actions = {} + # Actions for config in config_set: - config_next = config.next - if config_next is None: - if config.name != self.start_symbol: + if config.at_end: + if config.name != '__start': for a in self.gen_reduce_set(config): - builder.set_table_reduce(a, config) + self.set_table_action( + actions, + a, + ('reduce', config.name, len(config.symbols)), + config, + ) else: - builder.set_table_accept(self.end_symbol, config) + self.set_table_action( + actions, + '$', + ('accept',), + config, + ) - elif self.terminal[config_next]: - index = successors[config_next] - builder.set_table_shift(config_next, index, config) + else: + if config.next in self.terminals: + successor = self.gen_successor(config_set, config.next) + index = self.find_set_index(config_sets, successor) + self.set_table_action( + actions, + config.next, + ('shift', index), + config, + ) # Gotos - for symbol, index in successors.items(): - if self.nonterminal[symbol]: - builder.set_table_goto(symbol, index) + for symbol in self.nonterminals: + successor = self.gen_successor(config_set, symbol) + index = self.find_set_index(config_sets, successor) + if index is not None: + self.set_table_action( + actions, + symbol, + ('goto', index), + None, + ) - return builder.flush(config_sets) + # set_table_action stores the configs that generated the actions in + # the table, for diagnostic purposes. This filters them out again + # so that the parser has something clean to work with. 
+ actions = {k: self.get_table_action(actions, k) for k in actions} + action_table.append(actions) + + return action_table + + def set_table_action(self, row, symbol, action, config): + """Set the action for 'symbol' in the table row to 'action'. + + This is destructive; it changes the table. It raises an error if + there is already an action for the symbol in the row. + """ + existing, existing_config = row.get(symbol, (None, None)) + if existing is not None and existing != action: + config_old = str(existing_config) + config_new = str(config) + max_len = max(len(config_old), len(config_new)) + 1 + error = ( + "Conflicting actions for token '{symbol}':\n" + " {config_old: <{max_len}}: {old}\n" + " {config_new: <{max_len}}: {new}\n".format( + config_old=config_old, + config_new=config_new, + max_len=max_len, + old=existing, + new=action, + symbol=symbol, + ) + ) + raise ValueError(error) + row[symbol] = (action, config) + + def get_table_action(self, row, symbol): + return row[symbol][0] def parse(table, input, trace=False): @@ -947,53 +380,48 @@ def parse(table, input, trace=False): input is a list of tokens. Don't stick an end-of-stream marker, I'll stick one on for you. - - This is not a *great* parser, it's really just a demo for what you can - do with the table. """ - assert "$" not in input - input = input + ["$"] + assert '$' not in input + input = input + ['$'] input_index = 0 # Our stack is a stack of tuples, where the first entry is the state number # and the second entry is the 'value' that was generated when the state was # pushed. 
- stack: list[typing.Tuple[int, typing.Any]] = [(0, None)] + stack = [(0, None)] while True: current_state = stack[-1][0] current_token = input[input_index] - action = table[current_state].get(current_token, ("error",)) + action = table[current_state].get(current_token, ('error',)) if trace: - print( - "{stack: <20} {input: <50} {action: <5}".format( - stack=repr([s[0] for s in stack]), - input=repr(input[input_index:]), - action=repr(action), - ) - ) + print("{stack: <20} {input: <50} {action: <5}".format( + stack=repr([s[0] for s in stack]), + input=repr(input[input_index:]), + action=repr(action) + )) - if action[0] == "accept": + if action[0] == 'accept': return stack[-1][1] - elif action[0] == "reduce": + elif action[0] == 'reduce': name = action[1] size = action[2] value = (name, tuple(s[1] for s in stack[-size:])) stack = stack[:-size] - goto = table[stack[-1][0]].get(name, ("error",)) - assert goto[0] == "goto" # Corrupt table? + goto = table[stack[-1][0]].get(name, ('error',)) + assert goto[0] == 'goto' # Corrupt table? stack.append((goto[1], value)) - elif action[0] == "shift": + elif action[0] == 'shift': stack.append((action[1], (current_token, ()))) input_index += 1 - elif action[0] == "error": + elif action[0] == 'error': raise ValueError( - "Syntax error: unexpected symbol {sym}".format( + 'Syntax error: unexpected symbol {sym}'.format( sym=current_token, ), ) @@ -1002,228 +430,6 @@ def parse(table, input, trace=False): ############################################################################### # SLR(1) ############################################################################### -def update_changed(items: set[int], other: set[int]) -> bool: - """Merge the `other` set into the `items` set, and return True if this - changed the items set. - """ - old_len = len(items) - items.update(other) - return old_len != len(items) - - -@dataclasses.dataclass(frozen=True) -class FirstInfo: - """A structure that tracks the first set of a grammar. 
(Or, as it is - commonly styled in textbooks, FIRST.) - - firsts[s] is the set of first terminals of any particular nonterminal s. - (For a terminal , firsts[s] == s.) - - is_epsilon[s] is True if the nonterminal s can be empty, that is, if - it can match zero symbols. - - For example, consider following grammar: - - [ - ('x', ['y', 'A']), - ('y', ['z']), - ('y', ['B', 'x']), - ('y', []), - ('z', ['C']), - ('z', ['D', x]), - ] - - For this grammar, FIRST['z'] is ('C', 'D'). - - FIRST['y'] is ('B', 'C', 'D'). For the first production, 'z' is first, and - since 'z' is a nonterminal we need to include all of its symbols too, - transitively. For the second production, 'B' is first, and so that gets - added to the set. The last production doesn't have anything in it, so it - doesn't contribute to FIRST['y'], but it does set `is_epsilon` to True. - - Finally, FIRST['x'] is ('A', 'B', 'C', 'D'). ('B', 'C', 'D') comes from - FIRST['y'], as 'y' is first in our only production. But the 'A' comes from - the fact that is_epsilon['y'] is True: since 'y' can match empty input, - it is also legal for 'x' to begin with 'A'. - """ - - firsts: list[set[int]] - is_epsilon: list[bool] - - @classmethod - def from_grammar( - cls, - grammar: list[list[typing.Tuple[int, ...]]], - terminal: typing.Tuple[bool, ...], - ) -> "FirstInfo": - """Construct a new FirstInfo from the specified grammar. - - terminal[s] is True if symbol s is a terminal symbol. - """ - # Add all terminals to their own firsts - firsts: list[set[int]] = [] - for index, is_terminal in enumerate(terminal): - firsts.append(set()) - if is_terminal: - firsts[index].add(index) - - # Because we're working with recursive and mutually recursive rules, we - # need to make sure we terminate once we've actually found all the first - # symbols. Naive recursion will go forever, and recursion with a visited - # set to halt recursion ends up revisiting the same symbols over and - # over, running *very* slowly. 
Strangely, iteration to fixed-point turns - # out to be reasonably quick in practice, and is what every other parser - # generator uses in the end. - epsilons = [False for _ in terminal] - changed = True - while changed: - changed = False - for name, rules in enumerate(grammar): - f = firsts[name] - for rule in rules: - if len(rule) == 0: - changed = changed or not epsilons[name] - epsilons[name] = True - continue - - for index, symbol in enumerate(rule): - other_firsts = firsts[symbol] - changed = update_changed(f, other_firsts) or changed - - is_last = index == len(rule) - 1 - if is_last and epsilons[symbol]: - # If this is the last symbol and the last - # symbol can be empty then I can be empty - # too! :P - changed = changed or not epsilons[name] - epsilons[name] = True - - if not epsilons[symbol]: - # If we believe that there is at least one - # terminal in the first set of this - # nonterminal then I don't have to keep - # looping through the symbols in this rule. - break - - return FirstInfo(firsts=firsts, is_epsilon=epsilons) - - -@dataclasses.dataclass(frozen=True) -class FollowInfo: - """A structure that tracks the follow set of a grammar. (Or, again, as the - textbooks would have it, FOLLOW.) - - The follow set for a nonterminal is the set of terminals that can follow the - nonterminal in a valid sentence. The resulting set never contains epsilon - and is never empty, since we should always at least ground out at '$', which - is the end-of-stream marker. - - In order to compute follow, we need to find every place that a given - nonterminal appears in the grammar, and look at the first set of the symbol - that follows it. But if the first set of the symbol that follows it includes - epsilon, then we need to include the first of the symbol after *that*, and - so forth, until we finally either get to the end of the rule or we find some - symbol whose first doesn't include epsilon. 
- - If we get to the end of the rule before finding a symbol that doesn't include - epsilon, then we also need to include the follow of the nonterminal that - contains the rule itself. (Anything that follows this rule can follow the - symbol we're considering.) - - Consider this nonsense grammar: - - [ - ('s', ['x', 'A']), - - ('x', ['y', 'B']), - ('x', ['y', 'z']), - - ('y', ['x', 'C']), - - ('z', ['D']), - ('z', []), - ] - - In this grammar, FOLLOW['y'] is ('A', 'B', 'D'). 'B' comes from the first - production of 'x', that's easy. 'D' comes from the second production of 'x': - FIRST['z'] is ('D'), and so that goes into FOLLOW['y']. - - 'A' is the surprising one: it comes from the fact that FIRST['z'] contains - epsilon. Since 'z' can successfully match on empty input, we need to treat - 'y' as if it were at the end of 'x'. Anything that can follow 'x' can also - follow 'y'. Since 'A' is in FOLLOW['x'] (from the production 's'), then 'A' - is also in FOLLOW['y']. - - Note that the follow set of any nonterminal is never empty and never - contains epsilon: they all terminate at the end-of-stream marker eventually, - by construction. (The individual parser generators make sure to augment the - grammar so that this is true, and that's a main reason why they do it.) - """ - - follows: list[set[int]] - - @classmethod - def from_grammar( - cls, - grammar: list[list[typing.Tuple[int, ...]]], - terminal: typing.Tuple[bool, ...], - start_symbol: int, - end_symbol: int, - firsts: FirstInfo, - ): - follows: list[set[int]] = [set() for _ in grammar] - follows[start_symbol].add(end_symbol) - - # See the comment in FirstInfo for why this is the way it is, more or - # less. Iteration to fixed point handlily beats recursion with - # memoization. I'm as shocked and dismayed as you as you are, but it's - # nice to remember that fixed-point algorithms are good sometimes. 
- changed = True - while changed: - changed = False - for name, rules in enumerate(grammar): - for rule in rules: - # To do this more efficiently, we actually walk backwards - # through the rule. As long as we've still seen something - # with epsilon, then we need to add FOLLOW[name] to - # FOLLOW[symbol]. As soon as we see something *without* - # epsilon, we can stop doing that. (This is *way* more - # efficient than trying to figure out epsilon while walking - # forward.) - epsilon = True - prev_symbol = None - for symbol in reversed(rule): - f = follows[symbol] - if terminal[symbol]: - # This particular rule can't produce epsilon. - epsilon = False - prev_symbol = symbol - continue - - # While epsilon is still set, update the follow of - # this nonterminal with the follow of the production - # we're processing. (This also means that the follow - # of the last symbol in the production is the follow - # of the entire production, as it should be.) - if epsilon: - changed = update_changed(f, follows[name]) or changed - - # If we're not at the end of the list then the follow - # of the current symbol contains the first of the - # next symbol. - if prev_symbol is not None: - changed = update_changed(f, firsts.firsts[prev_symbol]) or changed - - # Now if there's no epsilon in this symbol there's no - # more epsilon in the rest of the sequence. - if not firsts.is_epsilon[symbol]: - epsilon = False - - prev_symbol = symbol - - return FollowInfo(follows=follows) - - class GenerateSLR1(GenerateLR0): """Generate parse tables for SLR1 grammars. @@ -1234,48 +440,115 @@ class GenerateSLR1(GenerateLR0): non-terminal. That means SLR1 parsers need to know how to generate 'follow(A)', which - means they need to know how to generate 'first(A)'. See FirstInfo and - FollowInfo for the details on how this is computed. + means they need to know how to generate 'first(A)', which is most of the + code in this class. 
""" + def gen_first_symbol(self, symbol, visited): + """Compute the first set for a single symbol. - _firsts: FirstInfo - _follows: FollowInfo + If a symbol can be empty, then the set contains epsilon, which we + represent as python's `None`. - def __init__(self, *args, **kwargs): - """See the constructor of GenerateLR0 for an explanation of the - parameters to the constructor and what they mean. + The first set is the set of tokens that can appear as the first token + for a given symbol. (Obviously, if the symbol is itself a token, then + this is trivial.) + + 'visited' is a set of already visited symbols, to stop infinite + recursion on left-recursive grammars. That means that sometimes this + function can return an empty tuple. Don't confuse that with a tuple + containing epsilon: that's a tuple containing `None`, not an empty + tuple. """ - super().__init__(*args, **kwargs) + if symbol in self.terminals: + return (symbol,) + elif symbol in visited: + return () + else: + assert symbol in self.nonterminals + visited.add(symbol) - # We store the firsts not because we need them here, but because LR1 - # and LALR need them. - self._firsts = FirstInfo.from_grammar(self.grammar, self.terminal) - self._follows = FollowInfo.from_grammar( - self.grammar, - self.terminal, - self.start_symbol, - self.end_symbol, - self._firsts, - ) + # All the firsts from all the productions. + firsts = [ + self.gen_first(rule[1], visited) + for rule in self.grammar + if rule[0] == symbol + ] - def gen_follow(self, symbol: int) -> set[int]: + result = () + for fs in firsts: + result = result + tuple(f for f in fs if f not in result) + + return tuple(sorted(result)) + + def gen_first(self, symbols, visited=None): + """Compute the first set for a sequence of symbols. + + The first set is the set of tokens that can appear as the first token + for this sequence of symbols. 
The interesting wrinkle in computing the + first set for a sequence of symbols is that we keep computing the first + sets so long as epsilon appears in the set. i.e., if we are computing + for ['A', 'B', 'C'] and the first set of 'A' contains epsilon, then the + first set for the *sequence* also contains the first set of ['B', 'C'], + since 'A' could be missing entirely. + + An epsilon in the result is indicated by 'None'. There will always be + at least one element in the result. + + The 'visited' parameter, if not None, is a set of symbols that are + already in the process of being evaluated, to deal with left-recursive + grammars. (See gen_first_symbol for more.) + """ + if len(symbols) == 0: + return (None,) # Epsilon. + else: + if visited is None: + visited = set() + result = self.gen_first_symbol(symbols[0], visited) + if None in result: + result = tuple(s for s in result if s is not None) + result = result + self.gen_first(symbols[1:], visited) + result = tuple(sorted(set(result))) + return result + + def gen_follow(self, symbol, visited=None): """Generate the follow set for the given nonterminal. The follow set for a nonterminal is the set of terminals that can follow the nonterminal in a valid sentence. The resulting set never contains epsilon and is never empty, since we should always at least ground out at '$', which is the end-of-stream marker. - - See FollowInfo for more information on how this is determined. """ - return self._follows.follows[symbol] + if symbol == '__start': + return tuple('$') - def gen_reduce_set(self, config: Configuration) -> typing.Iterable[int]: + assert symbol in self.nonterminals + + # Deal with left-recursion. 
+ if visited is None: + visited = set() + if symbol in visited: + return () + visited.add(symbol) + + follow = () + for production in self.grammar: + for index, prod_symbol in enumerate(production[1]): + if prod_symbol != symbol: + continue + + first = self.gen_first(production[1][index+1:]) + follow = follow + tuple(f for f in first if f is not None) + if None in first: + follow = follow + self.gen_follow(production[0], visited) + + assert None not in follow # Should always ground out at __start + return follow + + def gen_reduce_set(self, config): """Return the set of symbols that indicate we should reduce the given config. - In an SLR1 parser, this is the follow set of the config nonterminal. - """ + In an SLR1 parser, this is the follow set of the config nonterminal.""" return self.gen_follow(config.name) @@ -1290,39 +563,16 @@ class GenerateLR1(GenerateSLR1): details. (Except for the start configuration, which has '$' as its lookahead.) """ - - def gen_first(self, symbols: typing.Iterable[int]) -> typing.Tuple[set[int], bool]: - """Return the first set for a *sequence* of symbols. - - (This is more than FIRST: we need to know the first thing that can - happen in this particular sequence right here.) - - Build the set by combining the first sets of the symbols from left to - right as long as epsilon remains in the first set. If we reach the end - and every symbol has had epsilon, then this set also has epsilon. - - Otherwise we can stop as soon as we get to a non-epsilon first(), and - our result does not have epsilon. - """ - result = set() - for s in symbols: - result.update(self._firsts.firsts[s]) - if not self._firsts.is_epsilon[s]: - return (result, False) - - return (result, True) - - def gen_reduce_set(self, config: Configuration) -> typing.Iterable[int]: + def gen_reduce_set(self, config): """Return the set of symbols that indicate we should reduce the given config. - In an LR1 parser, this is the lookahead of the configuration. 
- """ + In an LR1 parser, this is the lookahead of the configuration.""" return config.lookahead - @functools.cache - def gen_closure_next(self, config: Configuration): - """Return the next set of configurations in the closure for config. + def gen_closure_next(self, config): + """Return the next set of configurations in the closure for + config. In LR1 parsers, we must compute the lookahead for the configurations we're adding to the closure. The lookahead for the new configurations @@ -1333,21 +583,29 @@ class GenerateLR1(GenerateSLR1): from an upstream production in the grammar.) (See the documentation in GenerateLR0 for more information on how - this function fits into the whole process, specifically `gen_closure`.) + this function fits into the whole process.) """ - config_next = config.next - if config_next is None: + if config.at_end: return () else: next = [] - for rule in self.grammar[config_next]: - lookahead, epsilon = self.gen_first(config.rest) - if epsilon: - lookahead.update(config.lookahead) - lookahead_tuple = tuple(sorted(lookahead)) - next.append(Configuration.from_rule(config_next, rule, lookahead=lookahead_tuple)) + for rule in self.grammar: + if rule[0] != config.next: + continue - return tuple(sorted(next)) + # N.B.: We can't just append config.lookahead to config.rest + # and compute first(), because lookahead is a *set*. So + # in this case we just say if 'first' contains epsilon, + # then we need to remove the epsilon and union with the + # existing lookahead. + lookahead = self.gen_first(config.rest) + if None in lookahead: + lookahead = tuple(l for l in lookahead if l is not None) + lookahead = lookahead + config.lookahead + lookahead = tuple(sorted(set(lookahead))) + next.append(Configuration.from_rule(rule, lookahead=lookahead)) + + return tuple(next) def gen_all_sets(self): """Generate all of the configuration sets for the grammar. 
@@ -1355,12 +613,11 @@ class GenerateLR1(GenerateSLR1): In LR1 parsers, we must remember to set the lookahead of the start symbol to '$'. """ - seeds = tuple( - Configuration.from_rule(self.start_symbol, rule, lookahead=(self.end_symbol,)) - for rule in self.grammar[self.start_symbol] + initial_set = self.gen_closure( + Configuration.from_rule(self.grammar[0], lookahead=('$',)), + (), ) - initial_set = self.gen_closure(seeds) - return self.gen_sets(initial_set) + return self.gen_sets(initial_set, ()) class GenerateLALR(GenerateLR1): @@ -1374,14 +631,9 @@ class GenerateLALR(GenerateLR1): it does lose information. The advantage is that the number of parser states is much much smaller in LALR than in LR(1). - If you can get away with generating LALR tables for a grammar than you - should do it. - (Note that because we use immutable state everywhere this generator does - a lot of copying and allocation. This particular generator could still - use a bunch of improvement, probably.) + a lot of copying and allocation.) """ - def merge_sets(self, config_set_a, config_set_b): """Merge the two config sets, by keeping the item cores but merging the lookahead sets for each item. 
@@ -1390,20 +642,20 @@ class GenerateLALR(GenerateLR1): merged = [] for index, a in enumerate(config_set_a): b = config_set_b[index] - assert a.clear_lookahead() == b.clear_lookahead() + assert a.replace(lookahead=()) == b.replace(lookahead=()) new_lookahead = a.lookahead + b.lookahead new_lookahead = tuple(sorted(set(new_lookahead))) - merged.append(a.clear_lookahead()) + merged.append(a.replace(lookahead=new_lookahead)) return tuple(merged) def sets_equal(self, a, b): - a_no_la = tuple(s.clear_lookahead() for s in a) - b_no_la = tuple(s.clear_lookahead() for s in b) + a_no_la = tuple(s.replace(lookahead=()) for s in a) + b_no_la = tuple(s.replace(lookahead=()) for s in b) return a_no_la == b_no_la - def gen_sets(self, config_set) -> ConfigurationSetInfo: + def gen_sets(self, config_set, F): """Recursively generate all configuration sets starting from the provided set, and merge them with the provided set 'F'. @@ -1413,331 +665,28 @@ class GenerateLALR(GenerateLR1): then instead of returning F unchanged, we merge the two equal sets and replace the set in F, returning the modified set. 
""" - F = {} - successors = [] - pending = [config_set] - while len(pending) > 0: - config_set = pending.pop() - config_set_no_la = tuple(s.clear_lookahead() for s in config_set) + config_set_no_la = tuple(s.replace(lookahead=()) for s in config_set) + for index, existing in enumerate(F): + existing_no_la = tuple(s.replace(lookahead=()) for s in existing) + if config_set_no_la == existing_no_la: + merged_set = self.merge_sets(config_set, existing) + return F[:index] + (merged_set,) + F[index+1:] - existing = F.get(config_set_no_la) - if existing is not None: - F[config_set_no_la] = self.merge_sets(config_set, existing) - else: - F[config_set_no_la] = config_set - for symbol, successor in self.gen_all_successors(config_set): - successor_no_la = tuple(s.clear_lookahead() for s in successor) - successors.append((config_set_no_la, symbol, successor_no_la)) - pending.append(successor) + # No merge candidate found, proceed. + new_F = F + (config_set,) + for successor in self.gen_all_successors(config_set): + new_F = self.gen_sets(successor, new_F) - # Register all the actually merged, final config sets. - result = ConfigurationSetInfo() - for config_set in F.values(): - result.register_config_set(config_set) + return new_F - # Now record all the successors that we found. Of course, the actual - # sets that wound up in the ConfigurationSetInfo don't match anything - # we found during the previous phase. - # - # *Fortunately* we recorded the no-lookahead keys in the successors - # so we can find the final sets, then look them up in the registered - # sets, and actually register the successor. 
- for config_set_no_la, symbol, successor_no_la in successors: - actual_config_set = F[config_set_no_la] - from_index = result.config_set_key[actual_config_set] - - actual_successor = F[successor_no_la] - to_index = result.config_set_key[actual_successor] - - result.add_successor(from_index, symbol, to_index) - - return result - - -############################################################################### -# Sugar for constructing grammars -############################################################################### -# This is the "high level" API for constructing grammars. -class Rule: - """A token (terminal), production (nonterminal), or some other - combination thereof. Rules are composed and then flattened into - productions. - """ - - def __or__(self, other) -> "Rule": - return AlternativeRule(self, other) - - def __add__(self, other) -> "Rule": - return SequenceRule(self, other) - - @abc.abstractmethod - def flatten(self) -> typing.Generator[list["str | Token"], None, None]: - """Convert this potentially nested and branching set of rules into a - series of nice, flat symbol lists. - - e.g., if this rule is (X + (A | (B + C | D))) then flattening will - yield something like: - - ["X", "A"] - ["X", "B", "C"] - ["X", "B", "D"] - - Isn't that nice? - - Note that Token rules remain unchanged in the result: this is so we - can better distinguish terminals from nonterminals while processing - the grammar. + def find_set_index(self, sets, set): + """Find the specified set in the set of sets, and return the + index, or None if it is not found. """ - raise NotImplementedError() - - -class Token(Rule): - """A token, or terminal symbol in the grammar.""" - - value: str - - def __init__(self, value): - self.value = sys.intern(value) - - def flatten(self) -> typing.Generator[list[str], None, None]: - # We are just ourselves when flattened. - yield [self] - - -class NonTerminal(Rule): - """A non-terminal, or a production, in the grammar. 
- - You probably don't want to create this directly; instead you probably want - to use the `@rule` decorator to associate this with a function in your - grammar class. - """ - - def __init__(self, fn: typing.Callable[["Grammar"], Rule], name: str | None = None): - """Create a new NonTerminal. - - `fn` is the function that will yield the `Rule` which is the - right-hand-side of this production; it will be flattened with `flatten`. - `name` is the name of the production- if unspecified (or `None`) it will - be replaced with the `__name__` of the provided fn. - """ - self.fn = fn - self.name = name or fn.__name__ - - def generate_body(self, grammar) -> list[list[str | Token]]: - """Generate the body of the non-terminal. - - We do this by first calling the associated function in order to get a - Rule, and then flattening the Rule into the associated set of - productions. - """ - return [rule for rule in self.fn(grammar).flatten()] - - def flatten(self) -> typing.Generator[list[str | Token], None, None]: - # Although we contain multitudes, when flattened we're being asked in - # the context of some other production. Yield ourselves, and trust that - # in time we will be asked to generate our body. - yield [self.name] - - -class AlternativeRule(Rule): - """A rule that matches if one or another rule matches.""" - - def __init__(self, left: Rule, right: Rule): - self.left = left - self.right = right - - def flatten(self) -> typing.Generator[list[str], None, None]: - # All the things from the left of the alternative, then all the things - # from the right, never intermingled. - yield from self.left.flatten() - yield from self.right.flatten() - - -class SequenceRule(Rule): - """A rule that matches if a first part matches, followed by a second part. - Two things in order. - """ - - def __init__(self, first: Rule, second: Rule): - self.first = first - self.second = second - - def flatten(self) -> typing.Generator[list[str], None, None]: - # All the things in the prefix.... 
- for first in self.first.flatten(): - # ...potentially followed by all the things in the suffix. - for second in self.second.flatten(): - yield first + second - - -class NothingRule(Rule): - """A rule that matches no input. Nothing, the void. Don't make a new one of - these, you're probably better off just using the singleton `Nothing`. - """ - - def flatten(self) -> typing.Generator[list[str], None, None]: - # It's quiet in here. - yield [] - - -Nothing = NothingRule() - - -def seq(*args: list[Rule]) -> Rule: - """A rule that matches a sequence of rules. - - (A helper function that combines its arguments into nested sequences.) - """ - result = args[0] - for rule in args[1:]: - result = SequenceRule(result, rule) - return result - - -@typing.overload -def rule(name: None | str = None) -> typing.Callable[[typing.Callable], Rule]: ... - - -@typing.overload -def rule(fn: typing.Callable) -> Rule: ... - - -def rule( - name_or_fn: None | str | typing.Callable = None, -) -> Rule | typing.Callable[[typing.Callable], Rule]: - """The decorator that marks a method in a Grammar object as a nonterminal - rule. - - As with all the best decorators, it can be called with or without arguments. - If called with one argument, that argument is a name that overrides the name - of the nonterminal, which defaults to the name of the function. - """ - - def _rule(callable): - return NonTerminal(callable, name) - - if callable(name_or_fn): - name = name_or_fn.__name__ - return _rule(name_or_fn) - else: - name = name_or_fn - return _rule - - -class Grammar: - """The base class for defining a grammar. - - Inherit from this, and and define members for your nonterminals, and then - use the `build_tables` method to construct the parse tables. 
- - - Here's an example of a simple grammar: - - PLUS = Token('+') - LPAREN = Token('(') - RPAREN = Token(')') - ID = Token('id') - - class SimpleGrammar(Grammar): - @rule - def expression(self): - return seq(self.expression, PLUS, self.term) | self.term - - @rule - def term(self): - return seq(LPAREN, self.expression, RPAREN) | ID - - Not very exciting, perhaps, but it's something. - """ - - def __init__(self, precedence: list[typing.Tuple[Assoc, list[Token | NonTerminal]]] = None): - if precedence is None: - precedence = getattr(self, "precedence", []) - - precedence_table = {} - for precedence, (associativity, symbols) in enumerate(precedence): - for symbol in symbols: - if isinstance(symbol, Token): - key = symbol.value - elif isinstance(symbol, NonTerminal): - key = symbol.name - else: - raise ValueError(f"{symbol} must be either a Token or a NonTerminal") - - precedence_table[key] = (associativity, precedence + 1) - - self._precedence = precedence_table - - def generate_nonterminal_dict(self, start: str) -> dict[str, list[list[str | Token]]]: - """Convert the rules into a dictionary of productions. - - Our table generators work on a very flat set of productions. This is the - first step in flattening the productions from the members: walk the rules - starting from the given start rule and flatten them, one by one, into a - dictionary that maps nonterminal rule name to its associated list of - productions. 
- """ - rules = inspect.getmembers(self, lambda x: isinstance(x, NonTerminal)) - nonterminals = {rule.name: rule for _, rule in rules} - - grammar = {} - - rule = nonterminals.get(start) - if rule is None: - raise ValueError(f"Cannot find a rule named '{start}'") - queue = [rule] - while len(queue) > 0: - rule = queue.pop() - if rule.name in grammar: - continue - - body = rule.generate_body(self) - for clause in body: - for symbol in clause: - if not isinstance(symbol, Token): - assert isinstance(symbol, str) - nonterminal = nonterminals.get(symbol) - if nonterminal is None: - raise ValueError(f"While processing {rule.name}: cannot find {symbol}") - queue.append(nonterminal) - - grammar[rule.name] = body - - return grammar - - def desugar(self, start: str) -> list[typing.Tuple[str, list[str]]]: - """Convert the rules into a flat list of productions. - - Our table generators work from a very flat set of productions. The form - produced by this function is one level flatter than the one produced by - generate_nonterminal_dict- less useful to people, probably, but it is - the input form needed by the Generator. - """ - temp_grammar = self.generate_nonterminal_dict(start) - - grammar = [] - for rule_name, clauses in temp_grammar.items(): - for clause in clauses: - new_clause = [] - for symbol in clause: - if isinstance(symbol, Token): - new_clause.append(symbol.value) - else: - new_clause.append(symbol) - - grammar.append((rule_name, new_clause)) - - return grammar - - def build_table(self, start: str, generator=GenerateLALR): - """Construct a parse table for this grammar, starting at the named - nonterminal rule. 
- """ - desugared = self.desugar(start) - - gen = generator(start, desugared, precedence=self._precedence) - table = gen.gen_table() - return table + for i, s in enumerate(sets): + if self.sets_equal(s, set): + return i + return None ############################################################################### @@ -1745,182 +694,173 @@ class Grammar: ############################################################################### def format_node(node): """Print out an indented concrete syntax tree, from parse().""" - lines = ["{name}".format(name=node[0])] + [ - " " + line for child in node[1] for line in format_node(child).split("\n") + lines = [ + '{name}'.format(name=node[0]) + ] + [ + ' ' + line + for child in node[1] + for line in format_node(child).split('\n') ] - return "\n".join(lines) + return '\n'.join(lines) def format_table(generator, table): """Format a parser table so pretty.""" - def format_action(state, terminal): - action = state.get(terminal, ("error",)) - if action[0] == "accept": - return "accept" - elif action[0] == "shift": - return "s" + str(action[1]) - elif action[0] == "error": - return "" - elif action[0] == "reduce": - return "r" + str(action[1]) + action = state.get(terminal, ('error',)) + if action[0] == 'accept': + return 'accept' + elif action[0] == 'shift': + return 's' + str(action[1]) + elif action[0] == 'error': + return '' + elif action[0] == 'reduce': + return 'r' + str(action[1]) - terminals = list(sorted(generator.alphabet[i] for i, v in enumerate(generator.terminal) if v)) - nonterminals = list( - sorted(generator.alphabet[i] for i, v in enumerate(generator.nonterminal) if v) - ) header = " | {terms} | {nts}".format( - terms=" ".join("{0: <6}".format(terminal) for terminal in terminals), - nts=" ".join("{0: <5}".format(nt) for nt in nonterminals), + terms=' '.join( + '{0: <6}'.format(terminal) + for terminal in sorted(generator.terminals) + ), + nts=' '.join( + '{0: <5}'.format(nt) + for nt in sorted(generator.nonterminals) 
+ ), ) lines = [ header, - "-" * len(header), + '-' * len(header), ] + [ "{index: <3} | {actions} | {gotos}".format( index=i, - actions=" ".join( - "{0: <6}".format(format_action(row, terminal)) for terminal in terminals + actions=' '.join( + '{0: <6}'.format(format_action(row, terminal)) + for terminal in sorted(generator.terminals) + ), + gotos=' '.join( + '{0: <5}'.format(row.get(nt, ('error', ''))[1]) + for nt in sorted(generator.nonterminals) ), - gotos=" ".join("{0: <5}".format(row.get(nt, ("error", ""))[1]) for nt in nonterminals), ) for i, row in enumerate(table) ] - return "\n".join(lines) + return '\n'.join(lines) ############################################################################### # Examples ############################################################################### -def examples(): - def dump_grammar(grammar): - for name, symbols in grammar: - print(f"{name} -> {symbols}") - print() +# OK, this is a very simple LR0 grammar. +grammar_simple = [ + ('E', ['E', '+', 'T']), + ('E', ['T']), + ('T', ['(', 'E', ')']), + ('T', ['id']), +] - # OK, this is a very simple LR0 grammar. - print("grammar_simple:") - grammar_simple = [ - ("E", ["E", "+", "T"]), - ("E", ["T"]), - ("T", ["(", "E", ")"]), - ("T", ["id"]), - ] +gen = GenerateLR0('E', grammar_simple) +table = gen.gen_table() +tree = parse(table, ['id', '+', '(', 'id', ')']) +print(format_node(tree) + "\n") +print() - gen = GenerateLR0("E", grammar_simple) +# This one doesn't work with LR0, though, it has a shift/reduce conflict. +grammar_lr0_shift_reduce = grammar_simple + [ + ('T', ['id', '[', 'E', ']']), +] +try: + gen = GenerateLR0('E', grammar_lr0_shift_reduce) table = gen.gen_table() - print(format_table(gen, table)) - tree = parse(table, ["id", "+", "(", "id", ")"]) - print(format_node(tree) + "\n") - print() + assert False +except ValueError as e: + print(e) +print() - # This one doesn't work with LR0, though, it has a shift/reduce conflict. 
- print("grammar_lr0_shift_reduce (LR0):") - grammar_lr0_shift_reduce = grammar_simple + [ - ("T", ["id", "[", "E", "]"]), - ] - try: - gen = GenerateLR0("E", grammar_lr0_shift_reduce) - table = gen.gen_table() - assert False - except ValueError as e: - print(e) - print() - - # Nor does this: it has a reduce/reduce conflict. - print("grammar_lr0_reduce_reduce (LR0):") - grammar_lr0_reduce_reduce = grammar_simple + [ - ("E", ["V", "=", "E"]), - ("V", ["id"]), - ] - try: - gen = GenerateLR0("E", grammar_lr0_reduce_reduce) - table = gen.gen_table() - assert False - except ValueError as e: - print(e) - print() - - # Nullable symbols just don't work with constructs like this, because you can't - # look ahead to figure out if you should reduce an empty 'F' or not. - print("grammar_nullable (LR0):") - grammar_nullable = [ - ("E", ["F", "boop"]), - ("F", ["beep"]), - ("F", []), - ] - try: - gen = GenerateLR0("E", grammar_nullable) - table = gen.gen_table() - assert False - except ValueError as e: - print(e) - print() - - print("grammar_lr0_shift_reduce (SLR1):") - dump_grammar(grammar_lr0_shift_reduce) - gen = GenerateSLR1("E", grammar_lr0_shift_reduce) - print(f"Follow('E'): {str([gen.alphabet[f] for f in gen.gen_follow(gen.symbol_key['E'])])}") +# Nor does this: it has a reduce/reduce conflict. +grammar_lr0_reduce_reduce = grammar_simple + [ + ('E', ['V', '=', 'E']), + ('V', ['id']), +] +try: + gen = GenerateLR0('E', grammar_lr0_reduce_reduce) table = gen.gen_table() - print(format_table(gen, table)) - tree = parse(table, ["id", "+", "(", "id", "[", "id", "]", ")"], trace=True) - print(format_node(tree) + "\n") - print() + assert False +except ValueError as e: + print(e) +print() - # SLR1 can't handle this. 
- print("grammar_aho_ullman_1 (SLR1):") - grammar_aho_ullman_1 = [ - ("S", ["L", "=", "R"]), - ("S", ["R"]), - ("L", ["*", "R"]), - ("L", ["id"]), - ("R", ["L"]), - ] - try: - gen = GenerateSLR1("S", grammar_aho_ullman_1) - table = gen.gen_table() - assert False - except ValueError as e: - print(e) - print() - - # Here's an example with a full LR1 grammar, though. - print("grammar_aho_ullman_2 (LR1):") - grammar_aho_ullman_2 = [ - ("S", ["X", "X"]), - ("X", ["a", "X"]), - ("X", ["b"]), - ] - gen = GenerateLR1("S", grammar_aho_ullman_2) +# Nullable symbols just don't work with constructs like this, because you can't +# look ahead to figure out if you should reduce an empty 'F' or not. +grammar_nullable = [ + ('E', ['F', 'boop']), + ('F', ['beep']), + ('F', []), +] +try: + gen = GenerateLR0('E', grammar_nullable) table = gen.gen_table() - print(format_table(gen, table)) - parse(table, ["b", "a", "a", "b"], trace=True) - print() + assert False +except ValueError as e: + print(e) - # What happens if we do LALR to it? - print("grammar_aho_ullman_2 (LALR):") - gen = GenerateLALR("S", grammar_aho_ullman_2) +gen = GenerateSLR1('E', grammar_lr0_shift_reduce) +print("First: {first}".format(first=str(gen.gen_first(['E'])))) +print("Follow: {follow}".format(follow=str(gen.gen_follow('E')))) +table = gen.gen_table() +print(format_table(gen, table)) +tree = parse(table, ['id', '+', '(', 'id', '[', 'id', ']', ')']) +print(format_node(tree) + "\n") +print() + +# SLR1 can't handle this. +grammar_aho_ullman_1 = [ + ('S', ['L', '=', 'R']), + ('S', ['R']), + ('L', ['*', 'R']), + ('L', ['id']), + ('R', ['L']), +] +try: + gen = GenerateSLR1('S', grammar_aho_ullman_1) table = gen.gen_table() - print(format_table(gen, table)) - print() + assert False +except ValueError as e: + print(e) +print() - # A fun LALAR grammar. 
- print("grammar_lalr:") - grammar_lalr = [ - ("S", ["V", "E"]), - ("E", ["F"]), - ("E", ["E", "+", "F"]), - ("F", ["V"]), - ("F", ["int"]), - ("F", ["(", "E", ")"]), - ("V", ["id"]), - ] - gen = GenerateLALR("S", grammar_lalr) - table = gen.gen_table() - print(format_table(gen, table)) - print() +# Here's an example with a full LR1 grammar, though. +grammar_aho_ullman_2 = [ + ('S', ['X', 'X']), + ('X', ['a', 'X']), + ('X', ['b']), +] +gen = GenerateLR1('S', grammar_aho_ullman_2) +table = gen.gen_table() +print(format_table(gen, table)) +parse(table, ['b', 'a', 'a', 'b'], trace=True) +print() +# What happens if we do LALR to it? +gen = GenerateLALR('S', grammar_aho_ullman_2) +table = gen.gen_table() +print(format_table(gen, table)) +print() -if __name__ == "__main__": - examples() +# A fun LALAR grammar. +grammar_lalr = [ + ('S', ['V', 'E']), + + ('E', ['F']), + ('E', ['E', '+', 'F']), + + ('F', ['V']), + ('F', ['int']), + ('F', ['(', 'E', ')']), + + ('V', ['id']), +] +gen = GenerateLALR('S', grammar_lalr) +table = gen.gen_table() +print(format_table(gen, table)) +print() diff --git a/pyproject.toml b/pyproject.toml deleted file mode 100644 index 7cf2884..0000000 --- a/pyproject.toml +++ /dev/null @@ -1,13 +0,0 @@ -[project] -name = "lrparsers" -descrption = "a small LR parser generator library" -authors = [ - {name = "John Doty", email = "john@d0ty.me"}, -] -classifiers = [ - "Private :: Do Not Upload", # Probably. - "License :: OSI Approved :: MIT License", -] - -[tool.black] -line-length=100 \ No newline at end of file