<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
	xmlns:content="http://purl.org/rss/1.0/modules/content/"
	xmlns:wfw="http://wellformedweb.org/CommentAPI/"
	xmlns:dc="http://purl.org/dc/elements/1.1/"
	xmlns:atom="http://www.w3.org/2005/Atom"
	xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
	xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
	xmlns:georss="http://www.georss.org/georss" xmlns:geo="http://www.w3.org/2003/01/geo/wgs84_pos#" xmlns:media="http://search.yahoo.com/mrss/"
	>

<channel>
	<title>Codepoet's Blog</title>
	<atom:link href="http://code2code.wordpress.com/feed/" rel="self" type="application/rss+xml" />
	<link>http://code2code.wordpress.com</link>
	<description>Code 2 Code</description>
	<lastBuildDate>Wed, 04 Feb 2009 13:13:49 +0000</lastBuildDate>
	<language>en</language>
	<sy:updatePeriod>hourly</sy:updatePeriod>
	<sy:updateFrequency>1</sy:updateFrequency>
	<generator>http://wordpress.com/</generator>
<cloud domain='code2code.wordpress.com' port='80' path='/?rsscloud=notify' registerProcedure='' protocol='http-post' />
<image>
		<url>http://s2.wp.com/i/buttonw-com.png</url>
		<title>Codepoet's Blog</title>
		<link>http://code2code.wordpress.com</link>
	</image>
	<atom:link rel="search" type="application/opensearchdescription+xml" href="http://code2code.wordpress.com/osd.xml" title="Codepoet&#039;s Blog" />
	<atom:link rel='hub' href='http://code2code.wordpress.com/?pushpress=hub'/>
		<item>
		<title>Pure Python bindings for Horde3D</title>
		<link>http://code2code.wordpress.com/2009/02/04/pure-python-bindings-for-horde3d/</link>
		<comments>http://code2code.wordpress.com/2009/02/04/pure-python-bindings-for-horde3d/#comments</comments>
		<pubDate>Wed, 04 Feb 2009 13:13:49 +0000</pubDate>
		<dc:creator>Codepoet</dc:creator>
				<category><![CDATA[python]]></category>
		<category><![CDATA[binding]]></category>
		<category><![CDATA[horde3d]]></category>

		<guid isPermaLink="false">http://code2code.wordpress.com/?p=333</guid>
		<description><![CDATA[I&#8217;ve written new Python bindings for the 3D engine Horde3D using ctypes. No more compilation / binary modules. You can get the wrapper from the community svn. Posted in python Tagged: binding, horde3d, python<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=code2code.wordpress.com&amp;blog=5522870&amp;post=333&amp;subd=code2code&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>I&#8217;ve written new Python bindings for the 3D engine <a href="http://horde3d.org">Horde3D</a> using ctypes. No more compilation / binary modules. You can get the wrapper from the <a href="http://mm-werkstatt.informatik.uni-augsburg.de/public/Horde3D/trunk/Horde3D/Bindings/Python/">community svn</a>.</p>
<br />Posted in python Tagged: binding, horde3d, python <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/code2code.wordpress.com/333/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/code2code.wordpress.com/333/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/code2code.wordpress.com/333/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/code2code.wordpress.com/333/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/code2code.wordpress.com/333/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/code2code.wordpress.com/333/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/code2code.wordpress.com/333/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/code2code.wordpress.com/333/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/code2code.wordpress.com/333/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/code2code.wordpress.com/333/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/code2code.wordpress.com/333/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/code2code.wordpress.com/333/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/code2code.wordpress.com/333/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/code2code.wordpress.com/333/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=code2code.wordpress.com&amp;blog=5522870&amp;post=333&amp;subd=code2code&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://code2code.wordpress.com/2009/02/04/pure-python-bindings-for-horde3d/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="" medium="image">
			<media:title type="html">Codepoet</media:title>
		</media:content>
	</item>
		<item>
		<title>Exoself Progress</title>
		<link>http://code2code.wordpress.com/2009/02/02/exoself-progress-2/</link>
		<comments>http://code2code.wordpress.com/2009/02/02/exoself-progress-2/#comments</comments>
		<pubDate>Mon, 02 Feb 2009 19:28:03 +0000</pubDate>
		<dc:creator>Codepoet</dc:creator>
				<category><![CDATA[exoself]]></category>

		<guid isPermaLink="false">http://code2code.wordpress.com/?p=327</guid>
		<description><![CDATA[After my final exam at university I had some time to work again on Exoself: The compiler now supports adding debug info for structs. That means any struct can be examined at runtime using a debugger like ddd. This even works for recursive structs (linked lists etc.): The next steps are implementing the basics of [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=code2code.wordpress.com&amp;blog=5522870&amp;post=327&amp;subd=code2code&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>After my final exam at university I had some time to work again on Exoself: The compiler now supports adding debug info for structs. That means any struct can be examined at runtime using a debugger like ddd. This even works for recursive structs (linked lists etc.):</p>
<div id="attachment_328" class="wp-caption alignnone" style="width: 460px"><img class="size-full wp-image-328" title="DDD 017_structs.es" src="http://code2code.files.wordpress.com/2009/02/debugger_017structs.png?w=450&#038;h=277" alt="Debugging 017_structs.es" width="450" height="277" /><p class="wp-caption-text">Debugging 017_structs.es</p></div>
<p>The next steps are implementing the basics of the runtime and garbage collection. I hope to make some good progress before I start with my diploma thesis.</p>
<br />Posted in exoself Tagged: exoself <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/code2code.wordpress.com/327/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/code2code.wordpress.com/327/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/code2code.wordpress.com/327/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/code2code.wordpress.com/327/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/code2code.wordpress.com/327/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/code2code.wordpress.com/327/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/code2code.wordpress.com/327/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/code2code.wordpress.com/327/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/code2code.wordpress.com/327/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/code2code.wordpress.com/327/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/code2code.wordpress.com/327/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/code2code.wordpress.com/327/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/code2code.wordpress.com/327/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/code2code.wordpress.com/327/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=code2code.wordpress.com&amp;blog=5522870&amp;post=327&amp;subd=code2code&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://code2code.wordpress.com/2009/02/02/exoself-progress-2/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="" medium="image">
			<media:title type="html">Codepoet</media:title>
		</media:content>

		<media:content url="http://code2code.files.wordpress.com/2009/02/debugger_017structs.png" medium="image">
			<media:title type="html">DDD 017_structs.es</media:title>
		</media:content>
	</item>
		<item>
		<title>Exoself Progress</title>
		<link>http://code2code.wordpress.com/2009/01/11/exoself-progress/</link>
		<comments>http://code2code.wordpress.com/2009/01/11/exoself-progress/#comments</comments>
		<pubDate>Sun, 11 Jan 2009 01:03:37 +0000</pubDate>
		<dc:creator>Codepoet</dc:creator>
				<category><![CDATA[exoself]]></category>
		<category><![CDATA[compiler]]></category>

		<guid isPermaLink="false">http://code2code.wordpress.com/?p=316</guid>
		<description><![CDATA[Exoself is progressing slowly but it has gained some new features since my last post: global variables function pointers using the syntax function(typeArg1, typeArg2, typeArg3, ...) as returnType debug data for use with gdb / ddd featuring source code based debugging with &#8220;next&#8221; / &#8220;step&#8221; etc. local variable type and content can be viewed for [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=code2code.wordpress.com&amp;blog=5522870&amp;post=316&amp;subd=code2code&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>Exoself is progressing slowly but it has gained some new features since my last post:</p>
<ul>
<li>global variables</li>
<li>function pointers using the syntax <code>function(typeArg1, typeArg2, typeArg3, ...) as returnType</code></li>
<li>debug data for use with gdb / ddd featuring
<ul>
<li>source code based debugging with &#8220;next&#8221; / &#8220;step&#8221; etc.</li>
<li>local variable type and content can be viewed for the basic data types</li>
</ul>
</li>
</ul>
<p>Here&#8217;s a screenshot showing nbody.es debugged by ddd:<br />
<div id="attachment_319" class="wp-caption alignnone" style="width: 460px"><img class="size-full wp-image-319" title="Debugging nbody.es" src="http://code2code.files.wordpress.com/2009/01/debugger_nbody.png?w=450&#038;h=277" alt="Debugging nbody.es in ddd" width="450" height="277" /><p class="wp-caption-text">Debugging nbody.es in ddd</p></div></p>
<p>I&#8217;m currently working on extending the debugging data further, so that variables of derived types (especially structs) can be viewed in the debugger. After that I&#8217;ll probably start the runtime with a basic garbage collector.</p>
<br />Posted in exoself Tagged: compiler, exoself <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/code2code.wordpress.com/316/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/code2code.wordpress.com/316/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/code2code.wordpress.com/316/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/code2code.wordpress.com/316/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/code2code.wordpress.com/316/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/code2code.wordpress.com/316/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/code2code.wordpress.com/316/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/code2code.wordpress.com/316/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/code2code.wordpress.com/316/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/code2code.wordpress.com/316/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/code2code.wordpress.com/316/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/code2code.wordpress.com/316/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/code2code.wordpress.com/316/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/code2code.wordpress.com/316/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=code2code.wordpress.com&amp;blog=5522870&amp;post=316&amp;subd=code2code&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://code2code.wordpress.com/2009/01/11/exoself-progress/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="" medium="image">
			<media:title type="html">Codepoet</media:title>
		</media:content>

		<media:content url="http://code2code.files.wordpress.com/2009/01/debugger_nbody.png" medium="image">
			<media:title type="html">Debugging nbody.es</media:title>
		</media:content>
	</item>
		<item>
		<title>LLVM Benchmark</title>
		<link>http://code2code.wordpress.com/2008/12/14/llvm-benchmark/</link>
		<comments>http://code2code.wordpress.com/2008/12/14/llvm-benchmark/#comments</comments>
		<pubDate>Sun, 14 Dec 2008 22:42:34 +0000</pubDate>
		<dc:creator>Codepoet</dc:creator>
				<category><![CDATA[Uncategorized]]></category>
		<category><![CDATA[compiler]]></category>
		<category><![CDATA[llvm]]></category>
		<category><![CDATA[exoself]]></category>
		<category><![CDATA[benchmark]]></category>
		<category><![CDATA[shootout]]></category>

		<guid isPermaLink="false">http://code2code.wordpress.com/?p=310</guid>
		<description><![CDATA[I just read about a benchmark comparing LLVM and gcc: LLVM vs GCC. And the results look really good with the exception of nbody. As I&#8217;m using LLVM as a backend for Exoself I expect to get similar performance. So far I&#8217;ve only ported the nbody benchmark from The Computer Language Benchmarks Game and it [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=code2code.wordpress.com&amp;blog=5522870&amp;post=310&amp;subd=code2code&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>I just read about a benchmark comparing LLVM and gcc: <a href="http://leonardo-m.livejournal.com/73732.html">LLVM vs GCC</a>. And the results look really good with the exception of nbody.</p>
<p>As I&#8217;m using LLVM as a backend for Exoself I expect to get similar performance. So far I&#8217;ve only ported the nbody benchmark from <a href="http://shootout.alioth.debian.org">The Computer Language Benchmarks Game</a> and it was slower by a factor of 1.5 to 2 than gcc depending on the CPU.</p>
<br />Posted in Uncategorized Tagged: benchmark, compiler, exoself, llvm, shootout <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/code2code.wordpress.com/310/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/code2code.wordpress.com/310/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/code2code.wordpress.com/310/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/code2code.wordpress.com/310/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/code2code.wordpress.com/310/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/code2code.wordpress.com/310/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/code2code.wordpress.com/310/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/code2code.wordpress.com/310/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/code2code.wordpress.com/310/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/code2code.wordpress.com/310/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/code2code.wordpress.com/310/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/code2code.wordpress.com/310/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/code2code.wordpress.com/310/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/code2code.wordpress.com/310/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=code2code.wordpress.com&amp;blog=5522870&amp;post=310&amp;subd=code2code&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://code2code.wordpress.com/2008/12/14/llvm-benchmark/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="" medium="image">
			<media:title type="html">Codepoet</media:title>
		</media:content>
	</item>
		<item>
		<title>Compiler Testing</title>
		<link>http://code2code.wordpress.com/2008/12/12/compiler-testing/</link>
		<comments>http://code2code.wordpress.com/2008/12/12/compiler-testing/#comments</comments>
		<pubDate>Fri, 12 Dec 2008 16:16:52 +0000</pubDate>
		<dc:creator>Codepoet</dc:creator>
				<category><![CDATA[Compiler Writing]]></category>
		<category><![CDATA[bdd]]></category>
		<category><![CDATA[compiler]]></category>
		<category><![CDATA[correctness]]></category>
		<category><![CDATA[tdd]]></category>
		<category><![CDATA[testing]]></category>
		<category><![CDATA[unit testing]]></category>

		<guid isPermaLink="false">http://code2code.wordpress.com/?p=292</guid>
		<description><![CDATA[How do you make sure that a compiler really works? You write some kind of automated tests. But what kind of test is useful and is not too much additional work? The goal of testing is to make sure that the generated programs do what was specified in the source code. Here we need to [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=code2code.wordpress.com&amp;blog=5522870&amp;post=292&amp;subd=code2code&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>How do you make sure that a compiler really works? You write some kind of automated tests. But what kind of test is useful and is not too much additional work? </p>
<p>The goal of testing is to make sure that the generated programs do what was specified in the source code. Here we need to make sure that we don&#8217;t introduce any errors on several stages during compilation: Lexing / parsing, desugaring, type checking / AST annotation and finally code generation.</p>
<p>The ideas presented here are generally applicable to non-interactive command line programs. Even if you are writing a GUI application it <em>can</em> be a good idea to write first a shared library with a command line interface. That makes testing much easier. Later you can still write a GUI that wraps the functionality in a simple way for the end user.</p>
<h3>Test Driven Development</h3>
<p>This post will refer to TDD as writing tests first and only then writing new code. Of course in practice there are situations where you&#8217;ll not write tests or write them after you discover bugs. The strictness of this approach depends on the needs of your project and its need for correctness.</p>
<p>TDD is an iterative approach to writing code: First write some new tests which will fail due to missing functionality. Then write code until the new tests pass and finally check if all other tests still pass. To make this testing as simple as possible it&#8217;s important to have an automated way of running the tests.</p>
<p>In Exoself the tests are integrated into the build system. It&#8217;s trivially easy to either run the whole test suite or just run a few selected tests. The build system even knows when parts of the compiler implementation changed and automatically rebuilds the tests on demand.</p>
<p>Running only a subset of all tests gets more important when your project grows and the test suite takes too long to complete. Even one or two minutes might be too long to get feedback regarding changes. So often you&#8217;ll want to run the whole test suite on a dedicated machine after source code checkins. A tool like <a href="http://buildbot.net/">Buildbot</a> can automate this.</p>
<p>Of course there are many ways to write tests. I&#8217;ll present two techniques here: unit testing and behavior testing.</p>
<h3>Unit Testing</h3>
<p>This method tries to make sure that individual units of your program work correctly. Often this means a unit test per class, source file or module. Ideally it&#8217;s a very small or even elementary part of your application which you test.</p>
<p>When testing classes you write test cases for the public interface of the class, not the implementation details. When writing tests first this also defines the public interface of the class from a user perspective instead of the class implementor perspective which makes sure that the interface is usable.</p>
<p>Most classes depend on other classes which provide file / database / network access which should not be tested. To avoid this dependence often mock objects are introduced which implement the same interface as the original class, but only make sure that methods are called in the right order and with correct parameters.</p>
<p>I don&#8217;t like this way of writing tests: It&#8217;s too much additional work and too low level for me.</p>
<h3>Behavior Testing</h3>
<p>Testing the behavior means that I verify that the output of my program is correct and not any part or unit. In the case of a compiler that&#8217;s very simple: Write code in the source language, compile the program and run it. During runtime check with assert statements if runtime calculations match the expected results. Instead of runtime checking you can also of course compare the output of your compiler / program with a reference file.</p>
<p>If you&#8217;ve participated in programming competitions like ACM or TopCoder you&#8217;ll already know how it works. In these competitions you write a small program and submit it to an automated judge. Then it&#8217;s compiled and run for every test case which is a plain text input. The output of your program is then compared to a reference output and you&#8217;ll either pass or fail. The major difference to our situation is that the test inputs and expected outputs are unknown, as are the test cases which failed.</p>
<p>The behavior testing approach has several benefits compared to unit testing: The program as a whole is tested, not any unit. Of course that means when a test fails it&#8217;s more difficult to find the bug in the code but on the other hand you know that certain inputs work correctly. You don&#8217;t need to provide mock objects and it&#8217;s often less work than unit testing: Just writing programs in the source language of your compiler should be easy.</p>
<p>Making sure that compiled programs work is difficult with a new programming language: No input / output and maybe even missing control flow / assert makes it hard. But for the first tests you can always construct the AST of the source by hand and check that or use the return value of the program.</p>
<h3>Test Coverage</h3>
<p>Especially with unit testing it can be tempting to get 100% test coverage. Coverage may be defined in several ways, for example functions / statement / path coverage. Path coverage is the most interesting one to guarantee correctness but means you&#8217;ll need a really big test suite &#8211; in practice that&#8217;s either impossible or just too much work for any non trivial program. And a compiler has the added problem that there are infinitely many valid inputs which makes things not really easier&#8230;</p>
<p>Behavior testing is easier: You have some kind of definition of the input language, which yields a finite number of basic tests which define the grammar and semantics. Adding some composite tests to the test suite should provide a good enough indicator for correctness of the compiler.</p>
<p>Testing just provides a lower bound of correctness: Everything which you checked works &#8211; everything else may or may not work. You can always fool your test suite by hard coding all inputs, then all tests work but nothing else.</p>
<h3>Automatic Testing</h3>
<p>Instead of writing many behavior tests by hand, this task can also be automated to some degree. For example to test operator precedence you could write a program that generates random mathematical expressions. The generated source code consists of the expression and the expected result.</p>
<p>Of course to compute the expected result you have to duplicate the language semantics in the expression generator. That&#8217;s easy if the expression generator is written in a language that has the same semantics for these expressions. Writing this program in your new language is probably a bad idea, since you can not trust the expected values this way.</p>
<p>As I&#8217;ll most probably use Python to write the random expression generator it&#8217;s not that trivial: Even simple expressions behave differently, due to overflow of basic types like int32 or int64 which can not occur in Python. I&#8217;ll probably make the generator generic, so that I can generate code in different languages. As I expect the C / C++ implementation to be correct I&#8217;ll just use it to calculate the expected values. Exoself will often have the same semantics &#8211; at least in regard to the mathematical expressions.</p>
<p>This idea can be extended even to whole programs including control flow: <a href="http://www.cs.utah.edu/~eeide/emsoft08/">Volatile-Testing Tools</a>. This program generates random programs to check whether a C compiler handles volatiles correctly.</p>
<h3>Conclusion</h3>
<p>The Exoself test suite consists of many behavior tests that constantly prove to be invaluable. It&#8217;s so much easier to make changes, small or large, and know that everything still works. The test suite is also a replacement, at least for now, for a language specification.</p>
<p>It will be interesting to see how I can improve my test suite by adding randomly generated programs in the future. If you know about any tools in this area, I&#8217;d be interested to hear about them.</p>
<p>Read more here: <a href="http://code2code.wordpress.com/compiler-writing-series/">Compiler Writing Series</a></p>
<br />Posted in Compiler Writing Tagged: bdd, compiler, correctness, tdd, testing, unit testing <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/code2code.wordpress.com/292/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/code2code.wordpress.com/292/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/code2code.wordpress.com/292/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/code2code.wordpress.com/292/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/code2code.wordpress.com/292/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/code2code.wordpress.com/292/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/code2code.wordpress.com/292/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/code2code.wordpress.com/292/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/code2code.wordpress.com/292/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/code2code.wordpress.com/292/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/code2code.wordpress.com/292/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/code2code.wordpress.com/292/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/code2code.wordpress.com/292/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/code2code.wordpress.com/292/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=code2code.wordpress.com&amp;blog=5522870&amp;post=292&amp;subd=code2code&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://code2code.wordpress.com/2008/12/12/compiler-testing/feed/</wfw:commentRss>
		<slash:comments>2</slash:comments>
	
		<media:content url="" medium="image">
			<media:title type="html">Codepoet</media:title>
		</media:content>
	</item>
		<item>
		<title>Exoself source released</title>
		<link>http://code2code.wordpress.com/2008/12/03/exoself-source-released/</link>
		<comments>http://code2code.wordpress.com/2008/12/03/exoself-source-released/#comments</comments>
		<pubDate>Wed, 03 Dec 2008 19:20:21 +0000</pubDate>
		<dc:creator>Codepoet</dc:creator>
				<category><![CDATA[exoself]]></category>
		<category><![CDATA[python]]></category>
		<category><![CDATA[compiler]]></category>

		<guid isPermaLink="false">http://code2code.wordpress.com/?p=281</guid>
		<description><![CDATA[I&#8217;ve decided to push a copy of the Exoself code base to github. If you are interested to see how I&#8217;m implementing Exoself view it / try it out: Exoself on github. This is a technology preview without support and not an end user release. Here&#8217;s a list of features working so far: basic mathematical [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=code2code.wordpress.com&amp;blog=5522870&amp;post=281&amp;subd=code2code&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>I&#8217;ve decided to push a copy of the Exoself code base to github. If you are interested to see how I&#8217;m implementing Exoself view it / try it out: <a href="http://github.com/fnoeding/exoself/tree/master">Exoself on github</a>.</p>
<p>This is a technology preview without support and not an end user release.</p>
<p>Here&#8217;s a list of features working so far:</p>
<ul>
<li>basic mathematical operators</li>
<li>user defined functions</li>
<li>control flow</li>
<li>binary functions as operators</li>
<li>augmented assign, list assign</li>
<li>very basic module system: &#8220;from module import *&#8221;</li>
<li>type system supporting basic data types like int32 or float64, pointers and structs</li>
<li>limited function overloading</li>
<li>accessing C functions</li>
<li>ASCII raw strings (prefix: ar)</li>
<li>basic integration into the build system <a href="http://code.google.com/p/waf/">waf</a></li>
<li>local type inference</li>
</ul>
<p>Still somewhat problematic is output, since I don&#8217;t have support for variadic functions like printf. A runtime is also missing, so no garbage collection at the moment.</p>
<p>Platforms:<br />
Linux: should just work in most cases as Ubuntu is my development platform. Gentoo needs some help when compiling llvm-py. See <a href="http://code.google.com/p/llvm-py/issues/detail?id=14">llvm-py issue 14</a>.</p>
<p>Mac OS: The compiler itself seems to work. You&#8217;ll probably have to remove the hacks and runtime libraries from the build system in the root wscript file. Just remove them at the top of the file in the &#8220;dirs = &#8216;compiler runtime hacks tests&#8217;.split()&#8221; line.<br />
(if you know where to get a free or reasonably cheap developer account with shell access please write a comment / an email)</p>
<p>Windows: not tested; I have no idea how much work it will be to get it to work</p>
<h3>&nbsp;</h3>
<p>Due to an important exam I won&#8217;t have much time to work on Exoself or write blog posts in the next two weeks.</p>
<br />Posted in exoself, python Tagged: compiler, exoself <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/code2code.wordpress.com/281/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/code2code.wordpress.com/281/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/code2code.wordpress.com/281/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/code2code.wordpress.com/281/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/code2code.wordpress.com/281/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/code2code.wordpress.com/281/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/code2code.wordpress.com/281/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/code2code.wordpress.com/281/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/code2code.wordpress.com/281/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/code2code.wordpress.com/281/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/code2code.wordpress.com/281/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/code2code.wordpress.com/281/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/code2code.wordpress.com/281/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/code2code.wordpress.com/281/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=code2code.wordpress.com&amp;blog=5522870&amp;post=281&amp;subd=code2code&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://code2code.wordpress.com/2008/12/03/exoself-source-released/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="" medium="image">
			<media:title type="html">Codepoet</media:title>
		</media:content>
	</item>
		<item>
		<title>Code Generation 2</title>
		<link>http://code2code.wordpress.com/2008/12/01/code-generation-2/</link>
		<comments>http://code2code.wordpress.com/2008/12/01/code-generation-2/#comments</comments>
		<pubDate>Mon, 01 Dec 2008 17:38:17 +0000</pubDate>
		<dc:creator>Codepoet</dc:creator>
				<category><![CDATA[Compiler Writing]]></category>
		<category><![CDATA[python]]></category>
		<category><![CDATA[ast]]></category>
		<category><![CDATA[compiler]]></category>
		<category><![CDATA[llvm]]></category>
		<category><![CDATA[llvm-py]]></category>
		<category><![CDATA[mem2reg]]></category>
		<category><![CDATA[ssa]]></category>
		<category><![CDATA[variables]]></category>

		<guid isPermaLink="false">http://code2code.wordpress.com/?p=268</guid>
		<description><![CDATA[In the previous post I&#8217;ve shown how to compile mathematical expressions to executable code. In this post I&#8217;ll show how to add mutable variables to the expression compiler. Single Static Assignment LLVM uses a SSA form to represent variables. In SSA form a variable can be assigned a value only once. This makes it easier [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=code2code.wordpress.com&amp;blog=5522870&amp;post=268&amp;subd=code2code&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>In the previous post I&#8217;ve shown how to compile mathematical expressions to executable code. In this post I&#8217;ll show how to add mutable variables to the expression compiler.</p>
<h3>Static Single Assignment</h3>
<p>LLVM uses an <a href="http://en.wikipedia.org/wiki/Static_single_assignment_form">SSA</a> form to represent variables. In SSA form a variable can be assigned a value only once. This makes it easier to write optimizations, since analyzing when a variable changes is not necessary. But while generating code this is problematic: Instead of changing the value of a variable we have to add a new variable. When using control flow like if or while this gets even more complicated. We need to add PHI nodes which carry information about predecessors of basic blocks.</p>
<p>Instead of doing things manually there&#8217;s a simpler way: LLVM does not use the SSA form for memory, so instead of using normal variables we use memory to store our data. To avoid any memory management issues we&#8217;ll allocate the memory on the stack using alloca. So when the current function returns the stack is popped and the memory automatically freed.</p>
<p>The great advantage of this solution is that it&#8217;s really easy to generate code this way. But there&#8217;s a catch: Every access to a variable now involves a load from / store to memory and that&#8217;s really inefficient compared to storing data in registers. Fortunately LLVM provides an optimization pass that can promote memory accesses to register variables and inserts the necessary PHI nodes.</p>
<p>This optimization pass is called &#8220;mem2reg&#8221; and has some limitations: </p>
<ul>
<li>it only works with alloca, so heap based memory or global variables are not transformed</li>
<li>it only looks for allocas in the entry basic block of functions</li>
<li>it can only promote allocas which are used for plain stores and loads &#8211; no address taking or pointer arithmetic is allowed</li>
<li>it only works for simple data types such as scalars, vectors and pointers. In particular, structs and arrays are not supported.</li>
</ul>
<p>In summary, the most important limitation is that allocas must be placed in the entry block of a function to be promoted. Structs and arrays are not relevant for this example, but there are more powerful optimization passes which can also promote structs and arrays to registers.</p>
<h3>Extending the compiler</h3>
<p>To work with variables we&#8217;ll need an assignment operator and variable names. So I added two entries to the TreeType class: ASSIGN and VARIABLE_NAME:</p>
<pre class="brush: python;">
class TreeType(object):
    """Numeric IDs identifying the kind of an AST node."""

    # root node, its children are the statements
    MODULE = 1

    # arithmetic operators
    PLUS, MINUS, STAR, SLASH = 10, 11, 12, 13

    INTEGER_CONSTANT = 100

    # new: a variable referenced by name
    VARIABLE_NAME = 200

    # new: assignment of an expression to a variable
    ASSIGN = 300
</pre>
<p>In the ASTWalker class the dispatch function must also support these two new AST node types:</p>
<pre class="brush: python;">
        # ...
        elif ast.type == tt.INTEGER_CONSTANT:
            callee = self._onIntegerConstant
            kwargs['value'] = int(ast.text)
        elif ast.type == tt.ASSIGN:
            callee = self._onAssign
            kwargs['variable'] = ast.children[0]
            kwargs['expression'] = ast.children[1]
        elif ast.type == tt.VARIABLE_NAME:
            callee = self._onVariable
            kwargs['variable'] = ast
        else:
            assert(0 and 'dead code path')
        # ...
</pre>
<p>Since we don&#8217;t have a semantic phase in this compiler we have to manage the symbol table manually. So I&#8217;ve added it to the constructor of the CodeGen class:</p>
<pre class="brush: python;">
    def __init__(self, *k, **kw):
        """Forward all arguments to ASTWalker and start with an empty symbol table."""
        ASTWalker.__init__(self, *k, **kw)

        # symbol table, indexed by variable name
        self._symbols = {}
</pre>
<p>We also need to keep a reference to the current function accessible during an assignment to create the alloca in the entry block of the function instead of the current block. Otherwise the mem2reg pass won&#8217;t work.</p>
<pre class="brush: python;">
        # ...
        func = self._module.add_function(funcType, 'main')
        self._currentFunction = func
        # ...
</pre>
<p>Now to the more interesting part: Generating code for an assignment.</p>
<pre class="brush: python;">
    def _onAssign(self, ast, variable, expression):
        """Emit code for 'variable = expression'.

        The variable gets a stack slot in the entry block on its first assignment;
        later assignments reuse the slot stored in the symbol table. The value of
        the assignment itself (ast.llvmValue) is the value of the expression.
        """
        # generate the code for the right hand side first, its result is in expression.llvmValue afterwards
        self._dispatch(expression)

        assert(variable.type == TreeType.VARIABLE_NAME)

        varName = variable.text
        if varName not in self._symbols:
            # create a new variable
            # since we want to avoid SSA form we'll put the variable on the stack and let the optimizer do the transformation to SSA

            # alloca must be in the entry block otherwise the optimizer will not transform it
            # get current function and the entry block (the name 'entry' above was only for human readers, it's the first basic block of a function)
            llvmFunc = self._currentFunction
            entryBB = llvmFunc.get_entry_basic_block()

            # create a builder for the entry block and insert new instructions at the beginning --&gt; before the branch instruction
            # order of the allocas does not really matter
            entryBuilder = Builder.new(entryBB)
            if entryBB.instructions: # avoid segfaults if empty
                entryBuilder.position_at_beginning(entryBB)

            # reserve memory on the stack
            llvmRef = entryBuilder.alloca(Type.int(32), varName)

            # store reference to variable in symbol table
            self._symbols[varName] = llvmRef
        else:
            # use existing variable
            llvmRef = self._symbols[varName]

        # store data to memory location
        self._currentBuilder.store(expression.llvmValue, llvmRef)

        # we also have to return a value
        ast.llvmValue = expression.llvmValue
</pre>
<p>After checking whether the variable already exists, we either insert an alloca into the entry block or retrieve the reference from the symbol table, and then store the result of the expression to that memory location.</p>
<p>And finally the handler for variables inside expressions:</p>
<pre class="brush: python;">
    def _onVariable(self, ast, variable):
        varName = variable.text

        # look up symbol
        llvmRef = self._symbols[varName]

        # load value and return it
        ast.llvmValue = self._currentBuilder.load(llvmRef)
</pre>
<p>This just looks up the reference in the symbol table and then loads the data from memory.</p>
<p>That&#8217;s it. Here&#8217;s the complete program:</p>
<pre class="brush: python;">
#!/usr/bin/python
import sys

sys.path.append('/home/fnoeding/exoself/3rdparty/pylibs')

from llvm.core import *
import copy

class Tree(object):
	"""AST node consisting of a type ID, the source text and a list of child nodes."""

	def __init__(self, type, text, children=None):
		self.type = type
		self.text = text
		# None or an empty list results in a new list for this node
		# do not use a default value of children = [] in the function param list!
		self.children = children if children else []

	def copy(self, withChildren):
		"""Return a deep copy including all children or a shallow copy without any children."""
		if not withChildren:
			clone = copy.copy(self)
			clone.children = []
			return clone
		return copy.deepcopy(self)

class TreeType(object):
	"""Numeric IDs identifying the kind of an AST node."""

	# root node, its children are the statements
	MODULE = 1

	# arithmetic operators
	PLUS, MINUS, STAR, SLASH = 10, 11, 12, 13

	INTEGER_CONSTANT = 100

	# a variable referenced by name
	VARIABLE_NAME = 200

	# assignment of an expression to a variable
	ASSIGN = 300

class ASTWalker(object):
	"""Base class for traversing an AST.

	_dispatch unpacks a node depending on its type and passes the parts as keyword
	arguments to the matching _on... handler, which has to be provided by a subclass.
	"""

	def __init__(self):
		self._nodes = []

	def walkAST(self, ast):
		"""Entry point of a traversal, must be overridden by subclasses."""
		raise NotImplementedError('subclasses must implement walkAST')

	def _dispatch(self, ast):
		"""Unpack ast and call the handler responsible for its node type.

		Raises AssertionError for an unknown node type and for an operator node
		which does not have exactly one or two children.
		"""
		tt = TreeType

		kwargs = {}
		kwargs['ast'] = ast

		if ast.type == tt.MODULE:
			callee = self._onModule
			kwargs['statements'] = ast.children
		elif ast.type in [tt.PLUS, tt.MINUS, tt.STAR, tt.SLASH]:
			callee = self._onOperator

			op = ast.type
			if len(ast.children) == 1:
				arg1 = ast.children[0]
				arg2 = None
			elif len(ast.children) == 2:
				arg1 = ast.children[0]
				arg2 = ast.children[1]
			else:
				# raise explicitly: an assert is removed by 'python -O' and would leave arg1 / arg2 undefined
				raise AssertionError('dead code path: operator node needs one or two children')

			kwargs['op'] = op
			kwargs['arg1'] = arg1
			kwargs['arg2'] = arg2
		elif ast.type == tt.INTEGER_CONSTANT:
			callee = self._onIntegerConstant
			kwargs['value'] = int(ast.text)
		elif ast.type == tt.ASSIGN:
			callee = self._onAssign
			kwargs['variable'] = ast.children[0]
			kwargs['expression'] = ast.children[1]
		elif ast.type == tt.VARIABLE_NAME:
			callee = self._onVariable
			kwargs['variable'] = ast
		else:
			# raise explicitly: an assert is removed by 'python -O' and would leave callee undefined
			raise AssertionError('dead code path: unknown AST node type %r' % (ast.type,))

		callee(**kwargs)

class CodeGen(ASTWalker):
	"""AST walker which generates an LLVM module (using llvm-py) for a list of expressions with variables."""

	def __init__(self, *k, **kw):
		ASTWalker.__init__(self, *k, **kw)

		# symbol table: maps a variable name to the reference returned by alloca, see _onAssign
		self._symbols = {}

	def walkAST(self, ast):
		"""Generate code for ast and return the module created in _onModule."""
		self._dispatch(ast)

		return self._module

	def _addHelpers(self):
		"""Add a printf prototype and the function printInt to the module."""
		# you might want to skip this function on a first read. It adds essentially a function to print integers to stdout.

		# add a prototype for printf
		# int printf(char*, ...)
		funcType = Type.function(Type.int(32), [Type.pointer(Type.int(8))], True)
		printf = self._module.add_function(funcType, 'printf')

		# add a function to print integers to stdout using printf
		# void printInt(int x) { printf(&quot;%d\n&quot;, x); }
		funcType = Type.function(Type.void(), [Type.int(32)])
		printInt = self._module.add_function(funcType, 'printInt')
		self._printInt = printInt # save for later use in _onModule

		# create a block and a builder for printInt
		bb = printInt.append_basic_block('bb')
		b = Builder.new(bb)

		# create a global constant to hold the first argument of printf
		stringConst = Constant.stringz('%d\n') # zero terminated --&gt; stringz instead of string
		string = self._module.add_global_variable(stringConst.type, '__internalGlobalConst')
		string.initializer = stringConst
		string.global_constant = True
		string.linkage = LINKAGE_INTERNAL # not strictly necessary here, but this global should only be available during link time in the current module

		# address calculation
		# every index traverses a pointer without dereferencing.
		# gep (get element pointer) does only address calculation, no memory accesses!
		idx = [Constant.int(Type.int(32), 0), Constant.int(Type.int(32), 0)] # the first index gets us past the global variable (which is a pointer) to the string; the second index is the offset inside the string we want to access
		realAddr = string.gep(idx) # get real address

		# call printf
		b.call(printf, [realAddr, printInt.args[0]])
		b.ret_void()

	def _onModule(self, ast, statements):
		"""Create the module with a main function which evaluates and prints every statement."""
		# create a new LLVM module, a container for global variables and functions
		self._module = Module.new('mymodule')

		# add some helpers
		self._addHelpers()

		# add a function: 'int main()'
		funcType = Type.function(Type.int(32), [])
		func = self._module.add_function(funcType, 'main')
		self._currentFunction = func # _onAssign needs it to find the entry block

		# create an 'entry' basic block, if we want to introduce variables we'll need it anyway
		# execution of the function starts here
		entryBB = func.append_basic_block('entry')
		entryBuilder = Builder.new(entryBB)

		# normal code should be inserted into the second block
		bb = func.append_basic_block('bb')

		# also jump to this block from the entry block
		entryBuilder.branch(bb)

		# add the code of the function
		self._currentBuilder = Builder.new(bb)
		for x in statements:
			self._dispatch(x)
			self._currentBuilder.call(self._printInt, [x.llvmValue])

		# main should return 0
		self._currentBuilder.ret(Constant.int(Type.int(32), 0))

		# verify the module
		self._module.verify()

	def _onOperator(self, ast, op, arg1, arg2):
		"""Emit code for an arithmetic operator and store the result in ast.llvmValue.

		arg2 is None for unary operators. Only '+' and '-' may be unary: ValueError is
		raised for a unary '*' or '/', AssertionError for an unknown operator.
		"""
		isBinary = arg2 is not None # some operators are unary

		if not isBinary and op in (TreeType.STAR, TreeType.SLASH):
			# fail with a clear message before emitting any code instead of an AttributeError on None below
			raise ValueError('operator %r requires two operands' % (ast.text,))

		self._dispatch(arg1)
		if isBinary:
			self._dispatch(arg2)

		cb = self._currentBuilder
		if op == TreeType.PLUS:
			if isBinary:
				ast.llvmValue = cb.add(arg1.llvmValue, arg2.llvmValue)
			else:
				ast.llvmValue = arg1.llvmValue # +NUMBER == NUMBER
		elif op == TreeType.MINUS:
			if isBinary:
				ast.llvmValue = cb.sub(arg1.llvmValue, arg2.llvmValue)
			else:
				ast.llvmValue = cb.sub(Constant.int(Type.int(32), 0), arg1.llvmValue) # -NUMBER == 0 - NUMBER
		elif op == TreeType.STAR:
			ast.llvmValue = cb.mul(arg1.llvmValue, arg2.llvmValue)
		elif op == TreeType.SLASH:
			ast.llvmValue = cb.sdiv(arg1.llvmValue, arg2.llvmValue)
		else:
			# raise explicitly: an assert is removed by 'python -O' and its message was never shown
			raise AssertionError('dead code path: unknown operator %r' % (op,))

	def _onIntegerConstant(self, ast, value):
		"""Store value as a 32 bit integer constant in ast.llvmValue."""
		ast.llvmValue = Constant.int(Type.int(32), value)

	def _onAssign(self, ast, variable, expression):
		"""Emit code for 'variable = expression'.

		The variable gets a stack slot in the entry block on its first assignment;
		later assignments reuse the slot stored in the symbol table. The value of
		the assignment itself (ast.llvmValue) is the value of the expression.
		"""
		# generate the code for the right hand side first, its result is in expression.llvmValue afterwards
		self._dispatch(expression)

		assert(variable.type == TreeType.VARIABLE_NAME)

		varName = variable.text
		if varName not in self._symbols:
			# create a new variable
			# since we want to avoid SSA form we'll put the variable on the stack and let the optimizer do the transformation to SSA

			# alloca must be in the entry block otherwise the optimizer will not transform it
			# get current function and the entry block (the name 'entry' above was only for human readers, it's the first basic block of a function)
			llvmFunc = self._currentFunction
			entryBB = llvmFunc.get_entry_basic_block()

			# create a builder for the entry block and insert new instructions at the beginning --&gt; before the branch instruction
			# order of the allocas does not really matter
			entryBuilder = Builder.new(entryBB)
			if entryBB.instructions: # avoid segfaults if empty
				entryBuilder.position_at_beginning(entryBB)

			# reserve memory on the stack
			llvmRef = entryBuilder.alloca(Type.int(32), varName)

			# store reference to variable in symbol table
			self._symbols[varName] = llvmRef
		else:
			# use existing variable
			llvmRef = self._symbols[varName]

		# store data to memory location
		self._currentBuilder.store(expression.llvmValue, llvmRef)

		# we also have to return a value
		ast.llvmValue = expression.llvmValue

	def _onVariable(self, ast, variable):
		"""Load the value of a variable from its stack slot into ast.llvmValue."""
		varName = variable.text

		# look up symbol
		llvmRef = self._symbols[varName]

		# load value and return it
		ast.llvmValue = self._currentBuilder.load(llvmRef)

def createSampleAST():
	"""Build the hard coded sample AST: a module node with one child per expression."""
	tt = TreeType

	# both helpers return a new node on every call, so no node object is used twice
	def const(text):
		return Tree(tt.INTEGER_CONSTANT, text)

	def var(name):
		return Tree(tt.VARIABLE_NAME, name)

	exprs = []

	# 9 - 3 * 3
	product = Tree(tt.STAR, '*', [const('3'), const('3')])
	exprs.append(Tree(tt.MINUS, '-', [const('9'), product]))

	# 4 + 76 / 2
	quotient = Tree(tt.SLASH, '/', [const('76'), const('2')])
	exprs.append(Tree(tt.PLUS, '+', [const('4'), quotient]))

	# ---21: the constant wrapped into three unary minus nodes
	negated = const('21')
	for _ in range(3):
		negated = Tree(tt.MINUS, '-', [negated])
	exprs.append(negated)

	# a = 7 * 42
	product = Tree(tt.STAR, '*', [const('7'), const('42')])
	exprs.append(Tree(tt.ASSIGN, '=', [var('a'), product]))

	# a / 6
	exprs.append(Tree(tt.SLASH, '/', [var('a'), const('6')]))

	return Tree(tt.MODULE, '', exprs)

def main():
	"""Generate code for the sample AST and print the resulting module to stdout."""
	# get an AST, should be replaced by a lexer + parser frontend
	ast = createSampleAST()

	codegen = CodeGen()
	module = codegen.walkAST(ast)
	print(module) # with parentheses this works as print statement and as print function

	# to run the generated code do:
	#     ./minicompiler.py | llvm-as | lli
	# or
	#     ./minicompiler.py &gt; out.ll
	#     llvm-as out.ll
	#     lli out.bc
	# and to generate native code skip the lli above and then
	#     llc out.bc
	#     gcc out.s
	#     ./a.out

	# running the optimizer (only mem2reg pass) and print it human readable
	#     ./minicompiler.py | llvm-as | opt -mem2reg | llvm-dis

if __name__ == '__main__':
	main()
</pre>
<h3>Results of mem2reg</h3>
<p>Without the mem2reg pass the output should look like this:</p>
<pre class="brush: cpp;">
; ModuleID = 'mymodule'
@__internalGlobalConst = internal constant [4 x i8] c&quot;%d\0A\00&quot;         ;  [#uses=1]

declare i32 @printf(i8*, ...)

define void @printInt(i32) {
bb:
        %1 = call i32 (i8*, ...)* @printf(i8* getelementptr ([4 x i8]* @__internalGlobalConst, i32 0, i32 0), i32 %0)           ;  [#uses=0]
        ret void
}

define i32 @main() {
entry:
        %a = alloca i32         ;  [#uses=2]
        br label %bb

bb:             ; preds = %entry
        call void @printInt(i32 0)
        call void @printInt(i32 42)
        call void @printInt(i32 -21)
        store i32 294, i32* %a
        call void @printInt(i32 294)
        %0 = load i32* %a               ;  [#uses=1]
        %1 = sdiv i32 %0, 6             ;  [#uses=1]
        call void @printInt(i32 %1)
        ret i32 0
}
</pre>
<p>As you can see there&#8217;s an alloca in the entry block and store / load instructions. These should be replaced after a mem2reg pass:</p>
<pre class="brush: cpp;">
; ModuleID = '&lt;stdin&gt;'
@__internalGlobalConst = internal constant [4 x i8] c&quot;%d\0A\00&quot;         ;  [#uses=1]

declare i32 @printf(i8*, ...)

define void @printInt(i32) {
bb:
        %1 = call i32 (i8*, ...)* @printf(i8* getelementptr ([4 x i8]* @__internalGlobalConst, i32 0, i32 0), i32 %0)           ;  [#uses=0]
        ret void
}

define i32 @main() {
entry:
        br label %bb

bb:             ; preds = %entry
        call void @printInt(i32 0)
        call void @printInt(i32 42)
        call void @printInt(i32 -21)
        call void @printInt(i32 294)
        %0 = sdiv i32 294, 6            ;  [#uses=1]
        call void @printInt(i32 %0)
        ret i32 0
}
</pre>
<p>Looks better!</p>
<h3>&nbsp;</h3>
<p>In the next post I&#8217;ll show how the semantic and code generation phase in Exoself interact to solve some more advanced problems.</p>
<p>Read more here: <a href="http://code2code.wordpress.com/compiler-writing-series/">Compiler Writing Series</a></p>
<br />Posted in Compiler Writing, python Tagged: ast, compiler, llvm, llvm-py, mem2reg, ssa, variables <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/code2code.wordpress.com/268/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/code2code.wordpress.com/268/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/code2code.wordpress.com/268/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/code2code.wordpress.com/268/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/code2code.wordpress.com/268/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/code2code.wordpress.com/268/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/code2code.wordpress.com/268/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/code2code.wordpress.com/268/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/code2code.wordpress.com/268/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/code2code.wordpress.com/268/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/code2code.wordpress.com/268/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/code2code.wordpress.com/268/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/code2code.wordpress.com/268/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/code2code.wordpress.com/268/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=code2code.wordpress.com&amp;blog=5522870&amp;post=268&amp;subd=code2code&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://code2code.wordpress.com/2008/12/01/code-generation-2/feed/</wfw:commentRss>
		<slash:comments>1</slash:comments>
	
		<media:content url="" medium="image">
			<media:title type="html">Codepoet</media:title>
		</media:content>
	</item>
		<item>
		<title>Code generation</title>
		<link>http://code2code.wordpress.com/2008/11/29/code-generation/</link>
		<comments>http://code2code.wordpress.com/2008/11/29/code-generation/#comments</comments>
		<pubDate>Sat, 29 Nov 2008 10:42:40 +0000</pubDate>
		<dc:creator>Codepoet</dc:creator>
				<category><![CDATA[Compiler Writing]]></category>
		<category><![CDATA[python]]></category>
		<category><![CDATA[ast]]></category>
		<category><![CDATA[compiler]]></category>
		<category><![CDATA[llvm]]></category>
		<category><![CDATA[llvm-py]]></category>

		<guid isPermaLink="false">http://code2code.wordpress.com/?p=249</guid>
		<description><![CDATA[In this post I&#8217;ll describe the basic ideas of generating executable code using LLVM and the Python bindings llvm-py. Let&#8217;s start with some more details about walking the AST. Walking the AST In the last post I&#8217;ve already described that the AST is traversed in postorder. The concrete implementation can be done in several ways. [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=code2code.wordpress.com&amp;blog=5522870&amp;post=249&amp;subd=code2code&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>In this post I&#8217;ll describe the basic ideas of generating executable code using LLVM and the Python bindings <a href="http://mdevan.nfshost.com/llvm-py/">llvm-py</a>. Let&#8217;s start with some more details about walking the AST.</p>
<h3>Walking the AST</h3>
<p>In the last post I&#8217;ve already described that the AST is traversed in postorder. The concrete implementation can be done in several ways. One of them is the <a href="http://en.wikipedia.org/wiki/Visitor_pattern">visitor pattern</a>. Here an external class (or class hierarchy) implements &#8220;visit&#8221; methods. These are overloaded depending on the type of AST node they should process. Since overloading is not possible in this way in Python I&#8217;m using a small variation of this pattern.</p>
<p>As there is no compiler I have to do the dispatching myself using a type &#8211; method mapping. The next step is unpacking the AST nodes. This could be done by using specialized AST classes for every node type or just implementing this functionality in an AST walker class. I chose the latter.</p>
<p>The concrete idea is to call a dispatch method when you need to process a node. This dispatch method determines the right method to call, unpacks the AST and passes all needed arguments to the callee. The dispatch method also maintains a list of parent nodes. This makes accessing things like symbol tables easier and is simpler to implement than maintaining a parent attribute in each node &#8211; which would make AST transformation more difficult.</p>
<h3>A minimalistic compiler for mathematical expressions</h3>
<p>After trying to explain the code generation using my own compiler for Exoself I realized that a small self contained example would be better to get a start. So here is a compiler for simple mathematical expressions.</p>
<p>In the source code you have to manually define an AST that the code generator should transform to executable code. Every expression is evaluated and the result printed to stdout. I&#8217;ve added many comments explaining what I&#8217;m doing.</p>
<p>Since the code is a bit too wide for the current page layout click on &#8216;view plain&#8217; or copy it into your favorite text editor to read it.</p>
<pre class="brush: python;">
#!/usr/bin/python
import sys

# sys.path.append('path/to/llvm-py') # you may need to setup the path to llvm-py explicitly.

from llvm.core import *
import copy

# Tree is used for AST storage
class Tree(object):
	"""AST node consisting of a type ID, the source text and a list of child nodes."""

	def __init__(self, type, text, children=None):
		self.type = type
		self.text = text
		# None or an empty list results in a new list for this node
		# do not use a default value of children = [] in the function param list!
		self.children = children if children else []

	def copy(self, withChildren):
		"""Return a deep copy including all children or a shallow copy without any children."""
		if not withChildren:
			clone = copy.copy(self)
			clone.children = []
			return clone
		return copy.deepcopy(self)

# IDs for the different AST types
class TreeType(object):
	"""Numeric IDs identifying the kind of an AST node."""

	# root node, its children are the statements
	MODULE = 1

	# arithmetic operators
	PLUS, MINUS, STAR, SLASH = 10, 11, 12, 13

	INTEGER_CONSTANT = 100

	INTEGER_CONSTANT = 100

# base class for traversing an AST. Implements dispatch for the above defined node types
class ASTWalker(object):
	"""Base class for traversing an AST.

	_dispatch unpacks a node depending on its type and passes the parts as keyword
	arguments to the matching _on... handler, which has to be provided by a subclass.
	"""

	def __init__(self):
		pass

	def walkAST(self, ast):
		"""Entry point of a traversal, must be overridden by subclasses."""
		raise NotImplementedError('subclasses must implement walkAST')

	# dispatch the node to the right method after unpacking it
	def _dispatch(self, ast):
		"""Unpack ast and call the handler responsible for its node type.

		Raises AssertionError for an unknown node type and for an operator node
		which does not have exactly one or two children.
		"""
		tt = TreeType

		kwargs = {}
		kwargs['ast'] = ast

		if ast.type == tt.MODULE:
			callee = self._onModule
			kwargs['statements'] = ast.children
		elif ast.type in [tt.PLUS, tt.MINUS, tt.STAR, tt.SLASH]:
			callee = self._onOperator

			op = ast.type
			if len(ast.children) == 1:
				arg1 = ast.children[0]
				arg2 = None
			elif len(ast.children) == 2:
				arg1 = ast.children[0]
				arg2 = ast.children[1]
			else:
				# raise explicitly: an assert is removed by 'python -O' and would leave arg1 / arg2 undefined
				raise AssertionError('dead code path: operator node needs one or two children')

			kwargs['op'] = op
			kwargs['arg1'] = arg1
			kwargs['arg2'] = arg2
		elif ast.type == tt.INTEGER_CONSTANT:
			callee = self._onIntegerConstant
			kwargs['value'] = int(ast.text)
		else:
			# raise explicitly: an assert is removed by 'python -O' and would leave callee undefined
			raise AssertionError('dead code path: unknown AST node type %r' % (ast.type,))

		callee(**kwargs)

class CodeGen(ASTWalker):
	"""AST walker which generates an LLVM module (using llvm-py) for a list of constant expressions."""

	def __init__(self, *k, **kw):
		ASTWalker.__init__(self, *k, **kw)

	def walkAST(self, ast):
		"""Generate code for ast and return the module created in _onModule."""
		self._dispatch(ast)

		return self._module

	def _addHelpers(self):
		"""Add a printf prototype and the function printInt to the module."""
		# you might want to skip this function on a first read. It adds essentially a function to print integers to stdout.

		# add a prototype for printf
		# int printf(char*, ...)
		funcType = Type.function(Type.int(32), [Type.pointer(Type.int(8))], True)
		printf = self._module.add_function(funcType, 'printf')

		# add a function to print integers to stdout using printf
		# void printInt(int x) { printf(&quot;%d\n&quot;, x); }
		funcType = Type.function(Type.void(), [Type.int(32)])
		printInt = self._module.add_function(funcType, 'printInt')
		self._printInt = printInt # save for later use in _onModule

		# create a block and a builder for printInt
		bb = printInt.append_basic_block('bb')
		b = Builder.new(bb)

		# create a global constant to hold the first argument of printf
		stringConst = Constant.stringz('%d\n') # zero terminated --&gt; stringz instead of string
		string = self._module.add_global_variable(stringConst.type, '__internalGlobalConst')
		string.initializer = stringConst
		string.global_constant = True
		string.linkage = LINKAGE_INTERNAL # not strictly necessary here, but this global should only be available during link time in the current module

		# address calculation
		# every index traverses a pointer without dereferencing.
		# gep (get element pointer) does only address calculation, no memory accesses!
		idx = [Constant.int(Type.int(32), 0), Constant.int(Type.int(32), 0)] # the first index gets us past the global variable (which is a pointer) to the string; the second index is the offset inside the string we want to access
		realAddr = string.gep(idx) # get real address

		# call printf
		b.call(printf, [realAddr, printInt.args[0]])
		b.ret_void()

	def _onModule(self, ast, statements):
		"""Create the module with a main function which evaluates and prints every statement."""
		# create a new LLVM module, a container for global variables and functions
		self._module = Module.new('mymodule')

		# add some helpers
		self._addHelpers()

		# add a function: 'int main()'
		funcType = Type.function(Type.int(32), [])
		func = self._module.add_function(funcType, 'main')

		# create an 'entry' basic block, if we want to introduce variables we'll need it anyway
		# execution of the function starts here
		entryBB = func.append_basic_block('entry')
		entryBuilder = Builder.new(entryBB)

		# normal code should be inserted into the second block
		bb = func.append_basic_block('bb')

		# also jump to this block from the entry block
		entryBuilder.branch(bb)

		# add the code of the function
		self._currentBuilder = Builder.new(bb)
		for x in statements:
			self._dispatch(x)
			self._currentBuilder.call(self._printInt, [x.llvmValue])

		# main should return 0
		self._currentBuilder.ret(Constant.int(Type.int(32), 0))

		# verify the module
		self._module.verify()

	def _onOperator(self, ast, op, arg1, arg2):
		"""Emit code for an arithmetic operator and store the result in ast.llvmValue.

		arg2 is None for unary operators. Only '+' and '-' may be unary: ValueError is
		raised for a unary '*' or '/', AssertionError for an unknown operator.
		"""
		isBinary = arg2 is not None # some operators are unary

		if not isBinary and op in (TreeType.STAR, TreeType.SLASH):
			# fail with a clear message before emitting any code instead of an AttributeError on None below
			raise ValueError('operator %r requires two operands' % (ast.text,))

		self._dispatch(arg1)
		if isBinary:
			self._dispatch(arg2)

		cb = self._currentBuilder
		if op == TreeType.PLUS:
			if isBinary:
				ast.llvmValue = cb.add(arg1.llvmValue, arg2.llvmValue)
			else:
				ast.llvmValue = arg1.llvmValue # +NUMBER == NUMBER
		elif op == TreeType.MINUS:
			if isBinary:
				ast.llvmValue = cb.sub(arg1.llvmValue, arg2.llvmValue)
			else:
				ast.llvmValue = cb.sub(Constant.int(Type.int(32), 0), arg1.llvmValue) # -NUMBER == 0 - NUMBER
		elif op == TreeType.STAR:
			ast.llvmValue = cb.mul(arg1.llvmValue, arg2.llvmValue)
		elif op == TreeType.SLASH:
			ast.llvmValue = cb.sdiv(arg1.llvmValue, arg2.llvmValue)
		else:
			# raise explicitly: an assert is removed by 'python -O' and its message was never shown
			raise AssertionError('dead code path: unknown operator %r' % (op,))

	def _onIntegerConstant(self, ast, value):
		"""Store value as a 32 bit integer constant in ast.llvmValue."""
		ast.llvmValue = Constant.int(Type.int(32), value)

def createSampleAST():
	"""Build the hard coded sample AST: a module node with one child per expression."""
	tt = TreeType

	# returns a new node on every call, so no node object is used twice
	def const(text):
		return Tree(tt.INTEGER_CONSTANT, text)

	exprs = []

	# 9 - 3 * 3
	product = Tree(tt.STAR, '*', [const('3'), const('3')])
	exprs.append(Tree(tt.MINUS, '-', [const('9'), product]))

	# 4 + 76 / 2
	quotient = Tree(tt.SLASH, '/', [const('76'), const('2')])
	exprs.append(Tree(tt.PLUS, '+', [const('4'), quotient]))

	# ---21: the constant wrapped into three unary minus nodes
	negated = const('21')
	for _ in range(3):
		negated = Tree(tt.MINUS, '-', [negated])
	exprs.append(negated)

	return Tree(tt.MODULE, '', exprs)

def main():
	# get an AST, should be replaced by a lexer + parser frontend
	ast = createSampleAST()

	codegen = CodeGen()
	module = codegen.walkAST(ast)
	print module

	# to run the generated code do:
	#     ./minicompiler.py | llvm-as | lli
	# or
	#     ./minicompiler.py &gt; out.ll
	#     llvm-as out.ll
	#     lli out.bc
	# and to generate native code skip the lli above and then
	#     llc out.bc
	#     gcc out.s

	#     ./a.out

# run the demo only when executed as a script, not when imported as a module
if __name__ == '__main__':
	main()
</pre>
<p>So the basic idea of code generation is:</p>
<ul>
<li>dispatch node X</li>
<li>dispatch all dependencies of X</li>
<li>emit code for X</li>
<li>and if applicable store the value of the computation in X for access by parent nodes</li>
</ul>
<h3>&nbsp;</h3>
<p>When you directly look at the output of the compiler you&#8217;ll see that all expressions are evaluated even before the code is executed. Since we are using only constants LLVM uses a constant folder to optimize these operations away. There is a &#8220;nofolder&#8221; implementation available in LLVM 2.4, but I don&#8217;t know how to access it through Python or the C wrapper.</p>
<p>As you have seen this compiler consists only of a single phase: Code generation. Try to add a lexer and parser to avoid hard coding the AST and post a comment with your solution.<br />
If you don&#8217;t want to use ANTLR or try something new I&#8217;d love to hear about your experiences with <a href="http://waxeye.org">waxeye</a>.</p>
<p>Read more here: <a href="http://code2code.wordpress.com/compiler-writing-series/">Compiler Writing Series</a></p>
<br />Posted in Compiler Writing, python Tagged: ast, compiler, llvm, llvm-py, python <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/code2code.wordpress.com/249/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/code2code.wordpress.com/249/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/code2code.wordpress.com/249/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/code2code.wordpress.com/249/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/code2code.wordpress.com/249/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/code2code.wordpress.com/249/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/code2code.wordpress.com/249/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/code2code.wordpress.com/249/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/code2code.wordpress.com/249/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/code2code.wordpress.com/249/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/code2code.wordpress.com/249/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/code2code.wordpress.com/249/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/code2code.wordpress.com/249/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/code2code.wordpress.com/249/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=code2code.wordpress.com&amp;blog=5522870&amp;post=249&amp;subd=code2code&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://code2code.wordpress.com/2008/11/29/code-generation/feed/</wfw:commentRss>
		<slash:comments>2</slash:comments>
	
		<media:content url="" medium="image">
			<media:title type="html">Codepoet</media:title>
		</media:content>
	</item>
		<item>
		<title>Why Exoself</title>
		<link>http://code2code.wordpress.com/2008/11/26/why-exoself/</link>
		<comments>http://code2code.wordpress.com/2008/11/26/why-exoself/#comments</comments>
		<pubDate>Wed, 26 Nov 2008 13:07:20 +0000</pubDate>
		<dc:creator>Codepoet</dc:creator>
				<category><![CDATA[exoself]]></category>

		<guid isPermaLink="false">http://code2code.wordpress.com/?p=188</guid>
		<description><![CDATA[Why am I doing the work of adding yet another programming language to the huge pool of already working languages? As a programmer I&#8217;m a tool developer. Why not create a tool that makes programming easier for myself? In my tool set the most interesting option for me is to write a new programming language [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=code2code.wordpress.com&amp;blog=5522870&amp;post=188&amp;subd=code2code&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>Why am I doing the work of adding yet another programming language to the huge pool of already working languages? As a programmer I&#8217;m a tool developer. Why not create a tool that makes programming easier for myself? In my tool set the most interesting option for me is to write a new programming language that&#8217;s a better match for my problems.</p>
<p>I don&#8217;t want to start a flame war here. Obviously it&#8217;s how I feel about the following languages. Take it with a grain of salt, it probably sounds much worse than it really is.</p>
<h3>Existing languages</h3>
<p>The core languages of my tool set are C, C++ and Python. Python when it&#8217;s not performance critical, C++ if performance matters and for kernel modules obviously C. Some other languages I&#8217;ve used are D, Javascript, Java, PHP. I&#8217;ve tried out many other languages, to name a few: Lua, Boo, Ruby, Haskell, Erlang, io and even x86 assembler.</p>
<p>So ideally I want a language that is as easy to use as Python but with the performance of C / C++. That&#8217;s not really possible since dynamic typing incurs a certain overhead. Using a JIT would be sometimes an option, but <a href="http://psyco.sourceforge.net/">Psyco</a> is not available for x86_64. I know there are many alternatives to Psyco such as <a href="http://shed-skin.blogspot.com">Shed Skin</a> and some other tools that allow me to embed C / C++ or even annotate types for static compilation. Still no real solution for me.</p>
<p>Why don&#8217;t I just extend Python with C++? When you have to access the raw data on both sides of the language barrier that&#8217;s bad. Copying it constantly both ways is just too expensive.</p>
<p>(There are designs where this is not an issue, like the application interface of <a href="http://horde3d.org">Horde3D</a> &#8211; you should really take a look at it; but trying to split the data processing and visualization implementation across the language barrier didn&#8217;t work out for me.)</p>
<p>A language which tries to combine the best features of static and dynamic typing is <a href="http://boo.codehaus.org">Boo</a>. In theory it would be my dream language since it&#8217;s a mix of Python and C++. The main problem again was performance. One part being <a href="http://www.mono-project.com">Mono</a> and another the implementation of Boo itself.</p>
<p>Without going into too much detail I don&#8217;t like Java with its very verbose style for even simple tasks and why did they forbid operator overloading?</p>
<p>Another very interesting language is <a href="http://www.digitalmars.com/d/">D</a>. Or I probably should say languages, since D 1.0 is &#8220;stable&#8221; and D 2.0 a moving target. I tried D 1.0 and realized that there are just too many quirks with it for me. I&#8217;ll pick a few examples: You have two, yes two, <em>standard</em> libraries. And why provide a unicode character type (a builtin) when its indices are not character indices but bytes?</p>
<p>Functional languages like Haskell and Erlang don&#8217;t suit my thinking model. The so called benefits of side effect free functions and pattern matching are my major deal breaker. Instead I&#8217;ll try out D 2.0, when it&#8217;s finally stable, to work with transitive const / invariant.</p>
<p>Regarding C / C++: These are old(er) languages and I see several problems with them. The module system is far from optimal as I have to maintain headers which should really be the job of the compiler. The type system in C is too weak and C++ has template programming with unimaginable complexity. Another problem with C++ are the very long compile times even without optimization for template heavy programs. Many many more reasons &#8211; but I still feel at home when using them as C++ was my first programming language.</p>
<p>In conclusion there is no perfect language for me. I don&#8217;t know if that will ever change since Exoself will certainly have some quirks too &#8211; but I at least have the option to make it a better fit.</p>
<h3>Other reasons</h3>
<p>The short version is that I&#8217;m fascinated by how computers work. Be it the inner workings of a CPU or even how a transistor works or on a higher level what happens during a &#8220;simple&#8221; <code>open</code> call to the glibc. Ever read the code from the headers right down to the syscall implementation? Or have you written a kernel module?</p>
<p>After learning about LLVM more than a year ago I constantly followed its progress but without any concrete plans to write my own language. But that changed as you can see.</p>
<p>So my primary motivation for writing a new programming language is having some fun constructing a compiler and runtime. That involves solving some complex problems, but that&#8217;s all the more intriguing. If anyone is interested in the results of this project, even better.</p>
<h3>Choosing a name</h3>
<p>The name originates in a book I&#8217;ve read: <a href="http://gregegan.customer.netspace.net.au/DIASPORA/DIASPORA.html">Diaspora, by Greg Egan</a>. There the term Exoself describes the systems that work together to extend mind (and body) but are not necessarily part of either. A programming language should exactly do that: Extend your mind &#8211; but is not part of it in any way.</p>
<p>Another reason for this name was that you can search for it using your favorite search engine. Try that with D or any other one letter language.</p>
<h3>Design goals / buzz word compliance</h3>
<ul>
<li>ease of use, productivity</li>
<li>garbage collected by default, but opt out by simply using C malloc</li>
<li>type inferencing for variables, type information available outside of compiler</li>
<li>statically typed, stronger typed than C++</li>
<li>performance similar to C</li>
<li>fast compile times</li>
<li>C style syntax, maybe later a mode for significant whitespace</li>
<li>must be able to call C libraries directly, ideally with no overhead</li>
<li>module system</li>
<li>multi core aware; no concrete ideas so far, but many options how it could work:
<ul>
<li><a href="http://www.cilk.com">Cilk++</a></li>
<li><a href="http://www.openmp.org">OpenMP</a></li>
<li><a href="http://www.threadingbuildingblocks.org">TBB</a></li>
<li><a href="http://digitalmars.com/d/final-const-invariant.html">transitive const / invariant</a></li>
</ul>
</li>
<li>make parsing easy enough, especially for IDE developers</li>
<li>no preprocessor</li>
</ul>
<p>I have some more ideas like a <a href="http://en.wikipedia.org/wiki/REPL">REPL</a> or compile time function evaluation.<br />
Regarding the type information outside the compiler: The compiler itself should be able to provide IDEs with type information. Without incremental parsing that&#8217;s not really fast but will have to do for now. It&#8217;s better than nothing and is IDE independent.</p>
<p>These design goals enforce several limitations to what Exoself will become. As an example the ability to call C directly comes at the cost of using the same base types as C, no actor concurrency like Erlang, no accurate or moving garbage collection and many more details.</p>
<h3>Programming model</h3>
<p>The perfect language would support a &#8220;do whatever you want&#8221; style. That&#8217;s not feasible to implement, so for now Exoself will be an imperative / procedural programming language similar to C. The main idea is to get a working programming language and adding objects makes this right now more complicated than is strictly necessary.</p>
<p>I&#8217;ll definitely add objects later. Maybe after a 1.0 release, maybe before. It really depends on what I learn about language design while writing Exoself. For now I&#8217;m only planning how to add objects to the language without influencing other parts of the language too much. Essentially this means the language will consist of two seamlessly cooperating parts. One for procedural programming, the other for object oriented programming. This is similar to D&#8217;s struct and class distinction but taken a step further.</p>
<p>The idea behind this split is that raw performance is only guaranteed by the procedural part. If you need it, it&#8217;s available for you. But if execution speed does not matter that much prefer ease of use. For many tasks it&#8217;s good enough to have an Integer class that supports arbitrary integers while being slow(er) instead of having a lightning fast int / long. Just look at Python &#8211; it really works!</p>
<h3>&nbsp;</h3>
<p>The documentation for Exoself will include a detailed language specification. But writing that takes much time and is not as interesting as writing code to implement new features. For now I have only many notes where I tried out different ideas and discuss things like effects of garbage collection on the ability to call C functions&#8230;</p>
<br />Posted in exoself Tagged: exoself <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/code2code.wordpress.com/188/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/code2code.wordpress.com/188/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/code2code.wordpress.com/188/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/code2code.wordpress.com/188/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/code2code.wordpress.com/188/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/code2code.wordpress.com/188/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/code2code.wordpress.com/188/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/code2code.wordpress.com/188/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/code2code.wordpress.com/188/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/code2code.wordpress.com/188/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/code2code.wordpress.com/188/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/code2code.wordpress.com/188/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/code2code.wordpress.com/188/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/code2code.wordpress.com/188/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=code2code.wordpress.com&amp;blog=5522870&amp;post=188&amp;subd=code2code&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://code2code.wordpress.com/2008/11/26/why-exoself/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="" medium="image">
			<media:title type="html">Codepoet</media:title>
		</media:content>
	</item>
		<item>
		<title>Semantic analysis</title>
		<link>http://code2code.wordpress.com/2008/11/24/semantic-analysis/</link>
		<comments>http://code2code.wordpress.com/2008/11/24/semantic-analysis/#comments</comments>
		<pubDate>Mon, 24 Nov 2008 20:30:27 +0000</pubDate>
		<dc:creator>Codepoet</dc:creator>
				<category><![CDATA[Compiler Writing]]></category>
		<category><![CDATA[exoself]]></category>
		<category><![CDATA[python]]></category>
		<category><![CDATA[compiler]]></category>
		<category><![CDATA[ast]]></category>
		<category><![CDATA[type checking]]></category>
		<category><![CDATA[typedef]]></category>
		<category><![CDATA[alias]]></category>
		<category><![CDATA[semantic analysis]]></category>
		<category><![CDATA[type system]]></category>

		<guid isPermaLink="false">http://code2code.wordpress.com/?p=210</guid>
		<description><![CDATA[In the previous stages the Lexer and Parser rejected invalid inputs. That&#8217;s also the job of this phase: We want to find out if the program adheres to certain semantic rules &#8211; the meaning of language constructs. Let&#8217;s start simple. What is wrong with the following top level code? def f(x as int32) as int32 [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=code2code.wordpress.com&amp;blog=5522870&amp;post=210&amp;subd=code2code&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>In the previous stages the Lexer and Parser rejected invalid inputs. That&#8217;s also the job of this phase: We want to find out if the program adheres to certain semantic rules &#8211; the meaning of language constructs.</p>
<p>Let&#8217;s start simple. What is wrong with the following top level code?</p>
<pre class="brush: cpp;">
def f(x as int32) as int32
{
    if x == 0
    {
        break;
    }
    return 42;
}
</pre>
<p>What does the break statement do there? It&#8217;s not contained in a loop (or switch) so by using it in this context we are violating the rule that a break must be associated with a loop. But nevertheless this code will parse without any problems.</p>
<p>Another important job of this phase, at least in statically compiled languages, is to gather type information for the code generator. Also the type checking is implemented in this phase.</p>
<pre class="brush: cpp;">
x as int32;
x = 4.0;
</pre>
<p>While traversing the AST we see that a symbol named x is defined as an int32. After inserting this information into a symbol table we continue with the next statement.</p>
<p>Now we should assign the value 4.0, which is of type float64, to the symbol x. After a lookup in the symbol table we know that x is of type int32. Now the type system has to decide if this is a valid language construct. In Exoself this construct is not valid: There are no implicit conversions from any floating point type to an integer type. As a result the type checker will emit an error.</p>
<p>So what we have done here is walking the AST in postorder: Before evaluating any node all its children must be evaluated. In Exoself after determining the type of an AST node I add the attribute &#8220;esType&#8221; to that node. So every parent node can easily access the type information.</p>
<p>Here&#8217;s the code I&#8217;m using for the while statement. A while statement consists of its own AST node (ast), the expression to evaluate (expression) and a block to execute. The expression must be of type bool or there must exist an implicit cast to bool.</p>
<pre class="brush: python;">
    def _onWhile(self, ast, expression, block):
        ''' type checks the condition of a while statement

        The condition must be of type bool (name based comparison) or be
        implicitly castable to bool, otherwise a RecoverableCompileError is
        reported through _raiseException.
        '''
        # NOTE(review): block is not dispatched in this excerpt - confirm that is intended
        self._dispatch(expression)

        conditionType = expression.esType
        boolType = self._findSymbol(name=u'bool', type_=ESType)

        # already a bool, nothing left to check
        if conditionType.isEquivalentTo(boolType, False):
            return

        # types did not match, try implicit cast
        if estypesystem.canImplicitlyCast(conditionType, boolType):
            return

        self._raiseException(RecoverableCompileError, tree=expression, inlineText='incompatible type, expected bool')
</pre>
<p>The first thing I&#8217;m doing is calling the correct handler for the expression using dispatch. Then I&#8217;m retrieving the type from the expression AST and look up the bool type in the symbol table.</p>
<p>The next step is more interesting: I&#8217;m trying to find out if the type of the expression is identical to bool. In this case I&#8217;m doing a name based comparison instead of a structure based comparison.</p>
<p>This check will very often fail since programmers are too lazy to cast the expression manually to bool in while statements. Therefore my code tries to insert an implicit cast to bool. If that fails we raise an exception which in turn will print an error message what exactly went wrong and why.</p>
<p>An example that is a bit more difficult is the assignment statement. Let&#8217;s look at its code, too:</p>
<pre class="brush: python;">
    def _onAssignHelper(self, assigneeExpr, exprNode):
        self._dispatch(exprNode)
        esType = exprNode.esType

        # FIXME make assigneeExpr eval more general and move it to astwalker!
        if assigneeExpr.type == TreeType.VARIABLE:
            varNameNode = assigneeExpr.children[0]
            var = self._findSymbol(fromTree=varNameNode, type_=ESVariable, mayFail=True)

            if not var:
                # create new variable with type of expression
                var = ESVariable(varNameNode.text, esType)
                self._addSymbol(fromTree=varNameNode, symbol=var)
            else:
                if not var.esType.isEquivalentTo(esType, False):
                    self._insertImplicitCastNode(exprNode, var.esType)
        elif assigneeExpr.type == TreeType.DEREFERENCE:
            self._dispatch(assigneeExpr)

            if not assigneeExpr.esType.isEquivalentTo(esType, False):
                self._insertImplicitCastNode(exprNode, assigneeExpr.esType)
        else:
            print assigneeExpr.text
            raise NotImplementedError('TODO')
</pre>
<p>As you can see, still work in progress. (_onAssign, the real handler for simple assignments, is just a thin wrapper around this function to avoid code duplication)</p>
<p>I start again by determining the type of the expression. Then we have to be careful: Variables can be defined by assignment in Exoself, but this makes no sense for pointer expressions.</p>
<p>If the assignee is a variable we try to find the associated symbol, but that may fail. If the variable does not yet exist we add an entry to the symbol table with the type of the expression. As you can see type inferencing is trivial on this level and more languages should provide it. Otherwise, if the variable was already defined check if types match and if not try to insert an implicit cast. In case of the pointer expression just dispatch it to determine its type and compare it in the same manner.</p>
<h3>Type system requirements</h3>
<p>You now know how to type check &#8211; at least in principle. But how can we determine if two types are the same? Ignoring the details, there are two ways to compare types.</p>
<p>The first way is to use name based equivalence. The types of two structs are only considered equal in C if they have the same name. The contents are completely ignored:</p>
<pre class="brush: cpp;">
struct X {int a};
struct Y {int b};
struct X x;
struct Y y;
x = y;// error, x not of type struct Y
</pre>
<p>The other way to compare data types is by their structure. Would C use structural equivalence for structs instead, the above assignment would be legal. That&#8217;s quite confusing since the type system would not prevent many errors. It&#8217;s important to understand that the names of variables do not matter for structural equivalence, only their types and order in structs.</p>
<p>All primitive data types of C are compared using structural equivalence. If you introduce a typedef for int called MyInt there&#8217;s no way to enforce anyone to use the type MyInt instead of a plain int. Let&#8217;s call this an alias.</p>
<p>Regarding the type system of Exoself, which is similar to that of <a href="http://digitalmars.com/d/">D</a>, I want to use name based equivalence by default but with the option to introduce an alias. The alias statement introduces another name which is equivalent to the old one (even with name based comparison) and the typedef statement introduces a new type which is only structurally equivalent to the old one.</p>
<p>Another requirement for the type system of Exoself was to be able to represent const / invariable data, even if I&#8217;m not using that in the near future. Pointers and structs make this again a bit complicated but there&#8217;s an elegant solution to this problem.</p>
<h3>Type representation</h3>
<p>The basic idea is to represent any type by a tree.</p>
<p><code>x as int32</code><br />
<a href="http://code2code.files.wordpress.com/2008/11/typerepr_int32.png"><img src="http://code2code.files.wordpress.com/2008/11/typerepr_int32.png?w=197&#038;h=61" alt="typerepr_int32" title="typerepr_int32" width="197" height="61" class="alignnone size-full wp-image-221" /></a></p>
<p><code>x as int32*</code><br />
<a href="http://code2code.files.wordpress.com/2008/11/typerepr_int32ptr.png"><img src="http://code2code.files.wordpress.com/2008/11/typerepr_int32ptr.png?w=192&#038;h=160" alt="typerepr_int32ptr" title="typerepr_int32ptr" width="192" height="160" class="alignnone size-full wp-image-222" /></a></p>
<p><code>alias int as int32</code><br />
<a href="http://code2code.files.wordpress.com/2008/11/typerepr_int32.png"><img src="http://code2code.files.wordpress.com/2008/11/typerepr_int32.png?w=197&#038;h=61" alt="typerepr_int32" title="typerepr_int32" width="197" height="61" class="alignnone size-full wp-image-221" /></a></p>
<p><code>typedef byte as uint8</code><br />
<a href="http://code2code.files.wordpress.com/2008/11/typerepr_byte.png"><img src="http://code2code.files.wordpress.com/2008/11/typerepr_byte.png?w=197&#038;h=160" alt="typerepr_byte" title="typerepr_byte" width="197" height="160" class="alignnone size-full wp-image-220" /></a><br />
By introducing the part with the typedef I&#8217;m breaking structural equivalence to the original type.</p>
<p><code>struct X {a as int32; b as float32}</code><br />
<a href="http://code2code.files.wordpress.com/2008/11/typerepr_struct.png"><img src="http://code2code.files.wordpress.com/2008/11/typerepr_struct.png?w=421&#038;h=160" alt="typerepr_struct" title="typerepr_struct" width="421" height="160" class="alignnone size-full wp-image-223" /></a><br />
Again the struct part breaks structural equivalence and order of the fields is of course relevant.</p>
<p>So we need to define a class which can store this relationship: On a first read you might want to skip over it.</p>
<pre class="brush: python;">
class ESType(object):
	''' represents types of data, not variables!

	Every type is a tree node: payload is a (kind, data) tuple describing the
	node itself, parents is the ordered list of ESType instances it is built from.
	'''

	def __init__(self, parents, payload):
		''' do not call directly! use construction methods '''
		assert(isinstance(parents, list))
		for x in parents:
			assert(isinstance(x, ESType))
		self.parents = parents
		self.payload = payload

	def derivePointer(self):
		''' returns a new type: pointer to this type '''
		return ESType([self], ('pointer', None))

	def dereference(self):
		''' returns the type this pointer type points to '''
		assert(self.isPointer())

		# isPointer looks through typedefs, so the same must happen here:
		# otherwise a typedef of a pointer yields the pointer instead of the pointee
		p = self
		while p.payload[0] == 'typedef':
			p = p.parents[0]

		return p.parents[0]

	def deriveTypedef(self, name):
		''' returns a new type called name which is only structurally equivalent to this one '''
		# break structural equivalence
		return ESType([self], ('typedef', name))

	@staticmethod
	def createStruct(name, parts):
		''' parts: list of member types in declaration order '''
		return ESType(parts, ('struct', name))

	@staticmethod
	def createFunction(returnTypes, paramTypes):
		''' parents are the return types followed by the parameter types;
		the payload stores how many of them are return types '''
		# a function type needs at least one return type
		assert(len(returnTypes) != 0)
		parts = []
		parts.extend(returnTypes)
		parts.extend(paramTypes)
		return ESType(parts, ('function', len(returnTypes)))

	@staticmethod
	def createSelfPointer():
		''' only valid inside structs! '''
		return ESType([], ('selfpointer', None))

	def isEquivalentTo(self, other, structural):
		''' compares this type with other, returns True or False

		structural: if false all payloads and parents must match exactly (name
		based equivalence); if true typedefs are skipped and struct names ignored
		'''
		# structural equivalence: Skip any typedefs and ignore different struct names

		t1 = self
		t2 = other

		if structural:
			while t1.payload[0] == 'typedef':
				assert(len(t1.parents) == 1)
				t1 = t1.parents[0]

			while t2.payload[0] == 'typedef':
				assert(len(t2.parents) == 1)
				t2 = t2.parents[0]

		if structural and t1.payload[0] == 'struct' and t2.payload[0] == 'struct':
			pass
		elif t1.payload != t2.payload:
			return False

		if len(t1.parents) != len(t2.parents):
			return False

		# same number of parents, compare them pairwise in order
		for p1, p2 in zip(t1.parents, t2.parents):
			if not p1.isEquivalentTo(p2, structural):
				return False

		return True

	# comparison operators are disabled on purpose: the caller has to choose
	# between name based and structural equivalence
	def __eq__(self, other):
		raise NotImplementedError('use isEquivalentTo')

	def __ne__(self, other):
		raise NotImplementedError('use isEquivalentTo')

	def isPointer(self):
		''' true for pointer types and for typedefs of pointer types '''
		p = self
		while p.payload[0] == 'typedef':
			p = p.parents[0]

		return p.payload[0] == 'pointer'
</pre>
<p>Essentially I&#8217;ve defined a class that has two members. An ordered list of parents and a payload. The payload is a tuple exactly as in the images above denoting the type of the current node. The parents member is a list containing references to other ESType instances to represent the tree.</p>
<p>The deriveX methods can create new types from existing ones. By calling derivePointer on an existing instance a new instance of ESType will be created with the payload (&#8216;pointer&#8217;, None) and the parent of this new instance will be the current one. That way we can represent a pointer to an arbitrary type. That&#8217;s only almost correct as we&#8217;ll see in a moment.</p>
<p>Since structs need multiple parents I&#8217;ve added a static method to ESType to do this work for me. Just pass all elements in the order they should be defined. But what happens with structs containing pointers to themselves? That would create a loop in the tree and that&#8217;s not a good idea.</p>
<p>I&#8217;m using a trick here: I provide a special type for self pointers. These are valid only inside structs and get later translated to the real type.</p>
<p>Now checking if two types are equal is easy: Just compare the payloads and all parents recursively until something doesn&#8217;t match. Under certain circumstances you&#8217;ll still need structural equivalence. For example when assigning a variable of type byte a uint8 constant using an explicit cast. The cast will do nothing since the structures are equivalent. So while comparing just skip typedef nodes and anything containing names.</p>
<h3>more AST annotation</h3>
<p>Now we have an AST with type information but there are still some things missing. An important data structure that I already mentioned is the symbol table. In Exoself every pair of curly braces introduces a new scope. Variables defined inside a scope are not available outside a scope just as in C. Some other AST nodes like the module itself, function definitions and for loops also introduce a scope.</p>
<p>So all these AST nodes must have a symbol table. This could be as simple as a dict mapping symbol names to instances of variable / function classes. But the lookup of symbols is a bit more complicated. You have to search all symbol tables recursively.</p>
<p>My idea to solve this problem was to maintain a list of all parents of the current node. By traversing through this list starting at the end we move with every step towards the root node of the AST. Using the <code>hasattr</code> function in Python we can simply check if the current AST node has a symbol table or not. If there is one, search it for the requested symbol. If the symbol was found we can stop the search and return the result, otherwise continue.</p>
<p>Adding symbols is done by searching for the nearest symbol table. Additionally Exoself forbids shadowing of variables, so an additional search is done to check if that name is already used.</p>
<h3>&nbsp;</h3>
<p>In the next post I&#8217;ll describe the code generation phase using LLVM and llvm-py.</p>
<p>Read more here: <a href="http://code2code.wordpress.com/compiler-writing-series/">Compiler Writing Series</a></p>
<br />Posted in Compiler Writing, exoself, python Tagged: alias, ast, compiler, exoself, python, semantic analysis, type checking, type system, typedef <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/code2code.wordpress.com/210/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/code2code.wordpress.com/210/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/code2code.wordpress.com/210/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/code2code.wordpress.com/210/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/code2code.wordpress.com/210/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/code2code.wordpress.com/210/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/code2code.wordpress.com/210/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/code2code.wordpress.com/210/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/code2code.wordpress.com/210/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/code2code.wordpress.com/210/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/code2code.wordpress.com/210/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/code2code.wordpress.com/210/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/code2code.wordpress.com/210/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/code2code.wordpress.com/210/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=code2code.wordpress.com&amp;blog=5522870&amp;post=210&amp;subd=code2code&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://code2code.wordpress.com/2008/11/24/semantic-analysis/feed/</wfw:commentRss>
		<slash:comments>2</slash:comments>
	
		<media:content url="" medium="image">
			<media:title type="html">Codepoet</media:title>
		</media:content>

		<media:content url="http://code2code.files.wordpress.com/2008/11/typerepr_int32.png" medium="image">
			<media:title type="html">typerepr_int32</media:title>
		</media:content>

		<media:content url="http://code2code.files.wordpress.com/2008/11/typerepr_int32ptr.png" medium="image">
			<media:title type="html">typerepr_int32ptr</media:title>
		</media:content>

		<media:content url="http://code2code.files.wordpress.com/2008/11/typerepr_int32.png" medium="image">
			<media:title type="html">typerepr_int32</media:title>
		</media:content>

		<media:content url="http://code2code.files.wordpress.com/2008/11/typerepr_byte.png" medium="image">
			<media:title type="html">typerepr_byte</media:title>
		</media:content>

		<media:content url="http://code2code.files.wordpress.com/2008/11/typerepr_struct.png" medium="image">
			<media:title type="html">typerepr_struct</media:title>
		</media:content>
	</item>
	</channel>
</rss>
