首页
社区
课程
招聘
[原创]入门编译原理之前端对接LLVM IR
发表于: 2023-9-5 21:30 5553

[原创]入门编译原理之前端对接LLVM IR

2023-9-5 21:30
5553

接上篇《[原创]入门编译原理之前端体验》,本篇将前端的解析结果对接到 LLVM IR。本文目标明确,就直接贴代码了。

输出结果:
图片描述

下篇将介绍如何把 IR 转成汇编语言……

import llvmlite.ir as ir
import llvmlite.binding as llvm
 
# Initialise the LLVM binding layer, the native target and its assembly
# printer before any IR is built.
# NOTE(review): newer llvmlite releases reportedly deprecate the explicit
# llvm.initialize() call - confirm against the installed version.
llvm.initialize()
llvm.initialize_native_target()
llvm.initialize_native_asmprinter()
 
# The IR module that receives every function emitted by Parser.parse().
module = ir.Module(name="my_module")
# An IR context is created here but is not referenced by the rest of the
# visible code. `function` is a placeholder that Parser.parse() rebinds to
# the generated "main" function.
context = ir.Context()
function = None
 
# Placeholder for the IRBuilder; Parser.parse() rebinds it once the entry
# block of "main" exists.
builder = None
 
# Earlier, disabled variant of the context/function setup:
# context = llvm.getGlobalContext()
# function = None
 
 
# Token: a single lexical unit produced by the Lexer.
class Token:
    """A lexical token: a category tag plus an optional payload.

    Attributes:
        type: Category string given to the constructor (for example
            "NUMBER" or "EOF").
        value: Payload associated with the token, or None when omitted.
    """

    def __init__(self, type, value=None):
        # `type` shadows the builtin, but it is part of the public signature
        # (callers may pass it by keyword), so the name is kept.
        self.type = type
        self.value = value

    def __repr__(self):
        """Return a debug representation such as ``Token('NUMBER', 3)``."""
        return f"Token({self.type!r}, {self.value!r})"
 
# Lexer: turns the input text into a stream of Token objects.
class Lexer:
    """Hand-written scanner for the small expression language.

    Emits IDENTIFIER, NUMBER, OPERATOR (+ - * /), ASSIGN, SEMICOLON, LPAREN
    and RPAREN tokens, followed by EOF once the text is exhausted.

    Attributes:
        text: The source text being scanned.
        pos: Index of the next character that has not been consumed yet.
    """

    # Single-character punctuation mapped to its token type.
    _PUNCTUATION = {
        "=": "ASSIGN",
        ";": "SEMICOLON",
        "(": "LPAREN",
        ")": "RPAREN",
    }
    _OPERATORS = "+-*/"
    # Newlines are skipped as well, so multi-line programs can be scanned.
    _WHITESPACE = " \t\r\n"

    def __init__(self, text):
        self.text = text
        self.pos = 0

    def get_next_token(self):
        """Consume and return the next token.

        Returns:
            The next Token; a Token of type "EOF" once the end of the text
            is reached (repeated calls keep returning EOF).

        Raises:
            ValueError: If the next character does not start any token.
        """
        text = self.text
        end = len(text)

        # Skip whitespace with a loop rather than recursion, so long runs of
        # blanks cannot exhaust the recursion limit.
        while self.pos < end and text[self.pos] in self._WHITESPACE:
            self.pos += 1

        if self.pos >= end:
            return Token("EOF")

        start = self.pos
        current_char = text[start]

        # Identifier: a letter followed by any number of letters/digits.
        if current_char.isalpha():
            while self.pos < end and text[self.pos].isalnum():
                self.pos += 1
            return Token("IDENTIFIER", text[start:self.pos])

        # Number: consume the whole run of digits so "12" is one token.
        # isdecimal() is used because every such character is accepted by
        # int(), unlike some characters for which isdigit() is true.
        if current_char.isdecimal():
            while self.pos < end and text[self.pos].isdecimal():
                self.pos += 1
            return Token("NUMBER", int(text[start:self.pos]))

        if current_char in self._OPERATORS:
            self.pos += 1
            return Token("OPERATOR", current_char)

        punctuation_type = self._PUNCTUATION.get(current_char)
        if punctuation_type is not None:
            self.pos += 1
            return Token(punctuation_type, current_char)

        raise ValueError("Invalid character")
 
# Parser: consumes the token stream and emits LLVM IR for each statement.
class Parser:
    """Precedence-climbing parser that emits LLVM IR while it parses.

    IR is written through the module-level ``builder``; ``parse`` creates
    the ``main`` function inside the module-level ``module`` and rebinds the
    module-level ``builder`` and ``function`` names.

    Attributes:
        lexer: Token source; must provide ``get_next_token()`` and a ``pos``
            attribute (used for one-token rewind in ``parse_statement``).
        current_token: The token currently being examined.
        variables: Maps each assigned variable name to the IR value of its
            most recent assignment.
    """

    # Binary operator precedence; a higher number binds tighter.
    _PRECEDENCE = {"+": 1, "-": 1, "*": 2, "/": 2}
    # Operator -> (IRBuilder method name, name given to the result value).
    _EMITTERS = {
        "+": ("add", "addtmp"),
        "-": ("sub", "subtmp"),
        "*": ("mul", "multmp"),
        "/": ("sdiv", "divtmp"),
    }

    def __init__(self, lexer):
        self.lexer = lexer
        self.current_token = self.lexer.get_next_token()
        self.variables = {}

    def parse(self):
        """Parse every statement and emit them into a new ``main`` function.

        The value of the last statement becomes the return value of
        ``main``; an empty program returns the i32 constant 0.

        Returns:
            A list with the IR value produced by each statement, in order.

        Raises:
            ValueError: On a syntax error or an undefined variable.
        """
        global builder, function
        results = []

        # Create `i32 main()` with a single entry block to emit into.
        main_function_type = ir.FunctionType(ir.IntType(32), ())
        function = ir.Function(module, main_function_type, name="main")
        block = function.append_basic_block(name="entry")
        builder = ir.IRBuilder(block)

        while self.current_token.type != "EOF":
            results.append(self.parse_statement())
            # Expression statements may be followed by an optional ';'.
            if self.current_token.type == "SEMICOLON":
                self.eat("SEMICOLON")

        if results:
            builder.ret(results[-1])
        else:
            # Empty input: still emit a terminator instead of failing on
            # results[-1].
            builder.ret(ir.Constant(ir.IntType(32), 0))
        return results

    def parse_expression(self, min_precedence=0):
        """Parse a binary expression and emit its IR.

        Args:
            min_precedence: Lowest operator precedence this call may consume.

        Returns:
            The IR value of the parsed expression.

        Raises:
            ValueError: On a syntax error or an operator with no IR mapping.
        """
        left = self.parse_atom()

        while (
            self.current_token.type == "OPERATOR"
            and self.precedence(self.current_token.value) >= min_precedence
        ):
            operator = self.current_token.value
            self.eat("OPERATOR")
            # Parsing the right side one level tighter makes operators of
            # equal precedence left-associative.
            right = self.parse_expression(self.precedence(operator) + 1)

            emitter = self._EMITTERS.get(operator)
            if emitter is None:
                # Fail loudly rather than reuse a stale result value.
                raise ValueError(f"Unsupported operator: {operator}")
            method_name, result_name = emitter
            # Look `builder` up at call time: parse() rebinds the global.
            left = getattr(builder, method_name)(left, right, name=result_name)

        return left

    def parse_atom(self):
        """Parse a number, a variable reference or a parenthesised expression.

        Returns:
            The IR value of the atom.

        Raises:
            ValueError: On an undefined variable or an unexpected token.
        """
        token_type = self.current_token.type

        if token_type == "NUMBER":
            value = self.current_token.value
            self.eat("NUMBER")
            return ir.Constant(ir.IntType(32), value)

        if token_type == "IDENTIFIER":
            variable_name = self.current_token.value
            self.eat("IDENTIFIER")
            if variable_name in self.variables:
                return self.variables[variable_name]
            raise ValueError(f"Undefined variable: {variable_name}")

        if token_type == "LPAREN":
            self.eat("LPAREN")
            expression = self.parse_expression()
            self.eat("RPAREN")
            return expression

        raise ValueError("Invalid syntax")

    def parse_statement(self):
        """Parse one statement: ``name = expr;`` or a bare expression.

        Returns:
            The IR value of the assigned or evaluated expression.

        Raises:
            ValueError: On a syntax error, including an assignment that is
                not terminated by ';'.
        """
        if self.current_token.type != "IDENTIFIER":
            return self.parse_expression()

        identifier_token = self.current_token
        # The lexer sits just past the identifier here; remember that spot
        # so the one-token lookahead below can be undone.
        resume_pos = self.lexer.pos
        self.eat("IDENTIFIER")

        if self.current_token.type != "ASSIGN":
            # Not an assignment (e.g. "x * 2"): rewind to the identifier and
            # parse the whole statement as an expression.
            self.lexer.pos = resume_pos
            self.current_token = identifier_token
            return self.parse_expression()

        self.eat("ASSIGN")
        expression_value = self.parse_expression()
        self.eat("SEMICOLON")
        self.variables[identifier_token.value] = expression_value
        return expression_value

    def eat(self, token_type):
        """Advance to the next token if the current one has ``token_type``.

        Raises:
            ValueError: If the current token has a different type.
        """
        if self.current_token.type != token_type:
            raise ValueError("Unexpected token")
        self.current_token = self.lexer.get_next_token()

    def precedence(self, operator):
        """Return the precedence of ``operator``, or 0 if it is unknown."""
        return self._PRECEDENCE.get(operator, 0)

    def apply_operator(self, left, operator, right):
        """Apply ``operator`` to two plain Python values.

        Not called by the IR-emitting code paths in this class; kept so
        existing callers keep working. Returns None for an unknown operator.
        """
        if operator == "+":
            return left + right
        elif operator == "-":
            return left - right
        elif operator == "*":
            return left * right
        elif operator == "/":
            return left / right
 
# calculate: compile an expression string to LLVM IR.
def calculate(expression):
    """Parse ``expression`` and return its IR values and textual LLVM IR.

    Args:
        expression: Source text, e.g. ``"x = 3 * (4-1); y = 2*x + 2;"``.

    Returns:
        A ``(results, formatted_ir)`` tuple: the list returned by
        ``Parser.parse()`` and ``str(module)``, the textual LLVM IR of the
        module.

    Raises:
        ValueError: If the expression cannot be lexed or parsed.
    """
    global module

    # Parser.parse() always adds a function named "main" to the global
    # module. Start each call from a fresh module so a second call does not
    # collide with the "main" left behind by the first.
    module = ir.Module(name="my_module")

    parser = Parser(Lexer(expression))
    results = parser.parse()

    # str() renders the in-memory IR module as LLVM assembly text.
    return results, str(module)
 
'''
# 测试代码
expression = "x = 3 * (4-1); y = 2*x + 2;"
# expression = "7"
results = calculate(expression)
print(results)  # 输出结果为 [12, 14, None]
'''
# Demo run: compile a two-statement program and print both results.
expression = "x = 3 * (4-1); y = 2*x + 2;"
llvm_ir = calculate(expression)
# calculate() returns (per-statement IR values, textual IR of the module).
ir_values, ir_text = llvm_ir
separator = "\n------------------------------------------"
print(separator)
print("LLVM IR的内部表示:\n", ir_values)
print(separator)
print("LLVM IR的内部表示格式化和排版的LLVM IR汇编代码:\n", ir_text)
import llvmlite.ir as ir
import llvmlite.binding as llvm
 
# Initialise the LLVM binding layer, the native target and its assembly
# printer before any IR is built.
# NOTE(review): newer llvmlite releases reportedly deprecate the explicit
# llvm.initialize() call - confirm against the installed version.
llvm.initialize()
llvm.initialize_native_target()
llvm.initialize_native_asmprinter()
 
# The IR module that receives every function emitted by Parser.parse().
module = ir.Module(name="my_module")
# An IR context is created here but is not referenced by the rest of the
# visible code. `function` is a placeholder that Parser.parse() rebinds to
# the generated "main" function.
context = ir.Context()
function = None
 
# Placeholder for the IRBuilder; Parser.parse() rebinds it once the entry
# block of "main" exists.
builder = None
 
# Earlier, disabled variant of the context/function setup:
# context = llvm.getGlobalContext()
# function = None
 
 
# Token: a single lexical unit produced by the Lexer.
class Token:
    """A lexical token: a category tag plus an optional payload.

    Attributes:
        type: Category string given to the constructor (for example
            "NUMBER" or "EOF").
        value: Payload associated with the token, or None when omitted.
    """

    def __init__(self, type, value=None):
        # `type` shadows the builtin, but it is part of the public signature
        # (callers may pass it by keyword), so the name is kept.
        self.type = type
        self.value = value

    def __repr__(self):
        """Return a debug representation such as ``Token('NUMBER', 3)``."""
        return f"Token({self.type!r}, {self.value!r})"
 
# Lexer: turns the input text into a stream of Token objects.
class Lexer:
    """Hand-written scanner for the small expression language.

    Emits IDENTIFIER, NUMBER, OPERATOR (+ - * /), ASSIGN, SEMICOLON, LPAREN
    and RPAREN tokens, followed by EOF once the text is exhausted.

    Attributes:
        text: The source text being scanned.
        pos: Index of the next character that has not been consumed yet.
    """

    # Single-character punctuation mapped to its token type.
    _PUNCTUATION = {
        "=": "ASSIGN",
        ";": "SEMICOLON",
        "(": "LPAREN",
        ")": "RPAREN",
    }
    _OPERATORS = "+-*/"
    # Newlines are skipped as well, so multi-line programs can be scanned.
    _WHITESPACE = " \t\r\n"

    def __init__(self, text):
        self.text = text
        self.pos = 0

    def get_next_token(self):
        """Consume and return the next token.

        Returns:
            The next Token; a Token of type "EOF" once the end of the text
            is reached (repeated calls keep returning EOF).

        Raises:
            ValueError: If the next character does not start any token.
        """
        text = self.text
        end = len(text)

        # Skip whitespace with a loop rather than recursion, so long runs of
        # blanks cannot exhaust the recursion limit.
        while self.pos < end and text[self.pos] in self._WHITESPACE:
            self.pos += 1

        if self.pos >= end:
            return Token("EOF")

        start = self.pos
        current_char = text[start]

        # Identifier: a letter followed by any number of letters/digits.
        if current_char.isalpha():
            while self.pos < end and text[self.pos].isalnum():
                self.pos += 1
            return Token("IDENTIFIER", text[start:self.pos])

        # Number: consume the whole run of digits so "12" is one token.
        # isdecimal() is used because every such character is accepted by
        # int(), unlike some characters for which isdigit() is true.
        if current_char.isdecimal():
            while self.pos < end and text[self.pos].isdecimal():
                self.pos += 1
            return Token("NUMBER", int(text[start:self.pos]))

        if current_char in self._OPERATORS:
            self.pos += 1
            return Token("OPERATOR", current_char)

        punctuation_type = self._PUNCTUATION.get(current_char)
        if punctuation_type is not None:
            self.pos += 1
            return Token(punctuation_type, current_char)

        raise ValueError("Invalid character")
 
# Parser 类负责解析令牌流并计算结果
class Parser:
    def __init__(self, lexer):
        self.lexer = lexer  # 词法分析器
        self.current_token = self.lexer.get_next_token()  # 当前令牌
        self.variables = {}  # 存储变量名和值的字典
 
    # 解析整个表达式并生成LLVM IR
    def parse(self):
        global builder, function
        results = []
         
        # 创建主函数
        main_function_type = ir.FunctionType(ir.IntType(32), ())
        function = ir.Function(module, main_function_type, name="main")
        block = function.append_basic_block(name="entry")
        builder = ir.IRBuilder(block)
 
        while self.current_token.type != "EOF":
            result = self.parse_statement()

[培训]内核驱动高级班,冲击BAT一流互联网大厂工作,每周日13:00-18:00直播授课

最后于 2023-9-5 21:37 被_THINCT编辑 ,原因:
收藏
免费 1
支持
分享
最新回复 (2)
雪    币: 2948
活跃值: (30846)
能力值: ( LV2,RANK:10 )
在线值:
发帖
回帖
粉丝
2
感谢分享
2023-9-6 09:24
1
雪    币:
能力值: ( LV1,RANK:0 )
在线值:
发帖
回帖
粉丝
3
这个代码用什么编译器写?
2024-3-7 08:15
0
游客
登录 | 注册 方可回帖
返回
//