commit a4ef374f4a674d9780c4f41f7d085b08a723d5c4 Author: James Harton Date: Thu Mar 5 14:22:23 2015 -0800 First post :+1: diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9607671 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +/_build +/deps +erl_crash.dump +*.ez diff --git a/README.md b/README.md new file mode 100644 index 0000000..cc15b27 --- /dev/null +++ b/README.md @@ -0,0 +1,36 @@ +# LexLuthor + +LexLuthor is a Lexer in Elixir (say that 10 times fast) which uses macros to generate reusable lexers. Good times. + +LexLuthor is a state-based lexer, meaning that it keeps a state stack which you can push states onto and pop states off; the stack is used to filter the applicable rules for a given state. For example: + +```elixir +defmodule StringLexer do + use LexLuthor + + defrule ~r/^'/, fn(_) -> :STRING end + defrule ~r/^[^']+/, :STRING, fn(e) -> { :string, e } end + defrule ~r/^'/, :STRING, fn(_) -> nil end +end +``` + +Rules are defined by a regular expression, an optional state (as an atom) and an action in the form of an anonymous function. + +When passed the string `'foo'`, the lexer starts in the `:default` state, so it filters for rules in the default state (the first rule, as it doesn't specify a state), then it filters the available rules by the longest matching regular expression. In this case, since we have only one rule (which happens to match) it's automatically the longest match. + +Once the longest match is found, its action is executed and the return value is matched: + - If the return value is a single atom then that atom is assumed to be a state and is pushed onto the top of the state stack. + - If the return value is a two element tuple then the first element is expected to be an atom (the token name) and the second element a value for this token. + - If the return value is `nil` then the top state is popped off the state stack. 
+ +If lexing succeeds then you will receive an `:ok` tuple with the second value being a list of `LexLuthor.Token` structs. + +If lexing fails then you will receive an `:error` tuple with a reason and position. + +## Contributing + +1. Fork it ( https://github.com/jamesotron/lex_luthor/fork ) +2. Create your feature branch (`git checkout -b my-new-feature`) +3. Commit your changes (`git commit -am 'Add some feature'`) +4. Push to the branch (`git push origin my-new-feature`) +5. Create a new Pull Request diff --git a/config/config.exs b/config/config.exs new file mode 100644 index 0000000..6dfa82f --- /dev/null +++ b/config/config.exs @@ -0,0 +1,24 @@ +# This file is responsible for configuring your application +# and its dependencies with the aid of the Mix.Config module. +use Mix.Config + +# This configuration is loaded before any dependency and is restricted +# to this project. If another project depends on this project, this +# file won't be loaded nor affect the parent project. For this reason, +# if you want to provide default values for your application for third- +# party users, it should be done in your mix.exs file. + +# Sample configuration: +# +# config :logger, :console, +# level: :info, +# format: "$date $time [$level] $metadata$message\n", +# metadata: [:user_id] + +# It is also possible to import configuration files, relative to this +# directory. For example, you can emulate configuration per environment +# by uncommenting the line below and defining dev.exs, test.exs and such. +# Configuration from the imported file will override the ones defined +# here (which is why it is important to import them last). 
+# +# import_config "#{Mix.env}.exs" diff --git a/lib/lex_luthor.ex b/lib/lex_luthor.ex new file mode 100644 index 0000000..051d1fd --- /dev/null +++ b/lib/lex_luthor.ex @@ -0,0 +1,130 @@ +defmodule LexLuthor do + + @rules [] + @action_no 0 + + defmodule State do + defstruct pos: 0, states: [nil], tokens: [] + end + + defmodule Token do + defstruct pos: 0, name: nil, value: nil + end + + defmacro __using__(_opts) do + quote do + @rules [] + import LexLuthor + @before_compile LexLuthor + end + end + + defmacro __before_compile__(_env) do + quote do + def lex string do + LexLuthor.lex __MODULE__, @rules, string + end + end + end + + defmacro defrule(regex, state, block) do + function_name = "_action_#{inspect(regex)}_#{Atom.to_string state}" |> String.to_atom + quote do + def unquote(function_name)(e) do + unquote(block).(e) + end + + @rules(@rules ++ [{ unquote(state), unquote(regex), unquote(function_name) }]) + { :ok, Enum.count(@rules) } + end + end + + defmacro defrule(regex, block) do + quote do + defrule unquote(regex), :default, unquote(block) + end + end + + def lex module, rules, string do + do_lex module, rules, string, %State{} + end + + defp do_lex module, rules, string, lexer do + [ current_state | _rest ] = lexer.states + + # Find the longest matching rule. This could + # probably be made a whole lot less enumeratey. + match = rules_for_state(rules, current_state) + |> matching_rules(string) + |> apply_matches(string) + |> longest_match_first + |> Enum.at(0) + + # Execute the matches' action. + {len, value, fun} = match + result = apply(module, fun, [value]) + + # Modify the lexer state as needed. + cond do + is_nil(result) -> + lexer = pop_state lexer + is_atom(result) -> + lexer = push_state lexer, result + { _token, _value } = result -> + lexer = push_token lexer, result + end + + # Increment lexer position + lexer = %State{ pos: lexer.pos + len, states: lexer.states, tokens: lexer.tokens } + + # Are we at the end of the string? 
+ if String.length(string) == len do + Enum.reverse lexer.tokens + else + { _ , new_string } = String.split_at string, len + do_lex module, rules, new_string, lexer + end + end + + defp push_token lexer, token do + { tname, tvalue } = token + token = %Token{ pos: lexer.pos, name: tname, value: tvalue } + %State{ pos: lexer.pos, states: lexer.states, tokens: [ token | lexer.tokens ] } + end + + defp push_state lexer, state do + %State{ pos: lexer.pos, states: [ state | lexer.states ], tokens: lexer.tokens } + end + + defp pop_state lexer do + [ _ | states ] = lexer.states + %State{ pos: lexer.pos, states: states, tokens: lexer.tokens } + end + + defp rules_for_state rules, state do + Enum.filter rules, fn({rule_state,_,_})-> + if is_nil(state) do + state = :default + end + state == rule_state + end + end + + defp matching_rules rules, string do + Enum.filter rules, fn({_,regex,_})-> + Regex.match?(regex, string) + end + end + + defp apply_matches rules, string do + Enum.map rules, fn({_,regex,fun})-> + [match] = Regex.run(regex,string, capture: :first) + { String.length(match), match, fun } + end + end + + defp longest_match_first matches do + Enum.sort_by matches, fn({len,_,_})-> len end, &>=/2 + end + +end diff --git a/mix.exs b/mix.exs new file mode 100644 index 0000000..1f4e257 --- /dev/null +++ b/mix.exs @@ -0,0 +1,30 @@ +defmodule LexLuthor.Mixfile do + use Mix.Project + + def project do + [app: :lex_luthor, + version: "0.0.1", + elixir: "~> 1.0", + deps: deps] + end + + # Configuration for the OTP application + # + # Type `mix help compile.app` for more information + def application do + [applications: [:logger]] + end + + # Dependencies can be Hex packages: + # + # {:mydep, "~> 0.3.0"} + # + # Or git/path repositories: + # + # {:mydep, git: "https://github.com/elixir-lang/mydep.git", tag: "0.1.0"} + # + # Type `mix help deps` for more examples and options + defp deps do + [] + end +end diff --git a/test/lex_luthor_test.exs b/test/lex_luthor_test.exs new file 
mode 100644 index 0000000..f11bc85 --- /dev/null +++ b/test/lex_luthor_test.exs @@ -0,0 +1,33 @@ +defmodule LexLuthorTest do + import TestHelpers + use ExUnit.Case, async: true + + @tests [ + { "''", generate_token(:simple_string, "") }, + { "'hello'", generate_token(:simple_string, "hello") }, + { "\"\"", generate_token(:string, "") }, + { "\"hello\"", generate_token(:string, "hello") }, + { "0", generate_token(:integer, 0) }, + { "123", generate_token(:integer, 123) }, + { "0x123", generate_token(:integer, 291) }, + { "0b1011", generate_token(:integer, 11) }, + { "0.0", generate_token(:float, 0.0) }, + { "123.456", generate_token(:float, 123.456) } + ] + + Enum.each @tests, fn + { source, token } -> + tname = Map.get token, :name + tvalue = Map.get token, :value + + test "String #{inspect(source)} results in token #{inspect(token)}" do + result = Enum.at(ExampleLexer.lex(unquote(source)), 0) + + rname = Map.get result, :name + rvalue = Map.get result, :value + assert rname == unquote(tname) + assert rvalue == unquote(tvalue) + end + end + +end diff --git a/test/test_helper.exs b/test/test_helper.exs new file mode 100644 index 0000000..582c47e --- /dev/null +++ b/test/test_helper.exs @@ -0,0 +1,46 @@ +defmodule TestHelpers do + def generate_token(name, value) do + %LexLuthor.Token{name: name, value: value} + end +end + +defmodule ExampleLexer do + use LexLuthor + + # single tick strings + defrule ~r/^''/, fn(_) -> { :simple_string, "" } end + defrule ~r/^'/, fn(_) -> :simple_string end + defrule ~r/^[^']+/, :simple_string, fn(e) -> { :simple_string, e } end + defrule ~r/^'/, :simple_string, fn(_) -> nil end + + # double tick strings + defrule ~r/^""/, fn(_) -> { :string, "" } end + defrule ~r/^"/, fn(_) -> :string end + defrule ~R/^#{/, :string, fn(_) -> :default end + defrule ~R/^}/, :default, fn(_) -> nil end + defrule ~R/^[^("|#{)]+/, :string, fn(e) -> { :string, e } end + defrule ~r/^"/, :string, fn(_) -> nil end + + # floats + defrule ~r/^[0-9]+\.[0-9]+/, 
fn(e) -> { :float, String.to_float(e) } end + + # integers + defrule ~r/^0x[0-9a-fA-F]+/, fn(e) -> + [ _ | i ] = String.split e, "x" + { :integer, String.to_integer(Enum.at(i, 0), 16) } + end + defrule ~r/^0b[01]+/, fn(e) -> + [ _ | i ] = String.split e, "b" + { :integer, String.to_integer(Enum.at(i, 0), 2) } + end + defrule ~r/^[1-9][0-9]*/, fn(e) -> { :integer, String.to_integer(e) } end + defrule ~r/^0/, fn(_) -> { :integer, 0 } end + + # white space + defrule ~r/^[ \t]+/, fn(e) -> { :ws, String.length(e) } end + defrule ~r/^\r\n/, fn(_) -> { :nl, 1 } end + defrule ~r/^\r/, fn(_) -> { :nl, 1 } end + defrule ~r/^\n/, fn(_) -> { :nl, 1 } end +end + +ExUnit.start()