Documentation, test and readability improvements.

* Split out different concerns of LexLuthor into different modules.
* Add `inch` and documentation.
* Add `credo` and fix warnings.

This commit is contained in:
parent 51c691d8d3
commit 1b964adac3
lib/lex_luthor.ex
@@ -1,11 +1,33 @@
 defmodule LexLuthor do
-  defmodule State do
-    defstruct pos: 0, line: 1, column: 0, states: [nil], tokens: []
-  end
-
-  defmodule Token do
-    defstruct pos: 0, line: 1, column: 0, name: nil, value: nil
-  end
+  alias LexLuthor.Runner
+
+  @moduledoc """
+  LexLuthor is a lexer in Elixir (say that 10 times fast) which uses macros to generate reusable lexers. Good times.
+
+  LexLuthor is a state-based lexer, meaning that it keeps a stack of states which you can push states onto and pop states off; the current state is used to filter the applicable rules. For example:
+
+      iex> defmodule StringLexer do
+      ...>   use LexLuthor
+      ...>   defrule ~r/^'/, fn(_) -> :STRING end
+      ...>   defrule ~r/^[^']+/, :STRING, fn(e) -> { :string, e } end
+      ...>   defrule ~r/^'/, :STRING, fn(_) -> nil end
+      ...> end
+      ...> StringLexer.lex("'foo'")
+      {:ok, [%LexLuthor.Token{column: 1, line: 1, name: :string, pos: 1, value: "foo"}]}
+
+  Rules are defined by a regular expression, an optional state (as an atom) and an action in the form of an anonymous function.
+
+  When passed the string `'foo'`, the lexer starts in the `:default` state, so it filters for rules in the default state (the first rule, as it doesn't specify a state), then it narrows the matching rules to the one whose regular expression produces the longest match. In this case, since we have only one rule (which happens to match), it's automatically the longest match.
+
+  Once the longest match is found, its action is executed and the return value is matched:
+
+  - If the return value is a single atom, that atom is assumed to be a state and is pushed onto the top of the state stack.
+  - If the return value is a two-element tuple, the first element is expected to be an atom (the token name) and the second element a value for this token.
+  - If the return value is `nil`, the top state is popped off the state stack.
+
+  If lexing succeeds you will receive an `:ok` tuple whose second element is a list of `LexLuthor.Token` structs.
+
+  If lexing fails you will receive an `:error` tuple with a reason and position.
+  """

   defmacro __using__(_opts) do
     quote do
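
To make the push/pop mechanics above concrete, here is a minimal sketch of a lexer with a nested state, assuming only the API documented in the moduledoc (the `CommentLexer` module and its rules are hypothetical, not part of this commit):

    defmodule CommentLexer do
      use LexLuthor

      # "--" pushes the :COMMENT state onto the state stack.
      defrule ~r/^--/, fn(_) -> :COMMENT end
      # While in :COMMENT, everything up to a newline becomes a token.
      defrule ~r/^[^\n]+/, :COMMENT, fn(text) -> {:comment, text} end
      # The newline action returns nil, popping :COMMENT off the stack.
      defrule ~r/^\n/, :COMMENT, fn(_) -> nil end
    end

Lexing `"--hello\n"` would push `:COMMENT`, emit a single `:comment` token for `"hello"`, then pop back to the default state on the newline.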
@@ -19,164 +41,46 @@ defmodule LexLuthor do
   defmacro __before_compile__(_env) do
     quote do
       def lex string do
-        LexLuthor.lex __MODULE__, @rules, string
+        Runner.lex __MODULE__, @rules, string
       end
     end
   end

-  defmacro defrule(regex, state, block) do
+  @doc """
+  Define a lexing rule for a specific state.
+
+  - `regex` a regular expression for matching against the input string.
+  - `state` the lexer state in which this rule applies.
+  - `action` the function to execute when this rule is applied.
+  """
+  @spec defrule(Regex.t, atom, (String.t -> atom | nil | {atom, any})) :: {:ok, non_neg_integer}
+  defmacro defrule(regex, state, action) do
     quote do
       @action_counter(@action_counter + 1)
       action_name = "_action_#{@action_counter}" |> String.to_atom
-      block = unquote(Macro.escape(block))
+      action = unquote(Macro.escape(action))

       defaction = quote do
         def unquote(Macro.escape(action_name))(e) do
-          unquote(block).(e)
+          unquote(action).(e)
         end
       end
       Module.eval_quoted __MODULE__, defaction

-      @rules(@rules ++ [{ unquote(state), unquote(regex), action_name }])
-      { :ok, Enum.count(@rules) }
+      @rules(@rules ++ [{unquote(state), unquote(regex), action_name}])
+      {:ok, Enum.count(@rules)}
     end
   end

-  defmacro defrule(regex, block) do
+  @doc """
+  Define a lexing rule applicable to the default state.
+
+  - `regex` a regular expression for matching against the input string.
+  - `action` the function to execute when this rule is applied.
+  """
+  defmacro defrule(regex, action) do
     quote do
-      defrule unquote(regex), :default, unquote(block)
+      defrule unquote(regex), :default, unquote(action)
     end
   end
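
As the new docs say, `defrule/2` simply delegates to `defrule/3` with the `:default` state, so the following two definitions register the same rule (illustrative sketch, inside a module that has `use LexLuthor`):

    defrule ~r/^[0-9]+/, fn(e) -> {:integer, e} end
    defrule ~r/^[0-9]+/, :default, fn(e) -> {:integer, e} end

Each `defrule` call returns `{:ok, n}`, where `n` is the number of rules defined so far.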
-
-  def lex module, rules, string do
-    do_lex module, rules, string, %State{}
-  end
-
-  defp do_lex module, rules, string, lexer do
-    [ current_state | _rest ] = lexer.states
-
-    # Find the longest matching rule. This could
-    # probably be made a whole lot less enumeratey.
-    matches = rules_for_state(rules, current_state)
-      |> matching_rules(string)
-      |> apply_matches(string)
-      |> longest_match_first
-
-    process_matches module, rules, matches, string, lexer, Enum.count(matches)
-  end
-
-  defp process_matches(_, _, _, string, _, count) when count == 0 do
-    { :error, "String not in language: #{inspect string}"}
-  end
-
-  defp process_matches(module, rules, matches, string, lexer, count) when count > 0 do
-    match = Enum.at matches, 0
-
-    # Execute the matches' action.
-    {len, value, fun} = match
-    result = apply(module, fun, [value])
-
-    lexer = process_result result, lexer
-
-    case lexer do
-      { :error, _ } ->
-        lexer
-      _ ->
-        fragment = String.slice string, 0, len
-        line = lexer.line + line_number_incrementor fragment
-        column = column_number lexer, fragment
-
-        lexer = Map.merge lexer, %{pos: lexer.pos + len, line: line, column: column}
-
-        # Are we at the end of the string?
-        if String.length(string) == len do
-          { :ok, Enum.reverse lexer.tokens }
-        else
-          { _ , new_string } = String.split_at string, len
-          do_lex module, rules, new_string, lexer
-        end
-    end
-  end
-
-  defp column_number lexer, match do
-    case Regex.match? ~r/[\r\n]/, match do
-      true ->
-        len = match |> split_on_newlines |> List.last |> String.length
-        case len do
-          0 -> 1
-          _ -> len
-        end
-      false ->
-        lexer.column + String.length match
-    end
-  end
-
-  defp line_number_incrementor match do
-    (match |> split_on_newlines |> Enum.count) - 1
-  end
-
-  def split_on_newlines string do
-    string |> String.split(~r{(\r|\n|\r\n)})
-  end
-
-  defp process_result(result, lexer) when is_nil(result) do
-    pop_state lexer
-  end
-
-  defp process_result(result, lexer) when is_atom(result) do
-    push_state lexer, result
-  end
-
-  defp process_result(result, lexer) when is_tuple(result) do
-    push_token lexer, result
-  end
-
-  defp process_result result, _ do
-    { :error, "Invalid result from action: #{inspect result}"}
-  end
-
-  defp push_token lexer, token do
-    { tname, tvalue } = token
-    token = %Token{ pos: lexer.pos, line: lexer.line, column: lexer.column, name: tname, value: tvalue }
-    Map.merge lexer, %{tokens: [token | lexer.tokens ]}
-  end
-
-  defp push_state lexer, state do
-    Map.merge lexer, %{states: [state | lexer.states ]}
-  end
-
-  defp pop_state lexer do
-    [ _ | states ] = lexer.states
-    Map.merge lexer, %{states: states}
-  end
-
-  defp rules_for_state rules, state do
-    Enum.filter rules, fn({rule_state,_,_})->
-      state = if is_nil(state) do
-        :default
-      else
-        state
-      end
-      state == rule_state
-    end
-  end
-
-  defp matching_rules rules, string do
-    Enum.filter rules, fn({_,regex,_})->
-      Regex.match?(regex, string)
-    end
-  end
-
-  defp apply_matches rules, string do
-    Enum.map rules, fn({_,regex,fun})->
-      [match] = Regex.run(regex,string, capture: :first)
-      { String.length(match), match, fun }
-    end
-  end
-
-  defp longest_match_first matches do
-    Enum.sort_by matches, fn({len,_,_})-> len end, &>=/2
-  end
 end

156 lib/lex_luthor/runner.ex Normal file
@@ -0,0 +1,156 @@
+defmodule LexLuthor.Runner do
+  alias LexLuthor.{State, Token}
+
+  @moduledoc """
+  This module runs a Lexer module against an input string.
+
+  You don't use it directly as `YourModule.lex/1` is defined on
+  your module when you `use LexLuthor`.
+  """
+
+  @doc """
+  Process a string against a given Lexer module and rules.
+
+  - `module` the module in which the lexer is defined.
+  - `rules` a list of rules to apply to the input string.
+  - `string` the input string to be lexed.
+  """
+  @spec lex(atom, [{atom, Regex.t, atom}], String.t) :: {:ok, [struct]} | {:error, String.t}
+  def lex module, rules, string do
+    do_lex module, rules, string, %State{}
+  end
+
+  defp do_lex module, rules, string, lexer do
+    [current_state | _rest] = lexer.states
+
+    # Find the longest matching rule. This could
+    # probably be made a whole lot less enumeratey.
+    matches = rules
+      |> rules_for_state(current_state)
+      |> matching_rules(string)
+      |> apply_matches(string)
+      |> longest_match_first
+
+    process_matches module, rules, matches, string, lexer, Enum.count(matches)
+  end
+
+  defp process_matches(_, _, _, string, _, count) when count == 0 do
+    {:error, "String not in language: #{inspect string}"}
+  end
+
+  defp process_matches(module, rules, matches, string, lexer, count) when count > 0 do
+    match = Enum.at matches, 0
+
+    # Execute the match's action.
+    {len, value, fun} = match
+    result = apply(module, fun, [value])
+
+    lexer = process_result result, lexer
+
+    case lexer do
+      { :error, _ } ->
+        lexer
+      _ ->
+        fragment = String.slice string, 0, len
+        line = lexer.line + line_number_incrementor fragment
+        column = column_number lexer, fragment
+
+        lexer = Map.merge(lexer, %{pos: lexer.pos + len,
+                                   line: line,
+                                   column: column})
+
+        # Are we at the end of the string?
+        if String.length(string) == len do
+          { :ok, Enum.reverse lexer.tokens }
+        else
+          { _ , new_string } = String.split_at string, len
+          do_lex module, rules, new_string, lexer
+        end
+    end
+  end
+
+  defp column_number lexer, match do
+    case Regex.match? ~r/[\r\n]/, match do
+      true ->
+        len = match |> split_on_newlines |> List.last |> String.length
+        case len do
+          0 -> 1
+          _ -> len
+        end
+      false ->
+        lexer.column + String.length match
+    end
+  end
+
+  defp line_number_incrementor match do
+    (match |> split_on_newlines |> Enum.count) - 1
+  end
+
+  defp split_on_newlines string do
+    string |> String.split(~r{(\r|\n|\r\n)})
+  end
+
+  defp process_result(result, lexer) when is_nil(result) do
+    pop_state lexer
+  end
+
+  defp process_result(result, lexer) when is_atom(result) do
+    push_state lexer, result
+  end
+
+  defp process_result(result, lexer) when is_tuple(result) do
+    push_token lexer, result
+  end
+
+  defp process_result result, _ do
+    {:error, "Invalid result from action: #{inspect result}"}
+  end
+
+  defp push_token lexer, token do
+    {tname, tvalue} = token
+    token = %Token{pos: lexer.pos,
+                   line: lexer.line,
+                   column: lexer.column,
+                   name: tname,
+                   value: tvalue}
+    Map.merge lexer, %{tokens: [token | lexer.tokens]}
+  end
+
+  defp push_state lexer, state do
+    Map.merge lexer, %{states: [state | lexer.states]}
+  end
+
+  defp pop_state lexer do
+    [_ | states] = lexer.states
+    Map.merge lexer, %{states: states}
+  end
+
+  defp rules_for_state rules, state do
+    Enum.filter rules, fn({rule_state, _, _}) ->
+      state = if is_nil(state) do
+        :default
+      else
+        state
+      end
+      state == rule_state
+    end
+  end
+
+  defp matching_rules rules, string do
+    Enum.filter rules, fn({_, regex, _}) ->
+      Regex.match?(regex, string)
+    end
+  end
+
+  defp apply_matches rules, string do
+    Enum.map rules, fn({_, regex, fun}) ->
+      [match] = Regex.run(regex, string, capture: :first)
+      {String.length(match), match, fun}
+    end
+  end
+
+  defp longest_match_first matches do
+    Enum.sort_by matches, fn({len, _, _}) -> len end, &>=/2
+  end
+end
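
A worked trace of the line/column bookkeeping above (not from the source, just following `line_number_incrementor/1` and `column_number/2`):

    # For a matched fragment that spans a newline:
    fragment = "abc\ndef"
    String.split(fragment, ~r{(\r|\n|\r\n)})   # => ["abc", "def"]
    # line_number_incrementor: 2 parts - 1 = 1 line crossed
    # column_number: String.length("def") = 3
    # (a fragment ending in a newline leaves an empty last part,
    #  so the `0 -> 1` clause resets the column to 1; a fragment
    #  with no newline just adds its length to the current column)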

5 lib/lex_luthor/state.ex Normal file
@@ -0,0 +1,5 @@
+defmodule LexLuthor.State do
+  defstruct pos: 0, line: 1, column: 0, states: [nil], tokens: []
+
+  @moduledoc false
+end
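
For reference, the initial lexer state that `Runner.lex/3` starts from is just these struct defaults:

    iex> %LexLuthor.State{}
    %LexLuthor.State{column: 0, line: 1, pos: 0, states: [nil], tokens: []}

The `[nil]` state stack is what `rules_for_state/2` treats as `:default`.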

8 lib/lex_luthor/token.ex Normal file
@@ -0,0 +1,8 @@
+defmodule LexLuthor.Token do
+  defstruct pos: 0, line: 1, column: 0, name: nil, value: nil
+
+  @moduledoc """
+  Defines an individual token in the lexer output, along with handy stuff like
+  the line and column numbers.
+  """
+end

3 mix.exs
@@ -35,7 +35,8 @@ defmodule LexLuthor.Mixfile do
   defp deps do
     [
       {:ex_doc, ">= 0.0.0", only: :dev},
-      {:inch_ex, only: :docs}
+      {:inch_ex, only: :docs},
+      {:credo, only: ~w(dev test)a}
     ]
   end
 end
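
With `credo` added to the dev and test environments, the warnings mentioned in the commit message can be checked locally using credo's standard mix task (stock credo CLI, nothing defined by this repo):

    mix deps.get
    mix credo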

4 mix.lock
@@ -1,4 +1,6 @@
-%{"earmark": {:hex, :earmark, "1.0.1", "2c2cd903bfdc3de3f189bd9a8d4569a075b88a8981ded9a0d95672f6e2b63141", [:mix], []},
+%{"bunt": {:hex, :bunt, "0.2.0", "951c6e801e8b1d2cbe58ebbd3e616a869061ddadcc4863d0a2182541acae9a38", [:mix], []},
+  "credo": {:hex, :credo, "0.7.3", "9827ab04002186af1aec014a811839a06f72aaae6cd5eed3919b248c8767dbf3", [:mix], [{:bunt, "~> 0.2.0", [hex: :bunt, optional: false]}]},
+  "earmark": {:hex, :earmark, "1.0.1", "2c2cd903bfdc3de3f189bd9a8d4569a075b88a8981ded9a0d95672f6e2b63141", [:mix], []},
   "ex_doc": {:hex, :ex_doc, "0.13.0", "aa2f8fe4c6136a2f7cfc0a7e06805f82530e91df00e2bff4b4362002b43ada65", [:mix], [{:earmark, "~> 1.0", [hex: :earmark, optional: false]}]},
   "inch_ex": {:hex, :inch_ex, "0.5.6", "418357418a553baa6d04eccd1b44171936817db61f4c0840112b420b8e378e67", [:mix], [{:poison, "~> 1.5 or ~> 2.0 or ~> 3.0", [hex: :poison, optional: false]}]},
   "poison": {:hex, :poison, "3.1.0", "d9eb636610e096f86f25d9a46f35a9facac35609a7591b3be3326e99a0484665", [:mix], []}}

4 test/lex_luthor_test.exs Normal file
@@ -0,0 +1,4 @@
+defmodule LexLuthorTest do
+  use ExUnit.Case
+  doctest LexLuthor
+end
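
The `doctest LexLuthor` line runs the `iex>` session embedded in the new moduledoc as part of the suite, so the usage example above is exercised by `mix test`.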