Documentation, test and readability improvements.

* Split out different concerns of LexLuthor into different modules.
* Add `inch` and documentation.
* Add `credo` and fix warnings.

This commit is contained in:
parent 51c691d8d3
commit 1b964adac3
lib/lex_luthor.ex
@@ -1,11 +1,33 @@
 defmodule LexLuthor do
-  defmodule State do
-    defstruct pos: 0, line: 1, column: 0, states: [nil], tokens: []
-  end
-
-  defmodule Token do
-    defstruct pos: 0, line: 1, column: 0, name: nil, value: nil
-  end
+  alias LexLuthor.Runner
+
+  @moduledoc """
+  LexLuthor is a lexer in Elixir (say that 10 times fast) which uses macros to generate reusable lexers. Good times.
+
+  LexLuthor is a state-based lexer, meaning that it keeps a stack of states which you can push states onto and pop states off; the current state is used to filter the applicable rules. For example:
+
+      iex> defmodule StringLexer do
+      ...>   use LexLuthor
+      ...>   defrule ~r/^'/, fn(_) -> :STRING end
+      ...>   defrule ~r/^[^']+/, :STRING, fn(e) -> { :string, e } end
+      ...>   defrule ~r/^'/, :STRING, fn(_) -> nil end
+      ...> end
+      ...> StringLexer.lex("'foo'")
+      {:ok, [%LexLuthor.Token{column: 1, line: 1, name: :string, pos: 1, value: "foo"}]}
+
+  Rules are defined by a regular expression, an optional state (as an atom) and an action in the form of an anonymous function.
+
+  When passed the string `'foo'`, the lexer starts in the `:default` state, so it filters for rules in the default state (the first rule, as it doesn't specify a state), then it narrows the matching rules to the one whose regular expression produces the longest match. In this case, since we have only one rule (which happens to match), it's automatically the longest match.
+
+  Once the longest match is found, its action is executed and the return value is matched:
+
+  - If the return value is a single atom, that atom is assumed to be a state and is pushed onto the top of the state stack.
+  - If the return value is a two-element tuple, the first element is expected to be an atom (the token name) and the second element a value for this token.
+  - If the return value is `nil`, the top state is popped off the state stack.
+
+  If lexing succeeds you will receive an `:ok` tuple whose second element is a list of `LexLuthor.Token` structs.
+
+  If lexing fails you will receive an `:error` tuple with a reason and position.
+  """

   defmacro __using__(_opts) do
     quote do
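
To make the push/pop mechanics above concrete, here is a minimal sketch of a lexer with a nested state, assuming only the API documented in the moduledoc (the `CommentLexer` module and its rules are hypothetical, not part of this commit):

    defmodule CommentLexer do
      use LexLuthor

      # "--" pushes the :COMMENT state onto the state stack.
      defrule ~r/^--/, fn(_) -> :COMMENT end
      # While in :COMMENT, everything up to a newline becomes a token.
      defrule ~r/^[^\n]+/, :COMMENT, fn(text) -> {:comment, text} end
      # The newline action returns nil, popping :COMMENT off the stack.
      defrule ~r/^\n/, :COMMENT, fn(_) -> nil end
    end

Lexing `"--hello\n"` would push `:COMMENT`, emit a single `:comment` token for `"hello"`, then pop back to the default state on the newline.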
@@ -19,164 +41,46 @@ defmodule LexLuthor do
   defmacro __before_compile__(_env) do
     quote do
       def lex string do
-        LexLuthor.lex __MODULE__, @rules, string
+        Runner.lex __MODULE__, @rules, string
       end
     end
   end

-  defmacro defrule(regex, state, block) do
+  @doc """
+  Define a lexing rule for a specific state.
+
+  - `regex` a regular expression for matching against the input string.
+  - `state` the lexer state in which this rule applies.
+  - `action` the function to execute when this rule is applied.
+  """
+  @spec defrule(Regex.t, atom, (String.t -> atom | nil | {atom, any})) :: {:ok, non_neg_integer}
+  defmacro defrule(regex, state, action) do
     quote do
       @action_counter(@action_counter + 1)
       action_name = "_action_#{@action_counter}" |> String.to_atom
-      block = unquote(Macro.escape(block))
+      action = unquote(Macro.escape(action))

       defaction = quote do
         def unquote(Macro.escape(action_name))(e) do
-          unquote(block).(e)
+          unquote(action).(e)
         end
       end
       Module.eval_quoted __MODULE__, defaction

-      @rules(@rules ++ [{ unquote(state), unquote(regex), action_name }])
-      { :ok, Enum.count(@rules) }
+      @rules(@rules ++ [{unquote(state), unquote(regex), action_name}])
+      {:ok, Enum.count(@rules)}
     end
   end

-  defmacro defrule(regex, block) do
+  @doc """
+  Define a lexing rule applicable to the default state.
+
+  - `regex` a regular expression for matching against the input string.
+  - `action` the function to execute when this rule is applied.
+  """
+  defmacro defrule(regex, action) do
     quote do
-      defrule unquote(regex), :default, unquote(block)
+      defrule unquote(regex), :default, unquote(action)
     end
   end
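
As the new docs say, `defrule/2` simply delegates to `defrule/3` with the `:default` state, so the following two definitions register the same rule (illustrative sketch, inside a module that has `use LexLuthor`):

    defrule ~r/^[0-9]+/, fn(e) -> {:integer, e} end
    defrule ~r/^[0-9]+/, :default, fn(e) -> {:integer, e} end

Each `defrule` call returns `{:ok, n}`, where `n` is the number of rules defined so far.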
-
-  def lex module, rules, string do
-    do_lex module, rules, string, %State{}
-  end
-
-  defp do_lex module, rules, string, lexer do
-    [ current_state | _rest ] = lexer.states
-
-    # Find the longest matching rule. This could
-    # probably be made a whole lot less enumeratey.
-    matches = rules_for_state(rules, current_state)
-      |> matching_rules(string)
-      |> apply_matches(string)
-      |> longest_match_first
-
-    process_matches module, rules, matches, string, lexer, Enum.count(matches)
-  end
-
-  defp process_matches(_, _, _, string, _, count) when count == 0 do
-    { :error, "String not in language: #{inspect string}"}
-  end
-
-  defp process_matches(module, rules, matches, string, lexer, count) when count > 0 do
-    match = Enum.at matches, 0
-
-    # Execute the matches' action.
-    {len, value, fun} = match
-    result = apply(module, fun, [value])
-
-    lexer = process_result result, lexer
-
-    case lexer do
-      { :error, _ } ->
-        lexer
-      _ ->
-        fragment = String.slice string, 0, len
-        line = lexer.line + line_number_incrementor fragment
-        column = column_number lexer, fragment
-
-        lexer = Map.merge lexer, %{pos: lexer.pos + len, line: line, column: column}
-
-        # Are we at the end of the string?
-        if String.length(string) == len do
-          { :ok, Enum.reverse lexer.tokens }
-        else
-          { _ , new_string } = String.split_at string, len
-          do_lex module, rules, new_string, lexer
-        end
-    end
-  end
-
-  defp column_number lexer, match do
-    case Regex.match? ~r/[\r\n]/, match do
-      true ->
-        len = match |> split_on_newlines |> List.last |> String.length
-        case len do
-          0 -> 1
-          _ -> len
-        end
-      false ->
-        lexer.column + String.length match
-    end
-  end
-
-  defp line_number_incrementor match do
-    (match |> split_on_newlines |> Enum.count) - 1
-  end
-
-  def split_on_newlines string do
-    string |> String.split(~r{(\r|\n|\r\n)})
-  end
-
-  defp process_result(result, lexer) when is_nil(result) do
-    pop_state lexer
-  end
-
-  defp process_result(result, lexer) when is_atom(result) do
-    push_state lexer, result
-  end
-
-  defp process_result(result, lexer) when is_tuple(result) do
-    push_token lexer, result
-  end
-
-  defp process_result result, _ do
-    { :error, "Invalid result from action: #{inspect result}"}
-  end
-
-  defp push_token lexer, token do
-    { tname, tvalue } = token
-    token = %Token{ pos: lexer.pos, line: lexer.line, column: lexer.column, name: tname, value: tvalue }
-    Map.merge lexer, %{tokens: [token | lexer.tokens ]}
-  end
-
-  defp push_state lexer, state do
-    Map.merge lexer, %{states: [state | lexer.states ]}
-  end
-
-  defp pop_state lexer do
-    [ _ | states ] = lexer.states
-    Map.merge lexer, %{states: states}
-  end
-
-  defp rules_for_state rules, state do
-    Enum.filter rules, fn({rule_state,_,_})->
-      state = if is_nil(state) do
-        :default
-      else
-        state
-      end
-      state == rule_state
-    end
-  end
-
-  defp matching_rules rules, string do
-    Enum.filter rules, fn({_,regex,_})->
-      Regex.match?(regex, string)
-    end
-  end
-
-  defp apply_matches rules, string do
-    Enum.map rules, fn({_,regex,fun})->
-      [match] = Regex.run(regex,string, capture: :first)
-      { String.length(match), match, fun }
-    end
-  end
-
-  defp longest_match_first matches do
-    Enum.sort_by matches, fn({len,_,_})-> len end, &>=/2
-  end
 end

156 lib/lex_luthor/runner.ex Normal file
@@ -0,0 +1,156 @@
+defmodule LexLuthor.Runner do
+  alias LexLuthor.{State, Token}
+
+  @moduledoc """
+  This module runs a Lexer module against an input string.
+
+  You don't use it directly as `YourModule.lex/1` is defined on
+  your module when you `use LexLuthor`.
+  """
+
+  @doc """
+  Process a string against a given Lexer module and rules.
+
+  - `module` the module in which the lexer is defined.
+  - `rules` a list of rules to apply to the input string.
+  - `string` the input string to be lexed.
+  """
+  @spec lex(atom, [{atom, Regex.t, atom}], String.t) :: {:ok, [struct]} | {:error, String.t}
+  def lex module, rules, string do
+    do_lex module, rules, string, %State{}
+  end
+
+  defp do_lex module, rules, string, lexer do
+    [current_state | _rest] = lexer.states
+
+    # Find the longest matching rule. This could
+    # probably be made a whole lot less enumeratey.
+    matches = rules
+      |> rules_for_state(current_state)
+      |> matching_rules(string)
+      |> apply_matches(string)
+      |> longest_match_first
+
+    process_matches module, rules, matches, string, lexer, Enum.count(matches)
+  end
+
+  defp process_matches(_, _, _, string, _, count) when count == 0 do
+    {:error, "String not in language: #{inspect string}"}
+  end
+
+  defp process_matches(module, rules, matches, string, lexer, count) when count > 0 do
+    match = Enum.at matches, 0
+
+    # Execute the match's action.
+    {len, value, fun} = match
+    result = apply(module, fun, [value])
+
+    lexer = process_result result, lexer
+
+    case lexer do
+      { :error, _ } ->
+        lexer
+      _ ->
+        fragment = String.slice string, 0, len
+        line = lexer.line + line_number_incrementor fragment
+        column = column_number lexer, fragment
+
+        lexer = Map.merge(lexer, %{pos: lexer.pos + len,
+                                   line: line,
+                                   column: column})
+
+        # Are we at the end of the string?
+        if String.length(string) == len do
+          { :ok, Enum.reverse lexer.tokens }
+        else
+          { _ , new_string } = String.split_at string, len
+          do_lex module, rules, new_string, lexer
+        end
+    end
+  end
+
+  defp column_number lexer, match do
+    case Regex.match? ~r/[\r\n]/, match do
+      true ->
+        len = match |> split_on_newlines |> List.last |> String.length
+        case len do
+          0 -> 1
+          _ -> len
+        end
+      false ->
+        lexer.column + String.length match
+    end
+  end
+
+  defp line_number_incrementor match do
+    (match |> split_on_newlines |> Enum.count) - 1
+  end
+
+  defp split_on_newlines string do
+    string |> String.split(~r{(\r|\n|\r\n)})
+  end
+
+  defp process_result(result, lexer) when is_nil(result) do
+    pop_state lexer
+  end
+
+  defp process_result(result, lexer) when is_atom(result) do
+    push_state lexer, result
+  end
+
+  defp process_result(result, lexer) when is_tuple(result) do
+    push_token lexer, result
+  end
+
+  defp process_result result, _ do
+    {:error, "Invalid result from action: #{inspect result}"}
+  end
+
+  defp push_token lexer, token do
+    {tname, tvalue} = token
+    token = %Token{pos: lexer.pos,
+                   line: lexer.line,
+                   column: lexer.column,
+                   name: tname,
+                   value: tvalue}
+    Map.merge lexer, %{tokens: [token | lexer.tokens]}
+  end
+
+  defp push_state lexer, state do
+    Map.merge lexer, %{states: [state | lexer.states]}
+  end
+
+  defp pop_state lexer do
+    [_ | states] = lexer.states
+    Map.merge lexer, %{states: states}
+  end
+
+  defp rules_for_state rules, state do
+    Enum.filter rules, fn({rule_state, _, _}) ->
+      state = if is_nil(state) do
+        :default
+      else
+        state
+      end
+      state == rule_state
+    end
+  end
+
+  defp matching_rules rules, string do
+    Enum.filter rules, fn({_, regex, _}) ->
+      Regex.match?(regex, string)
+    end
+  end
+
+  defp apply_matches rules, string do
+    Enum.map rules, fn({_, regex, fun}) ->
+      [match] = Regex.run(regex, string, capture: :first)
+      {String.length(match), match, fun}
+    end
+  end
+
+  defp longest_match_first matches do
+    Enum.sort_by matches, fn({len, _, _}) -> len end, &>=/2
+  end
+end
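
A worked trace of the line/column bookkeeping above (not from the source, just following `line_number_incrementor/1` and `column_number/2`):

    # For a matched fragment that spans a newline:
    fragment = "abc\ndef"
    String.split(fragment, ~r{(\r|\n|\r\n)})   # => ["abc", "def"]
    # line_number_incrementor: 2 parts - 1 = 1 line crossed
    # column_number: String.length("def") = 3
    # (a fragment ending in a newline leaves an empty last part,
    #  so the `0 -> 1` clause resets the column to 1; a fragment
    #  with no newline just adds its length to the current column)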

5 lib/lex_luthor/state.ex Normal file
@@ -0,0 +1,5 @@
+defmodule LexLuthor.State do
+  defstruct pos: 0, line: 1, column: 0, states: [nil], tokens: []
+
+  @moduledoc false
+end
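
For reference, the initial lexer state that `Runner.lex/3` starts from is just these struct defaults:

    iex> %LexLuthor.State{}
    %LexLuthor.State{column: 0, line: 1, pos: 0, states: [nil], tokens: []}

The `[nil]` state stack is what `rules_for_state/2` treats as `:default`.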

8 lib/lex_luthor/token.ex Normal file
@@ -0,0 +1,8 @@
+defmodule LexLuthor.Token do
+  defstruct pos: 0, line: 1, column: 0, name: nil, value: nil
+
+  @moduledoc """
+  Defines an individual token in the lexer output, along with handy stuff like
+  the line and column numbers.
+  """
+end

3 mix.exs
@@ -35,7 +35,8 @@ defmodule LexLuthor.Mixfile do
   defp deps do
     [
       {:ex_doc, ">= 0.0.0", only: :dev},
-      {:inch_ex, only: :docs}
+      {:inch_ex, only: :docs},
+      {:credo, only: ~w(dev test)a}
     ]
   end
 end
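
With `credo` added to the dev and test environments, the warnings mentioned in the commit message can be checked locally using credo's standard mix task (stock credo CLI, nothing defined by this repo):

    mix deps.get
    mix credo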

4 mix.lock
@@ -1,4 +1,6 @@
-%{"earmark": {:hex, :earmark, "1.0.1", "2c2cd903bfdc3de3f189bd9a8d4569a075b88a8981ded9a0d95672f6e2b63141", [:mix], []},
+%{"bunt": {:hex, :bunt, "0.2.0", "951c6e801e8b1d2cbe58ebbd3e616a869061ddadcc4863d0a2182541acae9a38", [:mix], []},
+  "credo": {:hex, :credo, "0.7.3", "9827ab04002186af1aec014a811839a06f72aaae6cd5eed3919b248c8767dbf3", [:mix], [{:bunt, "~> 0.2.0", [hex: :bunt, optional: false]}]},
+  "earmark": {:hex, :earmark, "1.0.1", "2c2cd903bfdc3de3f189bd9a8d4569a075b88a8981ded9a0d95672f6e2b63141", [:mix], []},
   "ex_doc": {:hex, :ex_doc, "0.13.0", "aa2f8fe4c6136a2f7cfc0a7e06805f82530e91df00e2bff4b4362002b43ada65", [:mix], [{:earmark, "~> 1.0", [hex: :earmark, optional: false]}]},
   "inch_ex": {:hex, :inch_ex, "0.5.6", "418357418a553baa6d04eccd1b44171936817db61f4c0840112b420b8e378e67", [:mix], [{:poison, "~> 1.5 or ~> 2.0 or ~> 3.0", [hex: :poison, optional: false]}]},
   "poison": {:hex, :poison, "3.1.0", "d9eb636610e096f86f25d9a46f35a9facac35609a7591b3be3326e99a0484665", [:mix], []}}

4 test/lex_luthor_test.exs Normal file
@@ -0,0 +1,4 @@
+defmodule LexLuthorTest do
+  use ExUnit.Case
+  doctest LexLuthor
+end
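
The `doctest LexLuthor` line runs the `iex>` session embedded in the new moduledoc as part of the suite, so the usage example above is exercised by `mix test`.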