Fix for expensive string length check while consuming input. #4

Merged
jimsynz merged 2 commits from fix-big-o into master 2021-01-28 08:49:50 +13:00
8 changed files with 172 additions and 153 deletions

4
.formatter.exs Normal file
View file

@ -0,0 +1,4 @@
# Used by "mix format"
[
inputs: ["{mix,.formatter}.exs", "{config,lib,test}/**/*.{ex,exs}"]
]

View file

@ -40,8 +40,8 @@ defmodule LexLuthor do
defmacro __before_compile__(_env) do defmacro __before_compile__(_env) do
quote do quote do
def lex string do def lex(string) do
Runner.lex __MODULE__, @rules, string Runner.lex(__MODULE__, @rules, string)
end end
end end
end end
@ -53,21 +53,24 @@ defmodule LexLuthor do
- `state` the lexer state in which this rule applies. - `state` the lexer state in which this rule applies.
- `action` the function to execute when this rule is applied. - `action` the function to execute when this rule is applied.
""" """
@spec defrule(Regex.t, atom, (String.t -> atom | nil | {atom, any})) :: {:ok, non_neg_integer} @spec defrule(Regex.t(), atom, (String.t() -> atom | nil | {atom, any})) ::
{:ok, non_neg_integer}
defmacro defrule(regex, state, action) do defmacro defrule(regex, state, action) do
quote do quote do
@action_counter(@action_counter + 1) @action_counter @action_counter + 1
action_name = "_action_#{@action_counter}" |> String.to_atom action_name = "_action_#{@action_counter}" |> String.to_atom()
action = unquote(Macro.escape(action)) action = unquote(Macro.escape(action))
defaction = quote do defaction =
def unquote(Macro.escape(action_name))(e) do quote do
unquote(action).(e) def unquote(Macro.escape(action_name))(e) do
unquote(action).(e)
end
end end
end
Module.eval_quoted __MODULE__, defaction
@rules(@rules ++ [{unquote(state), unquote(regex), action_name}]) Module.eval_quoted(__MODULE__, defaction)
@rules @rules ++ [{unquote(state), unquote(regex), action_name}]
{:ok, Enum.count(@rules)} {:ok, Enum.count(@rules)}
end end
end end
@ -80,7 +83,7 @@ defmodule LexLuthor do
""" """
defmacro defrule(regex, action) do defmacro defrule(regex, action) do
quote do quote do
defrule unquote(regex), :default, unquote(action) defrule(unquote(regex), :default, unquote(action))
end end
end end
end end

View file

@ -15,142 +15,146 @@ defmodule LexLuthor.Runner do
- `rules` an array of rules to apply to the input string. - `rules` an array of rules to apply to the input string.
- `string` the input string to be lexed. - `string` the input string to be lexed.
""" """
@spec lex(atom, [{atom, Regex.t, String.t}], String.t) :: {:ok, non_neg_integer} @spec lex(atom, [{atom, Regex.t(), String.t()}], String.t()) :: {:ok, non_neg_integer}
def lex module, rules, string do def lex(module, rules, string) do
do_lex module, rules, string, %State{} do_lex(module, rules, string, %State{})
end end
defp do_lex module, rules, string, lexer do defp do_lex(module, rules, string, lexer) do
[current_state | _rest] = lexer.states [current_state | _rest] = lexer.states
# Find the longest matching rule. This could # Find the longest matching rule. This could
# probably be made a whole lot less enumeratey. # probably be made a whole lot less enumeratey.
matches = rules matches =
rules
|> rules_for_state(current_state) |> rules_for_state(current_state)
|> matching_rules(string) |> matching_rules(string)
|> apply_matches(string) |> apply_matches(string)
|> longest_match_first |> longest_match_first
process_matches module, rules, matches, string, lexer, Enum.count(matches) process_matches(module, rules, matches, string, lexer, Enum.count(matches))
end end
defp process_matches(_, _, _, string, _, count) when count == 0 do defp process_matches(_, _, _, string, _, count) when count == 0 do
{:error, "String not in language: #{inspect string}"} {:error, "String not in language: #{inspect(string)}"}
end end
defp process_matches(module, rules, matches, string, lexer, count) when count > 0 do defp process_matches(module, rules, matches, string, lexer, count) when count > 0 do
match = Enum.at matches, 0 match = Enum.at(matches, 0)
# Execute the matches' action. # Execute the matches' action.
{len, value, fun} = match {len, value, fun} = match
result = apply(module, fun, [value]) result = apply(module, fun, [value])
lexer = process_result result, lexer lexer = process_result(result, lexer)
case lexer do case lexer do
{ :error, _ } -> {:error, _} ->
lexer lexer
_ -> _ ->
fragment = String.slice(string, 0, len)
line = lexer.line + line_number_incrementor(fragment)
column = column_number(lexer, fragment)
fragment = String.slice string, 0, len lexer = Map.merge(lexer, %{pos: lexer.pos + len, line: line, column: column})
line = lexer.line + line_number_incrementor fragment
column = column_number lexer, fragment
lexer = Map.merge(lexer, %{pos: lexer.pos + len, case String.split_at(string, len) do
line: line, {_, ""} -> {:ok, Enum.reverse(lexer.tokens)}
column: column}) {_, new_string} -> do_lex(module, rules, new_string, lexer)
# Are we at the end of the string?
if String.length(string) == len do
{ :ok, Enum.reverse lexer.tokens }
else
{ _ , new_string } = String.split_at string, len
do_lex module, rules, new_string, lexer
end end
end end
end end
defp column_number lexer, match do defp column_number(lexer, match) do
case Regex.match? ~r/[\r\n]/, match do case Regex.match?(~r/[\r\n]/, match) do
true -> true ->
len = match |> split_on_newlines |> List.last |> String.length len = match |> split_on_newlines |> List.last() |> String.length()
case len do case len do
0 -> 1 0 -> 1
_ -> len _ -> len
end end
false -> false ->
lexer.column + String.length match lexer.column + String.length(match)
end end
end end
defp line_number_incrementor match do defp line_number_incrementor(match) do
(match |> split_on_newlines |> Enum.count) - 1 (match |> split_on_newlines |> Enum.count()) - 1
end end
defp split_on_newlines string do defp split_on_newlines(string) do
string |> String.split(~r{(\r|\n|\r\n)}) string |> String.split(~r{(\r|\n|\r\n)})
end end
defp process_result(result, lexer) when is_nil(result) do defp process_result(result, lexer) when is_nil(result) do
pop_state lexer pop_state(lexer)
end end
defp process_result(result, lexer) when is_atom(result) do defp process_result(result, lexer) when is_atom(result) do
push_state lexer, result push_state(lexer, result)
end end
defp process_result(result, lexer) when is_tuple(result) do defp process_result(result, lexer) when is_tuple(result) do
push_token lexer, result push_token(lexer, result)
end end
defp process_result result, _ do defp process_result(result, _) do
{:error, "Invalid result from action: #{inspect result}"} {:error, "Invalid result from action: #{inspect(result)}"}
end end
defp push_token lexer, token do defp push_token(lexer, token) do
{tname, tvalue} = token {tname, tvalue} = token
token = %Token{pos: lexer.pos,
line: lexer.line, token = %Token{
column: lexer.column, pos: lexer.pos,
name: tname, line: lexer.line,
value: tvalue} column: lexer.column,
Map.merge lexer, %{tokens: [token | lexer.tokens ]} name: tname,
value: tvalue
}
Map.merge(lexer, %{tokens: [token | lexer.tokens]})
end end
defp push_state lexer, state do defp push_state(lexer, state) do
Map.merge lexer, %{states: [state | lexer.states ]} Map.merge(lexer, %{states: [state | lexer.states]})
end end
defp pop_state lexer do defp pop_state(lexer) do
[ _ | states ] = lexer.states [_ | states] = lexer.states
Map.merge lexer, %{states: states} Map.merge(lexer, %{states: states})
end end
defp rules_for_state rules, state do defp rules_for_state(rules, state) do
Enum.filter rules, fn({rule_state,_,_}) -> Enum.filter(rules, fn {rule_state, _, _} ->
state = if is_nil(state) do state =
:default if is_nil(state) do
else :default
state else
end state
end
state == rule_state state == rule_state
end end)
end end
defp matching_rules rules, string do defp matching_rules(rules, string) do
Enum.filter rules, fn({_,regex,_}) -> Enum.filter(rules, fn {_, regex, _} ->
Regex.match?(regex, string) Regex.match?(regex, string)
end end)
end end
defp apply_matches rules, string do defp apply_matches(rules, string) do
Enum.map rules, fn({_,regex,fun}) -> Enum.map(rules, fn {_, regex, fun} ->
[match] = Regex.run(regex,string, capture: :first) [match] = Regex.run(regex, string, capture: :first)
{ String.length(match), match, fun } {String.length(match), match, fun}
end end)
end end
defp longest_match_first matches do defp longest_match_first(matches) do
Enum.sort_by matches, fn({len,_,_}) -> len end, &>=/2 Enum.sort_by(matches, fn {len, _, _} -> len end, &>=/2)
end end
end end

30
mix.exs
View file

@ -2,18 +2,21 @@ defmodule LexLuthor.Mixfile do
use Mix.Project use Mix.Project
def project do def project do
[app: :lex_luthor, [
version: "0.1.1", app: :lex_luthor,
elixir: "~> 1.0", version: "0.1.1",
description: "LexLuthor is a Lexer in Elixir (say that 10 times fast) which uses macros to generate a reusable lexers. Good times.", elixir: "~> 1.0",
source_url: "https://github.com/jamesotron/lex_luthor", description:
preferred_cli_env: [inch: :docs], "LexLuthor is a Lexer in Elixir (say that 10 times fast) which uses macros to generate a reusable lexers. Good times.",
package: [ source_url: "https://github.com/jamesotron/lex_luthor",
contributors: ["James Harton"], preferred_cli_env: [inch: :docs],
licenses: ["MIT"], package: [
links: %{"Source" => "https://github.com/jamesotron/lex_luthor"} contributors: ["James Harton"],
], licenses: ["MIT"],
deps: deps()] links: %{"Source" => "https://github.com/jamesotron/lex_luthor"}
],
deps: deps()
]
end end
# Configuration for the OTP application # Configuration for the OTP application
@ -35,8 +38,7 @@ defmodule LexLuthor.Mixfile do
defp deps do defp deps do
[ [
{:ex_doc, ">= 0.0.0", only: :dev}, {:ex_doc, ">= 0.0.0", only: :dev},
{:inch_ex, only: :docs}, {:credo, "~> 1.5", only: ~w(dev test)a}
{:credo, only: ~w(dev test)a}
] ]
end end
end end

View file

@ -1,6 +1,10 @@
%{"bunt": {:hex, :bunt, "0.2.0", "951c6e801e8b1d2cbe58ebbd3e616a869061ddadcc4863d0a2182541acae9a38", [:mix], []}, %{
"credo": {:hex, :credo, "0.7.3", "9827ab04002186af1aec014a811839a06f72aaae6cd5eed3919b248c8767dbf3", [:mix], [{:bunt, "~> 0.2.0", [hex: :bunt, optional: false]}]}, "bunt": {:hex, :bunt, "0.2.0", "951c6e801e8b1d2cbe58ebbd3e616a869061ddadcc4863d0a2182541acae9a38", [:mix], [], "hexpm", "7af5c7e09fe1d40f76c8e4f9dd2be7cebd83909f31fee7cd0e9eadc567da8353"},
"earmark": {:hex, :earmark, "1.0.1", "2c2cd903bfdc3de3f189bd9a8d4569a075b88a8981ded9a0d95672f6e2b63141", [:mix], []}, "credo": {:hex, :credo, "1.5.4", "9914180105b438e378e94a844ec3a5088ae5875626fc945b7c1462b41afc3198", [:mix], [{:bunt, "~> 0.2.0", [hex: :bunt, repo: "hexpm", optional: false]}, {:file_system, "~> 0.2.8", [hex: :file_system, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm", "cf51af45eadc0a3f39ba13b56fdac415c91b34f7b7533a13dc13550277141bc4"},
"ex_doc": {:hex, :ex_doc, "0.13.0", "aa2f8fe4c6136a2f7cfc0a7e06805f82530e91df00e2bff4b4362002b43ada65", [:mix], [{:earmark, "~> 1.0", [hex: :earmark, optional: false]}]}, "earmark": {:hex, :earmark, "1.0.1", "2c2cd903bfdc3de3f189bd9a8d4569a075b88a8981ded9a0d95672f6e2b63141", [:mix], [], "hexpm", "db7b13d74a9edc54d3681762154d164d4a661cd27673cca80760344449877664"},
"inch_ex": {:hex, :inch_ex, "0.5.6", "418357418a553baa6d04eccd1b44171936817db61f4c0840112b420b8e378e67", [:mix], [{:poison, "~> 1.5 or ~> 2.0 or ~> 3.0", [hex: :poison, optional: false]}]}, "ex_doc": {:hex, :ex_doc, "0.13.0", "aa2f8fe4c6136a2f7cfc0a7e06805f82530e91df00e2bff4b4362002b43ada65", [:mix], [{:earmark, "~> 1.0", [hex: :earmark, repo: "hexpm", optional: false]}], "hexpm", "4b40cd154c2660d795b88f73c61b5e3679abe7215e8c20eb9040101cc4819d12"},
"poison": {:hex, :poison, "3.1.0", "d9eb636610e096f86f25d9a46f35a9facac35609a7591b3be3326e99a0484665", [:mix], []}} "file_system": {:hex, :file_system, "0.2.10", "fb082005a9cd1711c05b5248710f8826b02d7d1784e7c3451f9c1231d4fc162d", [:mix], [], "hexpm", "41195edbfb562a593726eda3b3e8b103a309b733ad25f3d642ba49696bf715dc"},
"inch_ex": {:hex, :inch_ex, "0.5.6", "418357418a553baa6d04eccd1b44171936817db61f4c0840112b420b8e378e67", [:mix], [{:poison, "~> 1.5 or ~> 2.0 or ~> 3.0", [hex: :poison, repo: "hexpm", optional: false]}], "hexpm", "7123ca0450686a61416a06cd38e26af18fd0f8c1cff5214770a957c6e0724338"},
"jason": {:hex, :jason, "1.2.2", "ba43e3f2709fd1aa1dce90aaabfd039d000469c05c56f0b8e31978e03fa39052", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "18a228f5f0058ee183f29f9eae0805c6e59d61c3b006760668d8d18ff0d12179"},
"poison": {:hex, :poison, "3.1.0", "d9eb636610e096f86f25d9a46f35a9facac35609a7591b3be3326e99a0484665", [:mix], [], "hexpm", "fec8660eb7733ee4117b85f55799fd3833eb769a6df71ccf8903e8dc5447cfce"},
}

View file

@ -3,50 +3,50 @@ defmodule AcceptanceTest do
use ExUnit.Case, async: true use ExUnit.Case, async: true
@tests [ @tests [
{ "''", generate_token(:simple_string, "") }, {"''", generate_token(:simple_string, "")},
{ "'hello'", generate_token(:simple_string, "hello") }, {"'hello'", generate_token(:simple_string, "hello")},
{ "\"\"", generate_token(:string, "") }, {"\"\"", generate_token(:string, "")},
{ "\"hello\"", generate_token(:string, "hello") }, {"\"hello\"", generate_token(:string, "hello")},
{ "0", generate_token(:integer, 0) }, {"0", generate_token(:integer, 0)},
{ "123", generate_token(:integer, 123) }, {"123", generate_token(:integer, 123)},
{ "0x123", generate_token(:integer, 291) }, {"0x123", generate_token(:integer, 291)},
{ "0b1011", generate_token(:integer, 11) }, {"0b1011", generate_token(:integer, 11)},
{ "0.0", generate_token(:float, 0.0) }, {"0.0", generate_token(:float, 0.0)},
{ "123.456", generate_token(:float, 123.456) } {"123.456", generate_token(:float, 123.456)}
] ]
Enum.each @tests, fn Enum.each(@tests, fn
{ source, token } -> {source, token} ->
tname = Map.get token, :name tname = Map.get(token, :name)
tvalue = Map.get token, :value tvalue = Map.get(token, :value)
test "String #{inspect(source)} results in token #{inspect(token)}" do test "String #{inspect(source)} results in token #{inspect(token)}" do
result = ExampleLexer.lex unquote(source) result = ExampleLexer.lex(unquote(source))
{ok, result} = result {ok, result} = result
assert ok == :ok assert ok == :ok
result = Enum.at(result, 0) result = Enum.at(result, 0)
rname = Map.get result, :name rname = Map.get(result, :name)
rvalue = Map.get result, :value rvalue = Map.get(result, :value)
assert rname == unquote(tname) assert rname == unquote(tname)
assert rvalue == unquote(tvalue) assert rvalue == unquote(tvalue)
end end
end end)
test "String #{inspect "'foo'\n'bar'"} has correct line numbers" do test "String #{inspect("'foo'\n'bar'")} has correct line numbers" do
{ok, tokens} = ExampleLexer.lex "'foo'\n'bar'" {ok, tokens} = ExampleLexer.lex("'foo'\n'bar'")
assert ok == :ok assert ok == :ok
token = List.last tokens token = List.last(tokens)
assert token.line == 2 assert token.line == 2
end end
test "String #{inspect "'foo'\n'bar' 'baz'"} has correct column numbers" do test "String #{inspect("'foo'\n'bar' 'baz'")} has correct column numbers" do
{ok, tokens} = ExampleLexer.lex "'foo'\n'bar' 'baz'" {ok, tokens} = ExampleLexer.lex("'foo'\n'bar' 'baz'")
assert ok == :ok assert ok == :ok
token = List.last tokens token = List.last(tokens)
assert token.value == "baz" assert token.value == "baz"
assert token.column == 8 assert token.column == 8
end end

View file

@ -2,14 +2,14 @@ defmodule RejectionTest do
use ExUnit.Case, async: true use ExUnit.Case, async: true
test "string not in language fails" do test "string not in language fails" do
{status, message} = ExampleLexer.lex "{}" {status, message} = ExampleLexer.lex("{}")
assert status == :error assert status == :error
assert message == "String not in language: \"{}\"" assert message == "String not in language: \"{}\""
end end
test "bogus action" do test "bogus action" do
{status, message} = ExampleLexer.lex "BOGUS_ACTION" {status, message} = ExampleLexer.lex("BOGUS_ACTION")
assert status == :error assert status == :error
assert message == "Invalid result from action: \"WAT\"" assert message == "Invalid result from action: \"WAT\""
end end
end end

View file

@ -8,42 +8,44 @@ defmodule ExampleLexer do
use LexLuthor use LexLuthor
# single tick strings # single tick strings
defrule ~r/^''/, fn(_) -> { :simple_string, "" } end defrule(~r/^''/, fn _ -> {:simple_string, ""} end)
defrule ~r/^'/, fn(_) -> :simple_string end defrule(~r/^'/, fn _ -> :simple_string end)
defrule ~r/^[^']+/, :simple_string, fn(e) -> { :simple_string, e } end defrule(~r/^[^']+/, :simple_string, fn e -> {:simple_string, e} end)
defrule ~r/^'/, :simple_string, fn(_) -> nil end defrule(~r/^'/, :simple_string, fn _ -> nil end)
# double tick strings # double tick strings
defrule ~r/^""/, fn(_) -> { :string, "" } end defrule(~r/^""/, fn _ -> {:string, ""} end)
defrule ~r/^"/, fn(_) -> :string end defrule(~r/^"/, fn _ -> :string end)
defrule ~R/^#{/, :string, fn(_) -> :default end defrule(~R/^#{/, :string, fn _ -> :default end)
defrule ~R/^}/, :default, fn(_) -> nil end defrule(~R/^}/, :default, fn _ -> nil end)
defrule ~R/^[^("|#{)]+/, :string, fn(e) -> { :string, e } end defrule(~R/^[^("|#{)]+/, :string, fn e -> {:string, e} end)
defrule ~r/^"/, :string, fn(_) -> nil end defrule(~r/^"/, :string, fn _ -> nil end)
# floats # floats
defrule ~r/^[0-9]+\.[0-9]+/, fn(e) -> { :float, String.to_float(e) } end defrule(~r/^[0-9]+\.[0-9]+/, fn e -> {:float, String.to_float(e)} end)
# integers # integers
defrule ~r/^0x[0-9a-fA-F]+/, fn(e) -> defrule(~r/^0x[0-9a-fA-F]+/, fn e ->
[ _ | i ] = String.split e, "x" [_ | i] = String.split(e, "x")
{ :integer, String.to_integer(Enum.at(i, 0), 16) } {:integer, String.to_integer(Enum.at(i, 0), 16)}
end end)
defrule ~r/^0b[01]+/, fn(e) ->
[ _ | i ] = String.split e, "b" defrule(~r/^0b[01]+/, fn e ->
{ :integer, String.to_integer(Enum.at(i, 0), 2) } [_ | i] = String.split(e, "b")
end {:integer, String.to_integer(Enum.at(i, 0), 2)}
defrule ~r/^[1-9][0-9]*/, fn(e) -> { :integer, String.to_integer(e) } end end)
defrule ~r/^0/, fn(_) -> { :integer, 0 } end
defrule(~r/^[1-9][0-9]*/, fn e -> {:integer, String.to_integer(e)} end)
defrule(~r/^0/, fn _ -> {:integer, 0} end)
# white space # white space
defrule ~r/^[ \t]+/, fn(e) -> { :ws, String.length(e) } end defrule(~r/^[ \t]+/, fn e -> {:ws, String.length(e)} end)
defrule ~r/^\r\n/, fn(_) -> { :nl, 1 } end defrule(~r/^\r\n/, fn _ -> {:nl, 1} end)
defrule ~r/^\r/, fn(_) -> { :nl, 1 } end defrule(~r/^\r/, fn _ -> {:nl, 1} end)
defrule ~r/^\n/, fn(_) -> { :nl, 1 } end defrule(~r/^\n/, fn _ -> {:nl, 1} end)
# bogus action # bogus action
defrule ~r/^BOGUS_ACTION/, fn(_) -> "WAT" end defrule(~r/^BOGUS_ACTION/, fn _ -> "WAT" end)
end end
ExUnit.start() ExUnit.start()