Support for the FreeMonoTengwar font and ConScript encoding
[tengwarjs.git] / normalize.js
1 // TODO remove this since it canvases over the origin of certain clusters
2
3 // This module adapts streams of characters sent to one parser into a
4 // simplified normal form piped to another. Internally, a stream is
5 // represented as a function that accepts the next character and returns a new
6 // stream.
7 //
8 // stream("a")("b")("c") -> stream
9 //
10 // The input ends with an empty character.
11 //
12 // stream("") -> stream
13 //
// Functions that return streams and produce a syntax node accept a
// callback that, like a stream, is required to return the initial stream state.
16 //
17 // parseAbc(function (result) {
18 // console.log(result);
19 // return expectEof();
20 // })("a")("b")("c")("")
21 //
22
23 var Parser = require("./parser");
24 var makeTrie = require("./trie");
25 var makeParserFromTrie = require("./trie-parser");
26 var array_ = Array.prototype;
27
28 // The `normalize` function accepts a stream and returns a stream. The
29 // character sequence sent to the returned stream will be converted to a
30 // normal form, where each character is lower-case and various clusters of
31 // characters will be converted to a "normal" phonetic form so the subsequent
32 // parser only has to deal with one input for each phonetic output.
33 //
34 // normalize(parseWord(callback))("Q")("u")("x")
35 //
36 // In this example, the callback would receive "cwcs", the normal form of
37 // "Qux".
38 //
39 module.exports = normalize;
// `callback` is the stream that receives the normalized characters; the
// returned stream accepts the raw characters.
function normalize(callback) {
    // Lower-casing is the outermost adapter, so `simplify` only ever sees
    // lower-case input.
    return toLowerCase(simplify(callback));
}
43
// This is a parser adapter that always returns the same state, but internally
// tracks the state of the wrapped parser. Each time the adapter receives a
// character, it converts it to lower case and uses that character to advance
// the wrapped state.
//
// `callback` is the initial state of the wrapped parser: a function that
// accepts a character and returns the next state. The returned function
// accepts one character and always returns itself.
function toLowerCase(callback) {
    // Track the wrapped parser's current state in a local instead of
    // reassigning the parameter.
    var state = callback;
    return function passthrough(character) {
        state = state(character.toLowerCase());
        return passthrough;
    };
}
54
// The keys of this table are characters and clusters of characters that must
// be simplified to the corresponding values before pumping them into an
// adapted parser. The adapted parser therefore only needs to handle the
// normal phonetic form of the cluster.
var table = {
    "k": "c",
    "x": "cs",
    "q": "cw",
    "qu": "cw",
    // "p" and "b" map to themselves; they are also the first letters of the
    // "ph" and "bh" clusters.
    "p": "p",
    "ph": "f",
    "b": "b",
    "bh": "v",
    // Circumflexed vowels become their acute-accented counterparts.
    "â": "á",
    "ê": "é",
    "î": "í",
    "ô": "ó",
    "û": "ú"
};
74
// This generates a data structure that can be walked by a parser, where each
// node corresponds to having parsed a certain prefix and follows to each
// common suffix. If the parser is standing at a particular node of the trie
// and receives a character that does not match any of the subsequent subtrees,
// it "produces" the corresponding value at that node.
var trie = makeTrie(table);

// `simplify` is built from the trie and three functions. The contract of
// `makeParserFromTrie` is defined in ./trie-parser and is not visible here;
// the roles described below are taken from the comments and bodies of the
// functions themselves -- confirm against that module.
var simplify = makeParserFromTrie(
    trie,
    function makeProducer(string) {
        // Producing `string` involves advancing the state by individual
        // characters: each character is sent to the current state, and the
        // state it returns receives the next one.
        return function (callback) {
            return Array.prototype.reduce.call(string, function (state, character) {
                return state(character);
            }, callback);
        };
    },
    function callback(state) {
        // After a match has been emitted, loop back for another.
        return simplify(state);
    },
    function fallback(state) {
        // If we reach a character that is not accounted for in the table,
        // pass it through without alteration, then start scanning for
        // matches again.
        return function (character) {
            return simplify(state(character));
        };
    }
);
106