Skip to content

Commit e952a36

Browse files
committed
Copy silnlp gloss-cleaning functions exactly; separate rendering and gloss functions.
1 parent 13bf3d2 commit e952a36

File tree

1 file changed

+24
-12
lines changed

1 file changed

+24
-12
lines changed

src/SIL.Machine/Corpora/ParatextProjectTermsParserBase.cs

Lines changed: 24 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -135,9 +135,9 @@ protected ParatextProjectTermsParserBase(ParatextProjectSettingsParserBase setti
135135
.Select(kvp =>
136136
{
137137
string id = kvp.Item1.Replace("\n", "&#xA");
138-
string gloss = kvp.Item2.Element("Renderings").Value;
139-
IReadOnlyList<string> glosses = GetGlosses(gloss);
140-
return (id, glosses);
138+
string rendering = kvp.Item2.Element("Renderings").Value;
139+
IReadOnlyList<string> renderings = GetRenderings(rendering);
140+
return (id, renderings);
141141
})
142142
.GroupBy(kvp => kvp.Item1, kvp => kvp.Item2) //Handle duplicate term ids (which do exist) e.g. שִׁלֵּמִי
143143
.Select(grouping => (grouping.Key, grouping.SelectMany(g => g)))
@@ -202,27 +202,39 @@ IDictionary<string, ImmutableHashSet<VerseRef>> termIdToReferences
202202
);
203203
}
204204

205+
/// <summary>
/// Normalizes a single term: trims it, passes it through <c>StripParens</c> (defined elsewhere
/// in this class), and then replaces every white-space character with a single space.
/// </summary>
/// <param name="term">The raw term text.</param>
/// <returns>The cleaned term.</returns>
private static string CleanTerm(string term)
{
    string withoutParens = StripParens(term.Trim());
    // Split() with no arguments splits on white-space characters; re-joining with " " turns
    // tabs, newlines, etc. into plain spaces.
    string[] pieces = withoutParens.Split();
    return string.Join(" ", pieces);
}
212+
205213
/// <summary>
/// Splits a raw gloss string into a list of distinct, cleaned glosses.
/// </summary>
/// <remarks>
/// Steps, in order: unwrap the first capture group of <c>ContentInBracketsRegex</c> when it
/// matches, drop question marks, run <c>CleanTerm</c>, strip square-bracketed content via
/// <c>StripParens</c>, remove every match of <c>NumericalInformationRegex</c>, and finally
/// split on <c>;</c>, <c>,</c> and <c>/</c>.
/// </remarks>
/// <param name="gloss">The raw gloss text.</param>
/// <returns>The trimmed, non-empty, de-duplicated glosses in order of first appearance.</returns>
public static IReadOnlyList<string> GetGlosses(string gloss)
{
    //If entire term rendering is surrounded in square brackets, remove them
    Match match = ContentInBracketsRegex.Match(gloss);
    if (match.Success)
        gloss = match.Groups[1].Value;
    gloss = gloss.Replace("?", "");
    gloss = CleanTerm(gloss);
    gloss = StripParens(gloss, left: '[', right: ']');
    gloss = gloss.Trim();
    // Strings are immutable, so the result of the replacement must be assigned back;
    // calling Replace and discarding the return value leaves the text untouched.
    gloss = NumericalInformationRegex.Replace(gloss, "");
    return Regex.Split(gloss, @"[;,/]").Select(g => g.Trim()).Where(s => s != "").Distinct().ToList();
}
229+
230+
/// <summary>
/// Splits a raw rendering string on the <c>||</c> separator and cleans each piece.
/// </summary>
/// <param name="rendering">The raw rendering text; it is trimmed before splitting.</param>
/// <returns>
/// The pieces after <c>CleanTerm</c> and removal of <c>*</c> characters, with empty results
/// omitted. Duplicates are kept.
/// </returns>
public static IReadOnlyList<string> GetRenderings(string rendering)
{
    var cleanedRenderings = new List<string>();
    foreach (string piece in Regex.Split(rendering.Trim(), @"\|\|"))
    {
        string cleaned = CleanTerm(piece).Replace("*", "");
        if (cleaned != "")
            cleanedRenderings.Add(cleaned);
    }
    return cleanedRenderings;
}
227239

228240
/// <summary>

0 commit comments

Comments
 (0)