Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 54 additions & 2 deletions src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;

namespace SIL.Machine.Corpora
Expand All @@ -22,14 +23,15 @@ ParatextProjectSettings settings
public string UpdateUsfm(
string bookId,
IReadOnlyList<UpdateUsfmRow> rows,
IReadOnlyList<int> chapters = null,
string fullName = null,
UpdateUsfmTextBehavior textBehavior = UpdateUsfmTextBehavior.PreferExisting,
UpdateUsfmMarkerBehavior paragraphBehavior = UpdateUsfmMarkerBehavior.Preserve,
UpdateUsfmMarkerBehavior embedBehavior = UpdateUsfmMarkerBehavior.Preserve,
UpdateUsfmMarkerBehavior styleBehavior = UpdateUsfmMarkerBehavior.Strip,
IEnumerable<string> preserveParagraphStyles = null,
IEnumerable<IUsfmUpdateBlockHandler> updateBlockHandlers = null,
IEnumerable<string> remarks = null,
IEnumerable<(int, string)> remarks = null,
Func<UsfmUpdateBlockHandlerException, bool> errorHandler = null,
bool compareSegments = false
)
Expand Down Expand Up @@ -59,7 +61,10 @@ public string UpdateUsfm(
);
try
{
UsfmParser.Parse(usfm, handler, _settings.Stylesheet, _settings.Versification);
var tokenizer = new UsfmTokenizer(_settings.Stylesheet);
IReadOnlyList<UsfmToken> tokens = tokenizer.Tokenize(usfm);
tokens = FilterTokensByChapter(tokens, chapters);
UsfmParser.Parse(tokens, handler, _settings.Stylesheet, _settings.Versification);
return handler.GetUsfm(_settings.Stylesheet);
}
catch (Exception ex)
Expand All @@ -73,6 +78,53 @@ public string UpdateUsfm(
}
}

/// <summary>
/// Filters tokens by the specified chapters.
/// </summary>
/// <param name="tokens">The tokens.</param>
/// <param name="chapters">The chapters. If null, all tokens are returned.</param>
/// <returns>The filtered tokens.</returns>
/// <remarks>
/// <para>
/// When the first token is an <c>id</c> marker, that token and everything up to the next marker
/// token are always kept. If chapter 1 is requested, the material between the <c>id</c> line
/// and the first chapter token is kept as well.
/// </para>
/// <para>
/// A chapter token is always evaluated against <paramref name="chapters"/>, even when it is the
/// token that terminates the <c>id</c> line, so a book whose first chapter marker directly follows
/// the <c>id</c> line is filtered correctly.
/// </para>
/// <para>This is marked internal so test classes can use it.</para>
/// </remarks>
internal static IReadOnlyList<UsfmToken> FilterTokensByChapter(
    IReadOnlyList<UsfmToken> tokens,
    IReadOnlyList<int> chapters = null
)
{
    if (chapters is null)
        return tokens;

    // Chapter membership is checked once per chapter token; a set keeps that lookup constant-time.
    var requestedChapters = new HashSet<int>(chapters);
    var tokensWithinChapters = new List<UsfmToken>();
    bool inChapter = false;
    bool inIdMarker = false;

    for (int index = 0; index < tokens.Count; index++)
    {
        UsfmToken token = tokens[index];
        if (index == 0 && token.Marker == "id")
        {
            inIdMarker = true;
            // Material before the first chapter token is treated as belonging to chapter 1.
            inChapter = requestedChapters.Contains(1);
        }
        else
        {
            // Any other marker token ends the id line.
            if (inIdMarker && token.Marker != null && token.Marker != "id")
                inIdMarker = false;

            // Deliberately not an "else if": the token that ends the id line may itself be a
            // chapter token, and its chapter number must still decide whether we are in scope.
            // int.TryParse already rejects null or empty data.
            if (token.Type == UsfmTokenType.Chapter)
                inChapter = int.TryParse(token.Data, out int chapter) && requestedChapters.Contains(chapter);
        }

        if (inIdMarker || inChapter)
            tokensWithinChapters.Add(token);
    }

    return tokensWithinChapters;
}

private bool Exists(string fileName) => _paratextProjectFileHandler.Exists(fileName);

private Stream Open(string fileName) => _paratextProjectFileHandler.Open(fileName);
Expand Down
70 changes: 55 additions & 15 deletions src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ public class UpdateUsfmParserHandler : ScriptureRefUsfmParserHandlerBase
private readonly HashSet<string> _preserveParagraphStyles;
private readonly Stack<UsfmUpdateBlock> _updateBlocks;
private readonly Stack<IUsfmUpdateBlockHandler> _updateBlockHandlers;
private readonly List<string> _remarks;
private readonly List<(int, string)> _remarks;
private readonly Stack<bool> _replace;
private int _tokenIndex;
private readonly Func<UsfmUpdateBlockHandlerException, bool> _errorHandler;
Expand All @@ -76,7 +76,7 @@ public UpdateUsfmParserHandler(
UpdateUsfmMarkerBehavior styleBehavior = UpdateUsfmMarkerBehavior.Strip,
IEnumerable<string> preserveParagraphStyles = null,
IEnumerable<IUsfmUpdateBlockHandler> updateBlockHandlers = null,
IEnumerable<string> remarks = null,
IEnumerable<(int, string)> remarks = null,
Func<UsfmUpdateBlockHandlerException, bool> errorHandler = null,
bool compareSegments = false
)
Expand Down Expand Up @@ -107,7 +107,7 @@ public UpdateUsfmParserHandler(
preserveParagraphStyles == null
? new HashSet<string> { "r", "rem" }
: new HashSet<string>(preserveParagraphStyles);
_remarks = remarks?.ToList() ?? new List<string>();
_remarks = remarks?.ToList() ?? new List<(int, string)>();
_errorHandler = errorHandler;
if (_errorHandler == null)
_errorHandler = (error) => false;
Expand Down Expand Up @@ -433,26 +433,66 @@ public string GetUsfm(string stylesheetFileName = "usfm.sty")
public string GetUsfm(UsfmStylesheet stylesheet)
{
var tokenizer = new UsfmTokenizer(stylesheet);
List<UsfmToken> tokens = new List<UsfmToken>(_tokens);
if (_remarks.Count() > 0)
var tokens = new List<UsfmToken>(_tokens);
if (_remarks.Count > 0)
{
var remarkTokens = new List<UsfmToken>();
foreach (string remark in _remarks)
var remarkTokensByChapter = new Dictionary<int, List<UsfmToken>>();
foreach ((int chapterNum, string remark) in _remarks)
{
remarkTokens.Add(new UsfmToken(UsfmTokenType.Paragraph, "rem", null, null));
remarkTokens.Add(new UsfmToken(remark));
// Add the remark tokens for each chapter that is to have remarks
if (!remarkTokensByChapter.TryGetValue(chapterNum, out List<UsfmToken> chapterTokens))
{
chapterTokens = new List<UsfmToken>();
remarkTokensByChapter.Add(chapterNum, chapterTokens);
}

chapterTokens.Add(new UsfmToken(UsfmTokenType.Paragraph, "rem", null, null));
chapterTokens.Add(new UsfmToken(remark));
}
if (tokens.Count > 0)
{
int index = 0;
HashSet<string> markersToSkip = new HashSet<string>() { "id", "ide", "rem" };
while (markersToSkip.Contains(tokens[index].Marker))
foreach (KeyValuePair<int, List<UsfmToken>> remarkTokens in remarkTokensByChapter)
{
index++;
if (tokens.Count > index && tokens[index].Type == UsfmTokenType.Text)
int index;
HashSet<string> markersToSkip;
if (remarkTokens.Key == 0)
{
// Add the remarks at the top level of the USFM,
// after the book id (\id), encoding (\ide), and any initial remarks (\rem)
index = 0;
markersToSkip = new HashSet<string> { "id", "ide", "rem" };
}
else
{
// Add the remarks just after the specified chapter
index = tokens.FindIndex(t =>
t.Type == UsfmTokenType.Chapter
&& int.TryParse(t.Data, out int chapterNumber)
&& chapterNumber == remarkTokens.Key
);
if (index == -1)
continue;
index++;
markersToSkip = new HashSet<string> { "rem" };
}

if (index >= tokens.Count)
{
// The remark insertion point is at the very end
tokens.AddRange(remarkTokens.Value);
}
else
{
while (markersToSkip.Contains(tokens[index].Marker))
{
index++;
if (tokens.Count > index && tokens[index].Type == UsfmTokenType.Text)
index++;
}

tokens.InsertRange(index, remarkTokens.Value);
}
}
tokens.InsertRange(index, remarkTokens);
}
}

Expand Down
30 changes: 30 additions & 0 deletions src/SIL.Machine/Corpora/UsfmParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,17 @@ public static void Parse(
Parse(usfm, handler, new UsfmStylesheet(stylesheetFileName), versification, preserveWhitespace);
}

/// <summary>
/// Parses a list of USFM tokens, constructing the stylesheet from
/// <paramref name="stylesheetFileName"/> and delegating to the overload that takes a
/// <see cref="UsfmStylesheet"/>.
/// </summary>
/// <param name="tokens">The tokens to parse.</param>
/// <param name="handler">The handler passed through to the parser.</param>
/// <param name="stylesheetFileName">The name used to construct the stylesheet.</param>
/// <param name="versification">The versification passed through to the parser; may be null.</param>
/// <param name="preserveWhitespace">Passed through to the parser unchanged.</param>
public static void Parse(
    IReadOnlyList<UsfmToken> tokens,
    IUsfmParserHandler handler,
    string stylesheetFileName = "usfm.sty",
    ScrVers versification = null,
    bool preserveWhitespace = false
)
{
    var stylesheet = new UsfmStylesheet(stylesheetFileName);
    Parse(tokens, handler, stylesheet, versification, preserveWhitespace);
}

public static void Parse(
string usfm,
IUsfmParserHandler handler,
Expand All @@ -45,6 +56,25 @@ public static void Parse(
parser.ProcessTokens();
}

/// <summary>
/// Parses a list of USFM tokens by creating a <see cref="UsfmParser"/> over them and
/// processing every token.
/// </summary>
/// <param name="tokens">The tokens to parse.</param>
/// <param name="handler">The handler passed to the parser.</param>
/// <param name="stylesheet">
/// The stylesheet to use. If null, a stylesheet constructed from "usfm.sty" is used.
/// </param>
/// <param name="versification">The versification passed to the parser; may be null.</param>
/// <param name="preserveWhitespace">Passed to the parser unchanged.</param>
public static void Parse(
    IReadOnlyList<UsfmToken> tokens,
    IUsfmParserHandler handler,
    UsfmStylesheet stylesheet,
    ScrVers versification = null,
    bool preserveWhitespace = false
)
{
    // Fall back to the default stylesheet when the caller did not supply one.
    UsfmStylesheet effectiveStylesheet = stylesheet ?? new UsfmStylesheet("usfm.sty");
    var tokenParser = new UsfmParser(tokens, handler, effectiveStylesheet, versification, preserveWhitespace);
    tokenParser.ProcessTokens();
}

private static readonly Regex OptBreakSplitter = new Regex("(//)", RegexOptions.Compiled);

public UsfmParser(
Expand Down
41 changes: 41 additions & 0 deletions tests/SIL.Machine.Tests/Corpora/DefaultParatextProjectSettings.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
using System.Text;
using SIL.Scripture;

namespace SIL.Machine.Corpora;

/// <summary>
/// A <see cref="ParatextProjectSettings"/> in which every constructor argument has a default,
/// so a settings object can be created without specifying any values.
/// </summary>
public class DefaultParatextProjectSettings : ParatextProjectSettings
{
    /// <summary>
    /// Creates the settings, forwarding every argument to the base constructor.
    /// A null <paramref name="encoding"/> becomes UTF-8, a null <paramref name="versification"/>
    /// becomes <see cref="ScrVers.English"/>, and a null <paramref name="stylesheet"/> becomes a
    /// stylesheet constructed from "usfm.sty".
    /// </summary>
    public DefaultParatextProjectSettings(
        string id = "Id",
        string name = "Test",
        string fullName = "TestProject",
        Encoding? encoding = null,
        ScrVers? versification = null,
        UsfmStylesheet? stylesheet = null,
        string fileNamePrefix = "",
        string fileNameForm = "41MAT",
        string fileNameSuffix = "Test.SFM",
        string biblicalTermsListType = "Project",
        string biblicalTermsProjectName = "Test",
        string biblicalTermsFileName = "ProjectBiblicalTerms.xml",
        string languageCode = "en",
        string translationType = "Standard",
        string? parentGuid = null,
        string? parentName = null
    )
        : base(
            id,
            name,
            fullName,
            encoding ?? Encoding.UTF8,
            versification ?? ScrVers.English,
            stylesheet ?? new UsfmStylesheet("usfm.sty"),
            fileNamePrefix,
            fileNameForm,
            fileNameSuffix,
            biblicalTermsListType,
            biblicalTermsProjectName,
            biblicalTermsFileName,
            languageCode,
            translationType,
            parentGuid,
            parentName
        ) { }
}
Original file line number Diff line number Diff line change
@@ -1,70 +1,25 @@
using System.Text;
using SIL.Scripture;

namespace SIL.Machine.Corpora;

public class MemoryParatextProjectFileHandler(IDictionary<string, string>? files = null) : IParatextProjectFileHandler
{
public IDictionary<string, string> Files { get; } = files ?? new Dictionary<string, string>();

public UsfmStylesheet CreateStylesheet(string fileName)
{
if (fileName is "usfm.sty" or "usfm_sb.sty")
return new UsfmStylesheet(fileName);
throw new NotImplementedException();
}
public UsfmStylesheet CreateStylesheet(string fileName) =>
fileName is "usfm.sty" or "usfm_sb.sty" ? new UsfmStylesheet(fileName) : throw new NotImplementedException();

public bool Exists(string fileName)
{
return Files.ContainsKey(fileName);
}

public string Find(string extension)
{
throw new NotImplementedException();
}
public string? Find(string extension) => Files.Keys.FirstOrDefault(item => item.EndsWith(extension));

public Stream? Open(string fileName)
{
if (!Files.TryGetValue(fileName, out string? contents))
return null;
return new MemoryStream(Encoding.UTF8.GetBytes(contents));
}

public class DefaultParatextProjectSettings(
string id = "Id",
string name = "Test",
string fullName = "TestProject",
Encoding? encoding = null,
ScrVers? versification = null,
UsfmStylesheet? stylesheet = null,
string fileNamePrefix = "",
string fileNameForm = "41MAT",
string fileNameSuffix = "Test.SFM",
string biblicalTermsListType = "Project",
string biblicalTermsProjectName = "Test",
string biblicalTermsFileName = "ProjectBiblicalTerms.xml",
string languageCode = "en",
string translationType = "Standard",
string? parentGuid = null,
string? parentName = null
)
: ParatextProjectSettings(
id,
name,
fullName,
encoding ?? Encoding.UTF8,
versification ?? ScrVers.English,
stylesheet ?? new UsfmStylesheet("usfm.sty"),
fileNamePrefix,
fileNameForm,
fileNameSuffix,
biblicalTermsListType,
biblicalTermsProjectName,
biblicalTermsFileName,
languageCode,
translationType,
parentGuid,
parentName
) { }
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@ namespace SIL.Machine.Corpora;
public class MemoryParatextProjectTermsParser(IDictionary<string, string>? files, ParatextProjectSettings? settings)
: ParatextProjectTermsParserBase(
new MemoryParatextProjectFileHandler(files),
settings ?? new MemoryParatextProjectFileHandler.DefaultParatextProjectSettings()
settings ?? new DefaultParatextProjectSettings()
) { }
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
namespace SIL.Machine.Corpora;

/// <summary>
/// A <see cref="ParatextProjectTextUpdaterBase"/> whose project files are supplied as a
/// dictionary and read through a <see cref="MemoryParatextProjectFileHandler"/>.
/// </summary>
public class MemoryParatextProjectTextUpdater : ParatextProjectTextUpdaterBase
{
    /// <summary>
    /// Creates the updater.
    /// </summary>
    /// <param name="files">The files handed to the <see cref="MemoryParatextProjectFileHandler"/>; may be null.</param>
    /// <param name="settings">
    /// The project settings. If null, a new <see cref="DefaultParatextProjectSettings"/> is used.
    /// </param>
    public MemoryParatextProjectTextUpdater(
        IDictionary<string, string>? files = null,
        ParatextProjectSettings? settings = null
    )
        : base(new MemoryParatextProjectFileHandler(files), settings ?? new DefaultParatextProjectSettings()) { }
}
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@ public class MemoryParatextProjectVersificationErrorDetector(
)
: ParatextProjectVersificationErrorDetectorBase(
new MemoryParatextProjectFileHandler(files),
settings ?? new MemoryParatextProjectFileHandler.DefaultParatextProjectSettings()
) { }
settings ?? new DefaultParatextProjectSettings()
);
Loading
Loading