Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .build/dependencies.props
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
https://github.com/apache/lucene-solr/tree/31d7ec7bbfdcd2c4cc61d9d35e962165410b65fe/lucene/analysis/icu/src/data/utr30
Just make sure they are adjusted to the right version of ICU/Lucene.
<ICU4NPackageVersion>[60.1,60.2)</ICU4NPackageVersion> -->
<ICU4NPackageVersion>[60.1.0-alpha.436,60.1.0-alpha.446)</ICU4NPackageVersion>
<ICU4NPackageVersion>[60.1.0-alpha.438,60.1.0-alpha.446)</ICU4NPackageVersion>
<IKVMPackageVersion>8.7.5</IKVMPackageVersion>
<IKVMMavenSdkPackageVersion>1.6.7</IKVMMavenSdkPackageVersion>
<!-- J2N will break binary compatibility in 3.0.0 to fix the APIs of collection types -->
Expand Down
2 changes: 1 addition & 1 deletion .build/runbuild.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ properties {
[string]$testResultsDirectory = "$artifactsDirectory/TestResults"
[string]$publishDirectory = "$artifactsDirectory/Publish"
[string]$solutionFile = "$baseDirectory/Lucene.Net.sln"
[string]$minimumSdkVersion = "9.0.100"
[string]$minimumSdkVersion = "9.0.200" # We need at least 9.0.200 for ICU4N satellite assemblies with 3-character language codes to get copied to the output.
[string]$globalJsonFile = "$baseDirectory/global.json"
[string]$versionPropsFile = "$baseDirectory/version.props"
[string]$luceneReadmeFile = "$baseDirectory/src/Lucene.Net/readme-nuget.md"
Expand Down
2 changes: 1 addition & 1 deletion azure-pipelines.yml
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ variables:
- name: BuildCounter
value: $[counter(variables['VersionSuffix'],coalesce(variables['BuildCounterSeed'], 1250))]
- name: DotNetSDKVersion
value: '9.0.100'
value: '9.0.300'
- name: DocumentationArtifactName
value: 'docs'
- name: DocumentationArtifactZipFileName
Expand Down
143 changes: 26 additions & 117 deletions src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -46,41 +46,13 @@ namespace Lucene.Net.Analysis.Th
/// </summary>
public class ThaiTokenizer : SegmentingTokenizerBase
{
private static readonly object syncLock = new object(); // LUCENENET specific - workaround until BreakIterator is made thread safe (LUCENENET TODO: TO REVERT)

// LUCENENET specific - DBBI_AVAILABLE removed because ICU always has a dictionary-based BreakIterator
private static readonly BreakIterator proto = LoadProto();
private static readonly BreakIterator proto = BreakIterator.GetWordInstance(new CultureInfo("th"));

/// <summary>
/// used for breaking the text into sentences
/// </summary>
private static readonly BreakIterator sentenceProto = LoadSentenceProto();

private static BreakIterator LoadProto()
{
UninterruptableMonitor.Enter(syncLock);
try
{
return BreakIterator.GetWordInstance(new CultureInfo("th"));
}
finally
{
UninterruptableMonitor.Exit(syncLock);
}
}

private static BreakIterator LoadSentenceProto()
{
UninterruptableMonitor.Enter(syncLock);
try
{
return BreakIterator.GetSentenceInstance(CultureInfo.InvariantCulture);
}
finally
{
UninterruptableMonitor.Exit(syncLock);
}
}
private static readonly BreakIterator sentenceProto = BreakIterator.GetSentenceInstance(CultureInfo.InvariantCulture);

private readonly ThaiWordBreaker wordBreaker;
private readonly CharArrayIterator wrapper = CharArrayIterator.NewWordInstance();
Expand All @@ -101,112 +73,48 @@ public ThaiTokenizer(TextReader reader)
/// <summary>
/// Creates a new <see cref="ThaiTokenizer"/>, supplying the <see cref="AttributeFactory"/> </summary>
public ThaiTokenizer(AttributeFactory factory, TextReader reader)
: base(factory, reader, CreateSentenceClone())
: base(factory, reader, (BreakIterator)sentenceProto.Clone())
{
// LUCENENET specific - DBBI_AVAILABLE removed because ICU always has a dictionary-based BreakIterator

UninterruptableMonitor.Enter(syncLock);
try
{
wordBreaker = new ThaiWordBreaker((BreakIterator)proto.Clone());
}
finally
{
UninterruptableMonitor.Exit(syncLock);
}
wordBreaker = new ThaiWordBreaker((BreakIterator)proto.Clone());
termAtt = AddAttribute<ICharTermAttribute>();
offsetAtt = AddAttribute<IOffsetAttribute>();
}

private static BreakIterator CreateSentenceClone()
{
UninterruptableMonitor.Enter(syncLock);
try
{
return (BreakIterator)sentenceProto.Clone();
}
finally
{
UninterruptableMonitor.Exit(syncLock);
}
}

public override void Reset()
{
UninterruptableMonitor.Enter(syncLock);
try
{
base.Reset();
}
finally
{
UninterruptableMonitor.Exit(syncLock);
}
}

public override State CaptureState()
{
UninterruptableMonitor.Enter(syncLock);
try
{
return base.CaptureState();
}
finally
{
UninterruptableMonitor.Exit(syncLock);
}
}

protected override void SetNextSentence(int sentenceStart, int sentenceEnd)
{
UninterruptableMonitor.Enter(syncLock);
try
{
this.sentenceStart = sentenceStart;
this.sentenceEnd = sentenceEnd;
wrapper.SetText(m_buffer, sentenceStart, sentenceEnd - sentenceStart);
wordBreaker.SetText(wrapper);
}
finally
{
UninterruptableMonitor.Exit(syncLock);
}
this.sentenceStart = sentenceStart;
this.sentenceEnd = sentenceEnd;
wrapper.SetText(m_buffer, sentenceStart, sentenceEnd - sentenceStart);
wordBreaker.SetText(wrapper);
}

protected override bool IncrementWord()
{
int start, end;
UninterruptableMonitor.Enter(syncLock);
try
start = wordBreaker.Current;
if (start == BreakIterator.Done)
{
start = wordBreaker.Current;
if (start == BreakIterator.Done)
{
return false; // BreakIterator exhausted
}
return false; // BreakIterator exhausted
}

// find the next set of boundaries, skipping over non-tokens
// find the next set of boundaries, skipping over non-tokens
end = wordBreaker.Next();
while (end != BreakIterator.Done && !Character.IsLetterOrDigit(Character.CodePointAt(m_buffer, sentenceStart + start, sentenceEnd)))
{
start = end;
end = wordBreaker.Next();
while (end != BreakIterator.Done && !Character.IsLetterOrDigit(Character.CodePointAt(m_buffer, sentenceStart + start, sentenceEnd)))
{
start = end;
end = wordBreaker.Next();
}

if (end == BreakIterator.Done)
{
return false; // BreakIterator exhausted
}

ClearAttributes();
termAtt.CopyBuffer(m_buffer, sentenceStart + start, end - start);
offsetAtt.SetOffset(CorrectOffset(m_offset + sentenceStart + start), CorrectOffset(m_offset + sentenceStart + end));
return true;
}
finally

if (end == BreakIterator.Done)
{
UninterruptableMonitor.Exit(syncLock);
return false; // BreakIterator exhausted
}

ClearAttributes();
termAtt.CopyBuffer(m_buffer, sentenceStart + start, end - start);
offsetAtt.SetOffset(CorrectOffset(m_offset + sentenceStart + start), CorrectOffset(m_offset + sentenceStart + end));
return true;
}
}

Expand All @@ -231,6 +139,7 @@ public void SetText(CharArrayIterator text)
{
this.text.CopyChars(text.Text, text.Start, text.Length);
wordBreaker.SetText(text);
transitions.Clear();
}

public int Current
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,6 @@ public sealed class ICUTokenizer : Tokenizer
private readonly ITypeAttribute typeAtt;
private readonly IScriptAttribute scriptAtt;

private static readonly object syncLock = new object(); // LUCENENET specific - workaround until BreakIterator is made thread safe (LUCENENET TODO: TO REVERT)

/// <summary>
/// Construct a new <see cref="ICUTokenizer"/> that breaks text into words from the given
/// <see cref="TextReader"/>.
Expand Down Expand Up @@ -112,39 +110,23 @@ public ICUTokenizer(AttributeFactory factory, TextReader input, ICUTokenizerConf

public override bool IncrementToken()
{
UninterruptableMonitor.Enter(syncLock);
try
{
ClearAttributes();
if (length == 0)
Refill();
while (!IncrementTokenBuffer())
{
Refill();
if (length <= 0) // no more bytes to read;
return false;
}
return true;
}
finally
ClearAttributes();
if (length == 0)
Refill();
while (!IncrementTokenBuffer())
{
UninterruptableMonitor.Exit(syncLock);
Refill();
if (length <= 0) // no more bytes to read;
return false;
}
return true;
}


public override void Reset()
{
base.Reset();
UninterruptableMonitor.Enter(syncLock);
try
{
breaker.SetText(buffer, 0, 0);
}
finally
{
UninterruptableMonitor.Exit(syncLock);
}
breaker.SetText(buffer, 0, 0);
length = usableLength = offset = 0;
}

Expand Down Expand Up @@ -206,15 +188,7 @@ private void Refill()
*/
}

UninterruptableMonitor.Enter(syncLock);
try
{
breaker.SetText(buffer, 0, Math.Max(0, usableLength));
}
finally
{
UninterruptableMonitor.Exit(syncLock);
}
breaker.SetText(buffer, 0, Math.Max(0, usableLength));
}

// TODO: refactor to a shared readFully somewhere
Expand Down
Loading