ASStrFcns.pas

File name
C:\Users\Andreas Rejbrand\Documents\Dev\AlgoSim\ASStrFcns.pas
Date exported
Time exported
Formatting processor
TPascalFormattingProcessor
unit ASStrFcns;

{ **************************************************************************** }
{ Rejbrand AlgoSim string functions                                            }
{ Copyright © 2018-2000 Andreas Rejbrand                                       }
{ https://english.rejbrand.se/                                                 }
{ **************************************************************************** }

{$WARN SYMBOL_PLATFORM OFF}
{$WARN DUPLICATE_CTOR_DTOR OFF}

interface

uses
  SysUtils, Classes, Generics.Defaults, Generics.Collections;

type
  TStringSearchOption = (ssoIgnoreCase, ssoWholeWords);
  TStringSearchOptions = set of TStringSearchOption;

function SubstringCount(const ASubstr, AStr: string;
  AStringSearchOptions: TStringSearchOptions = [];
  AOffset: Integer = 0;
  AList: TList<Integer> = nil): Integer;

function SubstringFirstIndex(const ASubstr, AStr: string;
  AStringSearchOptions: TStringSearchOptions = [];
  AOffset: Integer = 0): Integer;

function StringReplace(const AStr, APattern, ANewText: string;
  AStringSearchOptions: TStringSearchOptions = [];
  AOffset: Integer = 0): string;

type
  TChrTransformFunc = reference to function(C: Char): Char;
  TTextTransformFunc = reference to function(const AText: string): string;

function TextTransformFunc(AChrTransformFunc: TChrTransformFunc): TTextTransformFunc;

function ChrUpperCase(C: Char): Char;
function ChrLowerCase(C: Char): Char;
function ChrInvertCase(C: Char): Char;
function TxtTitleCase(const AText: string): string;
function TxtSentenceCase(const AText: string): string;
function ChrROT13(C: Char): Char;
function ChrCaesar(N: Integer): TChrTransformFunc;
function TxtVigenère(const AKey: string; ADecode: Boolean = False): TTextTransformFunc;

type
  TWordExtractionOptions = record
    WordSeps: string;
    MathSeps: Boolean;
    TrimChrs: string;
    TrimPunct: Boolean;
    LetterRequired: Boolean;
    LetterOrDigitRequired: Boolean;
  end;

const
  WEO_ENGLISH: TWordExtractionOptions =
    (
      WordSeps: '/<>='#$2014;
      MathSeps: False;
      TrimChrs: '';
      TrimPunct: True;
      LetterRequired: True;
      LetterOrDigitRequired: False;
    );

  WEO_ENGLISH_MATH: TWordExtractionOptions =
    (
      WordSeps: '/<>='#$2014'()[]{}';
      MathSeps: True;
      TrimChrs: '';
      TrimPunct: True;
      LetterRequired: True;
      LetterOrDigitRequired: False;
    );

  WEO_ENGLISH_SOURCECODE: TWordExtractionOptions =
    (
      WordSeps: '/<>='#$2014'.,;:-+*()[]{}|\%&@$~!?^';
      MathSeps: True;
      TrimChrs: '';
      TrimPunct: True;
      LetterRequired: True;
      LetterOrDigitRequired: False;
    );

procedure GetWords(const AText: string; const AOptions: TWordExtractionOptions;
  AWordList: TList<string>); overload;

function GetWords(const AText: string; const AOptions: TWordExtractionOptions): TArray<string>; overload;

function WordWrap(const AText: string; N: Integer = 80): string;

function Pad(const AString: string; ALength: Integer;
  AAlignment: TAlignment; APadding: Char = #32): string;

function RemoveDuplicates(const AString: string): string;
function RemoveAdjacentDuplicates(const AString: string): string;

implementation

uses
  Math, StrUtils, Character;

function SubstringScan(const ASubstr, AStr: string;
  AStringSearchOptions: TStringSearchOptions = [];
  AOnlyFirst: Boolean = False;
  AOffset: Integer = 0;
  AList: TList<Integer> = nil): Integer;
var
  Substr, Str: string;
  LastIndex, i, j: Integer;

  function TestWholeWord: Boolean;
  begin
    Result :=
      not (ssoWholeWords in AStringSearchOptions) or
      (
        ((i = 1) or not Str[i - 1].IsLetterOrDigit) and
        ((i + j = AStr.Length + 1) or not (Str[i + j]).IsLetterOrDigit)
      )
  end;

begin

  if Assigned(AList) then
    AList.Clear;

  Result := 0;

  if ASubstr.IsEmpty then Exit;

  LastIndex := AStr.Length - ASubstr.Length + 1;

  if ssoIgnoreCase in AStringSearchOptions then
  begin
    Substr := AnsiLowerCase(ASubstr);
    Str := AnsiLowerCase(AStr);
  end
  else
  begin
    Substr := ASubstr;
    Str := AStr;
  end;

  i := 1 + AOffset;
  while i <= LastIndex do
  begin
    j := 0;
    while (j < Substr.Length) and (Str[i + j] = Substr[1 + j]) do Inc(j);
    if (j = Substr.Length) and TestWholeWord then
    begin
      if AOnlyFirst then
        Exit(i);
      if Assigned(AList) then
        AList.Add(i);
      Inc(Result);
      Inc(i, j);
    end
    else
      Inc(i);
  end;

end;

function SubstringCount(const ASubstr, AStr: string;
  AStringSearchOptions: TStringSearchOptions = [];
  AOffset: Integer = 0;
  AList: TList<Integer> = nil): Integer;
begin
  Result := SubstringScan(ASubstr, AStr, AStringSearchOptions, False,
    AOffset, AList);
end;

function SubstringFirstIndex(const ASubstr, AStr: string;
  AStringSearchOptions: TStringSearchOptions = [];
  AOffset: Integer = 0): Integer;
begin
  Result := SubstringScan(ASubstr, AStr, AStringSearchOptions, True, AOffset)
end;

function StringReplace(const AStr, APattern, ANewText: string;
  AStringSearchOptions: TStringSearchOptions = [];
  AOffset: Integer = 0): string;
var
  indices: TList<Integer>;
  diff, chrcnt: Integer;
  i, y: Integer;
begin

  indices := TList<Integer>.Create;
  try

    if SubstringCount(APattern, AStr, AStringSearchOptions, AOffset, indices) = 0 then
      Exit(AStr);

    diff := ANewText.Length - APattern.Length;

    SetLength(Result, AStr.Length + diff * indices.Count);

    // indices.Count >= 1

    // Initial part, before indices[0]
    y := 1;
    if indices[0] > 1 then
      Move(AStr[1], Result[y], (indices[0] - 1) * sizeof(Char));
    y := indices[0];
    if not ANewText.IsEmpty then
      Move(ANewText[1], Result[y], ANewText.Length * sizeof(Char));
    Inc(y, ANewText.Length);

    // The part between indices[i] and indices[i + 1]
    for i := 0 to indices.Count - 2 do
    begin
      chrcnt := (indices[i + 1] - indices[i] - APattern.Length);
      if chrcnt > 0 then
        Move(AStr[indices[i] + APattern.Length], Result[y], chrcnt * sizeof(Char));
      Inc(y, chrcnt);
      if not ANewText.IsEmpty then
        Move(ANewText[1], Result[y], ANewText.Length * sizeof(Char));
      Inc(y, Length(ANewText));
    end;

    // Final part, after indices[indices.Count - 1]
    chrcnt := AStr.Length - (indices[indices.Count - 1] + APattern.Length) + 1;
    if chrcnt > 0 then
      Move(AStr[indices[indices.Count - 1] + APattern.Length], Result[y],
        chrcnt * sizeof(Char));

  finally
    indices.Free;
  end;

end;

function TextTransformFunc(AChrTransformFunc: TChrTransformFunc): TTextTransformFunc;
begin
  Result := function(const AText: string): string
    var
      i: Integer;
    begin
      SetLength(Result, AText.Length);
      for i := 1 to AText.Length do
        Result[i] := AChrTransformFunc(AText[i]);
    end;
end;

function imod(const x: Integer; const y: Integer): Integer;
begin
  if x >= 0 then
    imod := x - floor(x/y) * y
  else
    imod := x + ceil(-x/y) * y;
end;

function ChrUpperCase(C: Char): Char;
begin
  Result := AnsiUpperCase(C)[1];
end;

function ChrLowerCase(C: Char): Char;
begin
  Result := AnsiLowerCase(C)[1];
end;

function ChrInvertCase(C: Char): Char;
begin
  if C.IsUpper then
    Result := AnsiLowerCase(C)[1]
  else
    Result := AnsiUpperCase(C)[1];
end;

function TxtTitleCase(const AText: string): string;
var
  i: Integer;
  StartOfWord: Boolean;
begin
  StartOfWord := True;
  Result := AText;
  for i := 1 to AText.Length do
  begin
    if StartOfWord and AText[i].IsLetter then
    begin
      Result[i] := AnsiUpperCase(AText[i])[1];
      StartOfWord := False;
    end
    else if AText[i].IsWhiteSpace then
      StartOfWord := True;
  end;
end;

function TxtSentenceCase(const AText: string): string;
var
  StartOfSentence: Boolean;
  i: Integer;
begin
  StartOfSentence := True;
  Result := AText;
  for i := 1 to AText.Length do
    if StartOfSentence and AText[i].IsLetter then
    begin
      Result[i] := AnsiUpperCase(AText[i])[1];
      StartOfSentence := False;
    end
    else if CharInSet(AText[i], ['.', '!', '?']) or (AText[i] = '‽') then
      StartOfSentence := True;
end;

function ChrROT13(C: Char): Char;
begin
  if InRange(ord(C), ord('A'), ord('Z')) then
    Result := Chr(ord('A') + (ord(C) - ord('A') + 13) mod 26)
  else if InRange(ord(C), ord('a'), ord('z')) then
    Result := Chr(ord('a') + (ord(C) - ord('a') + 13) mod 26)
  else
    Result := C;
end;

function ChrCaesar(N: Integer): TChrTransformFunc;
begin
  Result := function(C: Char): Char
    begin
      if InRange(ord(C), ord('A'), ord('Z')) then
        Result := Chr(ord('A') + imod((ord(C) - ord('A') + N), 26))
      else if InRange(ord(C), ord('a'), ord('z')) then
        Result := Chr(ord('a') + imod((ord(C) - ord('a') + N), 26))
      else
        Result := C;
    end;
end;

function TxtVigenère(const AKey: string; ADecode: Boolean = False): TTextTransformFunc;
var
  n: Integer;
  KeyChrs: array of Byte;
  i: Integer;
  factor: Integer;
begin

  n := AKey.Length;

  SetLength(KeyChrs, n);

  for i := 1 to n do
    if InRange(ord(AKey[i]), ord('A'), ord('Z')) then
      KeyChrs[i - 1] := ord(AKey[i]) - ord('A')
    else
      raise Exception.Create('Invalid character in Vigenère key.');

  if n = 0 then
    raise Exception.Create('Vigenère key is empty.');

  factor := IfThen(ADecode, -1, 1);

  Result := function(const AText: string): string
    var
      j: Integer;
    begin
      SetLength(Result, AText.Length);
      for j := 1 to AText.Length do
      begin
        if InRange(ord(AText[j]), ord('A'), ord('Z')) then
          Result[j] := Chr(ord('A') + imod(ord(AText[j]) - ord('A') + factor * KeyChrs[(j - 1) mod n], 26))
        else if InRange(ord(AText[j]), ord('a'), ord('z')) then
          Result[j] := Chr(ord('a') + imod(ord(AText[j]) - ord('a') + factor * KeyChrs[(j - 1) mod n], 26))
        else
          Result[j] := AText[j];
      end;
    end;

end;

function ContainsChar(const AText: string; AChr: Char): Boolean;
var
  i: Integer;
begin
  for i := 1 to AText.Length do
    if AText[i] = AChr then
      Exit(True);
  Result := False;
end;

procedure GetWords(const AText: string; const AOptions: TWordExtractionOptions;
  AWordList: TList<string>);

  function IsWordSep(AChr: Char): Boolean;
  begin
    Result := AChr.IsWhiteSpace or
      (AOptions.MathSeps and (AChr.GetUnicodeCategory = TUnicodeCategory.ucMathSymbol)) or
      ContainsChar(AOptions.WordSeps, AChr);
  end;

  function IsTrimChar(AChr: Char): Boolean;
  begin
    Result := (AOptions.TrimPunct and AChr.IsPunctuation) or
      ContainsChar(AOptions.TrimChrs, AChr);
  end;

  procedure ConsiderCandidate(AStart, AEnd: Integer);
  var
    HasReqChr: Boolean;
    j: Integer;
  begin

    // Preconditions:
    // AText[AStart] exists and is the first character of the proposed word.
    // AText[AEnd] exists and is the last character of the proposed word.
    // AStart <= AEnd

    {$IFDEF DEBUG}
    Assert(InRange(AStart, 1, AText.Length));
    Assert(InRange(AEnd, 1, AText.Length));
    Assert(AStart <= AEnd);
    {$ENDIF}

    while (AStart <= AEnd) and IsTrimChar(AText[AStart]) do
      Inc(AStart);

    while (AEnd >= AStart) and IsTrimChar(AText[AEnd]) do
      Dec(AEnd);

    if AStart > AEnd then
      Exit;

    if AOptions.LetterRequired then
    begin
      HasReqChr := False;
      for j := AStart to AEnd do
        if AText[j].IsLetter then
        begin
          HasReqChr := True;
          Break;
        end;
    end
    else if AOptions.LetterOrDigitRequired then
    begin
      HasReqChr := False;
      for j := AStart to AEnd do
        if AText[j].IsLetterOrDigit then
        begin
          HasReqChr := True;
          Break;
        end;
    end
    else
      HasReqChr := True;

    if not HasReqChr then
      Exit;

    AWordList.Add(Copy(AText, AStart, AEnd - AStart + 1));

  end;

var
  InProposedWord: Boolean;
  Start: Integer;
  i: Integer;

begin

  if AWordList = nil then
    raise Exception.Create('GetWords: word list is nil.');

  InProposedWord := False;
  Start := 1;
  for i := 1 to AText.Length do
    if InProposedWord then
    begin
      if IsWordSep(AText[i]) then
      begin
        InProposedWord := False;
        ConsiderCandidate(Start, i - 1);
      end;
    end
    else
    begin
      if not IsWordSep(AText[i]) then
      begin
        Start := i;
        InProposedWord := True;
      end;
    end;

  if InProposedWord {at EOS} then
    ConsiderCandidate(Start, AText.Length);

end;

function GetWords(const AText: string; const AOptions: TWordExtractionOptions): TArray<string>;
var
  List: TList<string>;
begin
  List := TList<string>.Create;
  try
    GetWords(AText, AOptions, List);
    Result := List.ToArray;
  finally
    List.Free;
  end;
end;

function WordWrap(const AText: string; N: Integer = 80): string;

  function IndexChr(const AChr: Char; AValues: array of Char): Integer;
  var
    i: Integer;
  begin
    for i := 0 to High(AValues) do
      if AChr = AValues[i] then
        Exit(i);
    Result := -1;
  end;

  function ChrAnyOf(const AChr: Char; AValues: array of Char): Boolean;
  begin
    Result := IndexChr(AChr, AValues) <> -1;
  end;

const
  NBSP = #$A0;
  NBHYP = #$2011;

  function IsNbrChr(const AChr: Char): Boolean; inline;
  begin
    Result := ChrAnyOf(AChr, [NBSP, NBHYP]);
  end;

  function IsWordChar(const AChr: Char): Boolean; inline;
  begin
    Result := AChr.IsLetterOrDigit or (IndexChr(AChr, ['''', '’']) <> -1);
  end;

  function CanBreakAfter(AIndex: Integer): Boolean;
  begin
    if AIndex >= AText.Length then
      Exit(True);
    if AIndex < 1 then
      Exit(True);
    if IsWordChar(AText[AIndex]) and (IsWordChar(AText[AIndex + 1]) or (IndexChr(AText[AIndex + 1], ['.', ',', ':', ';', '!', '?', '-', '‐']) <> -1)) then
      Exit(False);
    if IsNbrChr(AText[AIndex]) or IsNbrChr(AText[AIndex + 1]) then
      Exit(False);
    Result := True;
  end;

var
  SB: TStringBuilder;
  p, pp, newpp, p2, c: Integer;
begin

  SB := TStringBuilder.Create(
    AText.Length + Length(sLineBreak) * (AText.Length div Max(N-5, 1)) // Guess capacity
  );
  try

    p := 1;
    pp := 1;
    c := 0;
    while p <= AText.Length do
    begin

      if (AText[p] = #13) or (AText[p] = #10) then // explicit line break
      begin
        p2 := p - 1;
        while (p2 >= pp) and AText[p2].IsWhiteSpace do
          Dec(p2);
        if p2 >= pp then
          SB.Append(AText, pp - 1, p2 - pp + 1);
        SB.AppendLine;
        if (AText[p] = #13) and (p < AText.Length) and (AText[p + 1] = #10) then // CRLF (two chars)
          Inc(p);
        Inc(p);
        pp := p;
        c := 0;
        Continue;
      end;

      if c = N then
      begin
        // Already N chars on the current line; the new character (which isn't a linebreak) won't fit.
        // We need to add a linebreak before this new character.
        p2 := p - 1;
        Assert(p2 >= pp);
        while (p2 >= pp) and not CanBreakAfter(p2) do
          Dec(p2);
        if p2 < pp then // forced to break inside a word
          p2 := p - 1;
        newpp := p2 + 1;
        if AText[newpp].IsWhiteSpace and (newpp < AText.Length) and not AText[newpp + 1].IsWhiteSpace then
          Inc(newpp);
        while (p2 >= pp) and AText[p2].IsWhiteSpace do
          Dec(p2);
        if p2 >= pp then
          SB.Append(AText, pp - 1, p2 - pp + 1);
        SB.AppendLine;
        pp := newpp;
        p := pp;
        c := 0;
        Continue;
      end;

      if p = AText.Length then
        SB.Append(AText, pp - 1, p - pp + 1);

      Inc(p);
      Inc(c);

    end;

    Result := SB.ToString;

  finally
    SB.Free;
  end;

end;

function Pad(const AString: string; ALength: Integer; AAlignment: TAlignment;
  APadding: Char): string;
var
  N, L, R: Integer;
begin

  if AString.Length >= ALength then
    Exit(AString);

  case AAlignment of
    taLeftJustify:
      Result := AString + StringOfChar(APadding, ALength - AString.Length);
    taRightJustify:
      Result := StringOfChar(APadding, ALength - AString.Length) + AString;
    taCenter:
      begin
        N := ALength - AString.Length;
        L := N div 2;
        R := N - L;
        Result := StringOfChar(APadding, L) + AString + StringOfChar(APadding, R);
      end;
  else
    Result := AString;
  end;

end;

function RemoveDuplicates(const AString: string): string;
var
  ActualLength: Integer;
  i: Integer;
  Chars: TDictionary<Char, Pointer>;
begin

  Chars := TDictionary<Char, Pointer>.Create;
  try

    SetLength(Result, AString.Length);
    ActualLength := 0;
    for i := 1 to AString.Length do
      if not Chars.ContainsKey(AString[i]) then
      begin
        Inc(ActualLength);
        Result[ActualLength] := AString[i];
        Chars.Add(AString[i], nil);
      end;

    SetLength(Result, ActualLength);

  finally
    Chars.Free;
  end;

end;

function RemoveAdjacentDuplicates(const AString: string): string;
var
  ActualLength: Integer;
  i: Integer;
begin

  SetLength(Result, AString.Length);
  ActualLength := 0;

  for i := 1 to AString.Length do
    if (i = 1) or (AString[i] <> AString[Pred(i)]) then
    begin
      Inc(ActualLength);
      Result[ActualLength] := AString[i];
    end;

  SetLength(Result, ActualLength);

end;

end.