# striprtf(text) function (translated from Python)
# translated from https://stackoverflow.com/questions/188545/regular-expression-for-extracting-text-from-an-rtf-string
# and https://stackoverflow.com/questions/44580580/how-to-convert-rtf-string-to-plain-text-in-python-using-any-library
# The `let` block keeps the tokenizer regex and the two lookup tables private to
# `striprtf`; they are built once and captured by the function defined below.
#   pattern      - tokenizer. Capture groups, in order: control word, its numeric
#                  argument, the two hex digits of \'xx, a control symbol (backslash
#                  followed by a non-letter), a brace, and any other single character.
#                  Runs of CR/LF match without a capture and are therefore dropped.
#   destinations - control words that mark the current group as not to be emitted.
#   specialchars - control words that translate directly to output text.
let pattern = r"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)"i,
    destinations = Set{String}([
        "aftncn","aftnsep","aftnsepc","annotation","atnauthor","atndate","atnicn","atnid",
        "atnparent","atnref","atntime","atrfend","atrfstart","author","background",
        "bkmkend","bkmkstart","blipuid","buptim","category","colorschememapping",
        "colortbl","comment","company","creatim","datafield","datastore","defchp","defpap",
        "do","doccomm","docvar","dptxbxtext","ebcend","ebcstart","factoidname","falt",
        "fchars","ffdeftext","ffentrymcr","ffexitmcr","ffformat","ffhelptext","ffl",
        "ffname","ffstattext","field","file","filetbl","fldinst","fldrslt","fldtype",
        "fname","fontemb","fontfile","fonttbl","footer","footerf","footerl","footerr",
        "footnote","formfield","ftncn","ftnsep","ftnsepc","g","generator","gridtbl",
        "header","headerf","headerl","headerr","hl","hlfr","hlinkbase","hlloc","hlsrc",
        "hsv","htmltag","info","keycode","keywords","latentstyles","lchars","levelnumbers",
        "leveltext","lfolevel","linkval","list","listlevel","listname","listoverride",
        "listoverridetable","listpicture","liststylename","listtable","listtext",
        "lsdlockedexcept","macc","maccPr","mailmerge","maln","malnScr","manager","margPr",
        "mbar","mbarPr","mbaseJc","mbegChr","mborderBox","mborderBoxPr","mbox","mboxPr",
        "mchr","mcount","mctrlPr","md","mdeg","mdegHide","mden","mdiff","mdPr","me",
        "mendChr","meqArr","meqArrPr","mf","mfName","mfPr","mfunc","mfuncPr","mgroupChr",
        "mgroupChrPr","mgrow","mhideBot","mhideLeft","mhideRight","mhideTop","mhtmltag",
        "mlim","mlimloc","mlimlow","mlimlowPr","mlimupp","mlimuppPr","mm","mmaddfieldname",
        "mmath","mmathPict","mmathPr","mmaxdist","mmc","mmcJc","mmconnectstr",
        "mmconnectstrdata","mmcPr","mmcs","mmdatasource","mmheadersource","mmmailsubject",
        "mmodso","mmodsofilter","mmodsofldmpdata","mmodsomappedname","mmodsoname",
        "mmodsorecipdata","mmodsosort","mmodsosrc","mmodsotable","mmodsoudl",
        "mmodsoudldata","mmodsouniquetag","mmPr","mmquery","mmr","mnary","mnaryPr",
        "mnoBreak","mnum","mobjDist","moMath","moMathPara","moMathParaPr","mopEmu",
        "mphant","mphantPr","mplcHide","mpos","mr","mrad","mradPr","mrPr","msepChr",
        "mshow","mshp","msPre","msPrePr","msSub","msSubPr","msSubSup","msSubSupPr","msSup",
        "msSupPr","mstrikeBLTR","mstrikeH","mstrikeTLBR","mstrikeV","msub","msubHide",
        "msup","msupHide","mtransp","mtype","mvertJc","mvfmf","mvfml","mvtof","mvtol",
        "mzeroAsc","mzeroDesc","mzeroWid","nesttableprops","nextfile","nonesttables",
        "objalias","objclass","objdata","object","objname","objsect","objtime","oldcprops",
        "oldpprops","oldsprops","oldtprops","oleclsid","operator","panose","password",
        "passwordhash","pgp","pgptbl","picprop","pict","pn","pnseclvl","pntext","pntxta",
        "pntxtb","printim","private","propname","protend","protstart","protusertbl","pxe",
        "result","revtbl","revtim","rsidtbl","rxe","shp","shpgrp","shpinst",
        "shppict","shprslt","shptxt","sn","sp","staticval","stylesheet","subject","sv",
        "svb","tc","template","themedata","title","txe","ud","upr","userprops",
        "wgrffmtfilter","windowcaption","writereservation","writereservhash","xe","xform",
        "xmlattrname","xmlattrvalue","xmlclose","xmlname","xmlnstbl",
        "xmlopen" ]),
    specialchars = Dict{String,String}(
        "par" => "\n",
        "sect" => "\n\n",
        "page" => "\n\n",
        "line" => "\n",
        "tab" => "\t",
        "emdash" => "\u2014",
        "endash" => "\u2013",
        "emspace" => "\u2003",
        "enspace" => "\u2002",
        "qmspace" => "\u2005",
        "bullet" => "\u2022",
        "lquote" => "\u2018",
        "rquote" => "\u2019",
        "ldblquote" => "\u201C",
        "rdblquote" => "\u201D")

    global striprtf

    # striprtf(text::AbstractString) -> String
    #
    # Return the plain text contained in the RTF markup `text`.
    #
    # The input is tokenized with `pattern` and processed left to right:
    #   * `{` / `}` save / restore the per-group state (`ucskip`, `ignorable`);
    #     a `}` with no matching `{` is ignored.
    #   * a control word listed in `destinations`, or the control symbol `\*`,
    #     marks the current group as ignorable so nothing inside it is emitted.
    #   * control words listed in `specialchars` emit their replacement text.
    #   * `\ucN` sets how many fallback characters follow each `\uN`;
    #     `\uN` emits code point N (negative N is offset by 65536) and two
    #     consecutive `\u` values forming a UTF-16 surrogate pair are merged into
    #     one character. `\u` / `\uc` without a numeric argument are ignored.
    #   * `\'xx` emits the character whose code point is the hex value xx.
    #   * `\~` emits a no-break space; `\{`, `\}` and `\\` emit the literal character.
    #   * every other control word or symbol is dropped; other characters are copied.
    function striprtf(text::AbstractString)
        stack = Tuple{Int,Bool}[]   # saved (ucskip, ignorable) for each open group
        ignorable = false           # whether this group (and all inside it) is skipped
        ucskip = 1                  # fallback characters to skip after a \u character
        curskip = 0                 # fallback characters still left to skip
        highsurrogate = 0           # pending UTF-16 high surrogate from \u, 0 if none
        out = IOBuffer()
        for m in eachmatch(pattern, text)
            word, arg, hex, char, brace, tchar = m.captures
            if brace !== nothing
                curskip = 0
                if brace == "{"
                    push!(stack, (ucskip, ignorable))
                elseif !isempty(stack)
                    # Tolerate a stray closing brace instead of failing in pop!.
                    ucskip, ignorable = pop!(stack)
                end
            elseif char !== nothing     # control symbol: backslash + non-letter
                curskip = 0
                ch = only(char)
                if ch == '~'
                    ignorable || print(out, '\u00a0')
                elseif ch in "{}\\"
                    ignorable || print(out, ch)
                elseif ch == '*'
                    ignorable = true
                end
            elseif word !== nothing     # control word: \foo or \fooN
                curskip = 0
                if word in destinations
                    ignorable = true
                elseif !ignorable
                    if haskey(specialchars, word)
                        print(out, specialchars[word])
                    elseif word == "uc"
                        if arg !== nothing
                            ucskip = parse(Int, arg)
                        end
                    elseif word == "u"
                        if arg !== nothing
                            code = parse(Int, arg)
                            # Values are written as signed 16-bit numbers.
                            code < 0 && (code += 65536)
                            if 0xD800 <= code <= 0xDBFF
                                # High surrogate: hold it until its partner arrives.
                                highsurrogate = code
                            elseif 0xDC00 <= code <= 0xDFFF && highsurrogate != 0
                                # 55296 == 0xD800, 56320 == 0xDC00
                                combined = 65536 + ((highsurrogate - 55296) << 10) +
                                           (code - 56320)
                                print(out, Char(combined))
                                highsurrogate = 0
                            else
                                highsurrogate = 0
                                print(out, Char(code))
                            end
                            curskip = ucskip
                        end
                    end
                end
            elseif hex !== nothing      # \'xx
                if curskip > 0
                    curskip -= 1
                elseif !ignorable
                    print(out, Char(parse(Int, hex, base=16)))
                end
            elseif tchar !== nothing    # ordinary character
                if curskip > 0
                    curskip -= 1
                elseif !ignorable
                    print(out, tchar)
                end
            end
        end
        return String(take!(out))
    end
end