Here’s my analysis to follow up on @Tamas_Papp’s smart suggestion for doing frequency analysis.
I will preface this by stating that there are a ton of things wrong with the analysis I am doing:
- Only inline comments are removed, not block comments
- Strings aren’t excluded (which is why `|`, `\`, and `'` are pretty high)
- Some symbols overlap, so to fix this I literally just subtract the count of each larger symbol from the count of any smaller symbol it contains.
- Some symbols can be used in function names (`!`)
- `Union` also appears as `UnionAll`, so I used a regexp word boundary
- I then manually remove characters from the list which are undefined (presumably used in comments?) or are just prefix/suffix operators
The “proper” way to do this would be directly on the AST rather than strings.
This is a list of the most common infix operators in `/base` and `/stdlib` (excluding ones with alphabetical characters, like `in` or `isa`), with `Union` inserted for comparison:
| Symbol | Count |
|---|---|
| = | 79141 |
| . | 73990 |
| : | 66847 |
| :: | 38905 |
| - | 16897 |
| ! | 15273 |
| == | 12940 |
| > | 11865 |
| \ | 11684 |
| ' | 9779 |
| * | 9215 |
| + | 7367 |
| & | 7155 |
| && | 6421 |
| \| | 4618 |
| <: | 4188 |
| / | 4147 |
| === | 3496 |
| < | 2719 |
| Union | 2457 |
| % | 2202 |
| -> | 2045 |
| ≈ | 1821 |
| != | 1499 |
| += | 1242 |
| ^ | 1234 |
| <= | 1199 |
| !== | 800 |
| << | 658 |
| >= | 528 |
| // | 401 |
| -= | 358 |
| >> | 337 |
| ~ | 228 |
| ≤ | 191 |
| \|= | 170 |
| *= | 156 |
| ∈ | 140 |
| … | 133 |
| \|> | 112 |
| >>> | 96 |
| ∉ | 88 |
| ≥ | 87 |
| ÷ | 82 |
| &= | 71 |
| ∘ | 61 |
| ⊻ | 59 |
| ≠ | 53 |
| >: | 47 |
| ≡ | 39 |
| ⊻= | 36 |
| >>= | 29 |
| √ | 29 |
| ⊆ | 29 |
| ⊊ | 26 |
| /= | 21 |
| <<= | 19 |
| ∪ | 17 |
| ⊈ | 17 |
| ⋮ | 17 |
| … | 16 |
| >>>= | 14 |
| ⊇ | 14 |
| ⊋ | 13 |
| ⊉ | 12 |
| ∩ | 11 |
| ⊽ | 11 |
| ≉ | 10 |
| ⊼ | 10 |
| ∋ | 9 |
| ∌ | 7 |
| ≢ | 7 |
| ^= | 6 |
I’ll leave the interpretation to the reader…
Here’s my quick-and-dirty script:
(expand)
using DataFrames: DataFrame
# Patterns whose occurrences are counted. Every entry is a literal operator string
# except the first, a regex that matches `Union` only as a whole word (so that
# `UnionAll` is not counted as `Union`).
# NOTE(review): as rendered here a few entries look duplicated (the middle dot and
# the four basic arrows) — they may be distinct code points in the original list;
# confirm before de-duplicating.
symbols = Union{Regex,String}[r"\bUnion\b", "!", "!=", "!==", "::", "\$", "\$=", "%", "%=", "&", "&&", "&=", "'", "*", "*=", "+", "++", "+=", "-", "-->", "-=", "->", ".", "..", "...", ".<|", ".|>", "/", "//", "//=", "/=", ":", ":=", "<", "<--", "<-->", "<:", "<<", "<<=", "<=", "<|", "=", "==", "===", ">", ">:", ">=", ">>", ">>=", ">>>", ">>>=", "\\", "\\=", "^", "^=", "|", "|=", "|>", "||", "~", "¦", "¬", "±", "·", "×", "÷", "÷=", "·", "…", "⁝", "⅋", "←", "↑", "→", "↓", "↔", "↚", "↛", "↜", "↝", "↞", "↠", "↢", "↣", "↤", "↦", "↩", "↪", "↫", "↬", "↮", "↶", "↷", "↺", "↻", "↼", "↽", "⇀", "⇁", "⇄", "⇆", "⇇", "⇉", "⇋", "⇌", "⇍", "⇎", "⇏", "⇐", "⇒", "⇔", "⇚", "⇛", "⇜", "⇝", "⇠", "⇢", "⇴", "⇵", "⇶", "⇷", "⇸", "⇹", "⇺", "⇻", "⇼", "⇽", "⇾", "⇿", "∈", "∉", "∊", "∋", "∌", "∍", "−", "−=", "∓", "∔", "∗", "∘", "∙", "√", "∛", "∜", "∝", "∤", "∥", "∦", "∧", "∨", "∩", "∪", "∷", "∸", "∺", "∻", "∽", "∾", "≀", "≁", "≂", "≃", "≄", "≅", "≆", "≇", "≈", "≉", "≊", "≋", "≌", "≍", "≎", "≏", "≐", "≑", "≒", "≓", "≔", "≕", "≖", "≗", "≘", "≙", "≚", "≛", "≜", "≝", "≞", "≟", "≠", "≡", "≢", "≣", "≤", "≥", "≦", "≧", "≨", "≩", "≪", "≫", "≬", "≭", "≮", "≯", "≰", "≱", "≲", "≳", "≴", "≵", "≶", "≷", "≸", "≹", "≺", "≻", "≼", "≽", "≾", "≿", "⊀", "⊁", "⊂", "⊃", "⊄", "⊅", "⊆", "⊇", "⊈", "⊉", "⊊", "⊋", "⊍", "⊎", "⊏", "⊐", "⊑", "⊒", "⊓", "⊔", "⊕", "⊖", "⊗", "⊘", "⊙", "⊚", "⊛", "⊜", "⊞", "⊟", "⊠", "⊡", "⊩", "⊬", "⊮", "⊰", "⊱", "⊲", "⊳", "⊴", "⊵", "⊶", "⊷", "⊻", "⊻=", "⊼", "⊽", "⋄", "⋅", "⋆", "⋇", "⋉", "⋊", "⋋", "⋌", "⋍", "⋎", "⋏", "⋐", "⋑", "⋒", "⋓", "⋕", "⋖", "⋗", "⋘", "⋙", "⋚", "⋛", "⋮", "⋯", "⋰", "⋱", "⌿", "▷", "⟇", "⟑", "⟕", "⟖", "⟗", "⟰", "⟱", "⟵", "⟶", "⟷", "⟹", "⟺", "⟻", "⟼", "⟽", "⟾", "⟿", "⤀", "⤁", "⤂", "⤃", "⤄", "⤅", "⤆", "⤇", "⤈", "⤉", "⤊", "⤋", "⤌", "⤍", "⤎", "⤏", "⤐", "⤑", "⤒", "⤓", "⤔", "⤕", "⤖", "⤗", "⤘", "⤝", "⤞", "⤟", "⤠", "⥄", "⥅", "⥆", "⥇", "⥈", "⥉", "⥊", "⥋", "⥌", "⥍", "⥎", "⥏", "⥐", "⥑", "⥒", "⥓", "⥔", "⥕", "⥖", "⥗", "⥘", "⥙", "⥚", "⥛", "⥜", "⥝", "⥞", "⥟", "⥠", "⥡", "⥢", "⥣", "⥤", "⥥", "⥦", 
"⥧", "⥨", "⥩", "⥪", "⥫", "⥬", "⥭", "⥮", "⥯", "⥰", "⥷", "⥺", "⦸", "⦼", "⦾", "⦿", "⧴", "⧶", "⧷", "⧺", "⧻", "⨇", "⨈", "⨝", "⨟", "⨢", "⨣", "⨤", "⨥", "⨦", "⨧", "⨨", "⨩", "⨪", "⨫", "⨬", "⨭", "⨮", "⨰", "⨱", "⨲", "⨳", "⨴", "⨵", "⨶", "⨷", "⨸", "⨹", "⨺", "⨻", "⨼", "⨽", "⩀", "⩁", "⩂", "⩃", "⩄", "⩅", "⩊", "⩋", "⩌", "⩍", "⩎", "⩏", "⩐", "⩑", "⩒", "⩓", "⩔", "⩕", "⩖", "⩗", "⩘", "⩚", "⩛", "⩜", "⩝", "⩞", "⩟", "⩠", "⩡", "⩢", "⩣", "⩴", "⫛", "⬰", "⬱", "⬲", "⬳", "⬴", "⬵", "⬶", "⬷", "⬸", "⬹", "⬺", "⬻", "⬼", "⬽", "⬾", "⬿", "⭀", "⭁", "⭂", "⭃", "⭄", "⭇", "⭈", "⭉", "⭊", "⭋", "⭌", "←", "↑", "→", "↓"]
# Recursively collect the path of every `.jl` file under the `base` and `stdlib`
# directories (resolved against the current working directory), de-duplicated and
# in traversal order. The result is a `Vector{String}`.
files = let found = String[]
    for dir in ("base", "stdlib")
        for (root, _, names) in walkdir(dir)
            for name in names
                # Only Julia sources are of interest; test the extension before
                # building the full path so non-matching entries cost nothing.
                endswith(name, ".jl") && push!(found, joinpath(root, name))
            end
        end
    end
    unique!(found)
end
# Build one big string holding the text of every path in `files`, with everything
# from the first `#` on each line to the end of that line deleted.
#
# Caveats visible from the pattern used: a `#` inside a string literal is treated
# like a comment start, and only the opening line of a `#= ... =#` block is cut.
code = let buf = IOBuffer()
    for file in files
        lines = split(read(file, String), '\n')
        stripped = map(line -> replace(line, r"#.*$" => ""), lines)
        # Append into a buffer instead of `str *= chunk`, which would re-copy the
        # whole accumulated text for every file. Files are written back-to-back
        # with no extra separator between them.
        join(buf, stripped, '\n')
    end
    String(take!(buf))
end
# Tally how many times each entry of `symbols` occurs in `code`, then apply a rough
# correction for symbols that are contained in longer ones.
counts = let tally = Dict{Union{String,Regex},Int}(s => count(s, code) for s in symbols)
    # Overlap correction: for every ordered pair (small, large) where `small` occurs
    # inside the literal string `large`, take `large`'s tally away from `small`'s.
    # Regex entries are never used as the containing symbol.
    # NOTE(review): `tally[large]` may already have been reduced by an earlier pass
    # of the outer loop, so the outcome depends on the order of `symbols`.
    for small in symbols, large in symbols
        (large isa Regex || small == large) && continue
        occursin(small, large) && (tally[small] -= tally[large])
    end
    tally
end
# Two-column table (`symbols`, `counts`) ordered from most to least frequent.
df = sort!(
    DataFrame(symbols=symbols, counts=map(s -> counts[s], symbols)),
    :counts;
    rev=true,
)
# Write the rows of `df` to `counts.txt` as a Markdown table, one row per symbol.
open("counts.txt", "w") do io
    println(io, "| Symbol | Count |")
    println(io, "|--------|-------|")
    for row in eachrow(df)
        # A bare `|` inside a cell is read as a column separator and breaks the
        # row (this affects `|`, `|=`, `|>`, `||`, ...), so escape it.
        label = replace(string(row[:symbols]), "|" => "\\|")
        println(io, "| $(label) | $(row[:counts]) |")
    end
end
I basically just took the julia-parser.scm code and did a search for those symbols, then removed overlaps. Then cleaned up as described above.