Character n-grams in array

I’m trying to get the character n-grams for a list of words. For example:

julia> s = "apple"
julia> s=reduce(vcat, permutedims.(collect.(s)))
['a', 'p', 'p', 'l', 'e']
julia> function ngram(s,n)
           # Slide a window of n elements over s and concatenate the printed form of
           # every window into a single String.
           windows = (view(s, lo:lo+n-1) for lo in 1:length(s)-n+1)
           return join(windows)
       end
julia> ngram(s,2)
"['a','p'],['p','p'],['p','l'],['l','e']"

If I have a list, let’s say:

julia> s = ["apple"
"orange"
"pear"
"honeycrisp apple"
];

# split_char here is equivalent to the second statement in the example above
julia> function split_char(s)
    # Broadcast `collect` over `s`: each element of `s` (e.g. each word) becomes a
    # vector of its characters. Returns the collection of character vectors.
    s = collect.(s)
    s
end
julia> split_char(s) 
[['a', 'p', 'p', 'l', 'e'], ['o', 'r', 'a', 'n', 'g', 'e'], ['p', 'e', 'a', 'r'], ['h', 'o', 'n', 'e', 'y', 'c', 'r', 'i', 's', 'p', ' ', 'a', 'p', 'p', 'l', 'e']]

But when I pass the result through this n-gram function, nothing really happens:

julia> function ngram(s,n)
    # Builds one view per value of i, for i from 1 to length(s)-n+1.
    # NOTE(review): the same index i selects the element of s (s[i]) and is also the
    # start of the window (i:i+n-1), and the loop bound uses length(s) rather than
    # length(s[i]). With a vector of words this yields at most one window per word
    # instead of every window of every word.
    s= [view(s[i],i:i+n-1) for i=1:length(s)-n+1]
end

My goal is to get this as a result:

"['a','p'],['p','p'],['p','l'],['l','e'],['o','r'],['r','a'],['a','n'],['n','g'],['g','e'], ..."

See this discussion: nlp - Generate ngrams with Julia - Stack Overflow

1 Like

Hey, thanks for the link! I’m actually using the code in the last comment of this thread (also found here).

My goal is to get separate character-level n-grams for each word in a list (i.e. if I have an array of 4 words, then I’m trying to get a 4-element result of character-level n-grams).

julia> function split_char(s)
           # Apply `collect` to `s` via broadcasting, so each word turns into a vector
           # of its characters.
           s = broadcast(collect, s)
           return s
       end
split_char (generic function with 1 method)

julia> function ngram(s,n)
           # All length-n windows over one word, as views into that word's characters.
           windows(word) = [view(word, lo:lo+n-1) for lo = 1:length(word)-n+1]
           # One vector of windows per entry of s, in the same order as s.
           return [windows(s[idx]) for idx = 1:length(s)]
       end
ngram (generic function with 1 method)

julia> s = ["apple"
       "orange"
       "pear"
       "honeycrisp apple"
       ];

julia> ss=split_char(s)
4-element Array{Array{Char,1},1}:
 ['a', 'p', 'p', 'l', 'e']
 ['o', 'r', 'a', 'n', 'g', 'e']
 ['p', 'e', 'a', 'r']
 ['h', 'o', 'n', 'e', 'y', 'c', 'r', 'i', 's', 'p', ' ', 'a', 'p', 'p', 'l', 'e']

julia> ngrams = ngram(ss,2)
4-element Array{Array{SubArray{Char,1,Array{Char,1},Tuple{UnitRange{Int64}},true},1},1}:
 [['a', 'p'], ['p', 'p'], ['p', 'l'], ['l', 'e']]
 [['o', 'r'], ['r', 'a'], ['a', 'n'], ['n', 'g'], ['g', 'e']]
 [['p', 'e'], ['e', 'a'], ['a', 'r']]
 [['h', 'o'], ['o', 'n'], ['n', 'e'], ['e', 'y'], ['y', 'c'], ['c', 'r'], ['r', 'i'], ['i', 's'], ['s', 'p'], ['p', ' '], [' ', 'a'], ['a', 'p'], ['p', 'p'], ['p', 'l'], ['l', 'e']]

1 Like

Is there some reason for the `s=`?

No, just copy&paste from OP :slight_smile:

This is not what you asked for, but I would still like to showcase the loop fusion and substring features.

julia> function ngram(s::AbstractString, n)
           # Return every window of n consecutive characters of s as a SubString, in
           # order of position. The result is empty when s has fewer than n characters.
           # Throws ArgumentError when n < 1.
           n ≥ 1 || throw(ArgumentError("n must be at least 1, got $n"))
           # String indices are byte offsets, so `i:i+n-1` is only valid for ASCII text.
           # Walk the valid character indices instead; only the first length(s)-n+1 of
           # them can start a complete window.
           starts = Iterators.take(eachindex(s), max(length(s) - n + 1, 0))
           # nextind(s, i, n - 1) is the index of the last character of the window.
           return [SubString(s, i, nextind(s, i, n - 1)) for i in starts]
       end
ngram (generic function with 1 method)

julia> s = ["apple"
       "orange"
       "pear"
       "honeycrisp apple"
       ];

julia> ngram.(s,2)
4-element Array{Array{SubString{String},1},1}:
 ["ap", "pp", "pl", "le"]
 ["or", "ra", "an", "ng", "ge"]
 ["pe", "ea", "ar"]
 ["ho", "on", "ne", "ey", "yc", "cr", "ri", "is", "sp", "p ", " a", "ap", "pp", "pl", "le"]

julia> ngram.(s,3)
4-element Array{Array{SubString{String},1},1}:
 ["app", "ppl", "ple"]
 ["ora", "ran", "ang", "nge"]
 ["pea", "ear"]
 ["hon", "one", "ney", "eyc", "ycr", "cri", "ris", "isp", "sp ", "p a", " ap", "app", "ppl", "ple"]

And if you really want arrays of characters:

julia> function split_char(s)
           # Broadcasting treats a single string as a scalar, so this works both on one
           # string (giving its characters) and on a collection of strings.
           s = broadcast(collect, s)
           return s
       end
split_char (generic function with 1 method)

julia> bigrams = ngram.(s,2)
4-element Array{Array{SubString{String},1},1}:
 ["ap", "pp", "pl", "le"]
 ["or", "ra", "an", "ng", "ge"]
 ["pe", "ea", "ar"]
 ["ho", "on", "ne", "ey", "yc", "cr", "ri", "is", "sp", "p ", " a", "ap", "pp", "pl", "le"]

julia> split_char.(bigrams)
4-element Array{Array{Array{Char,1},1},1}:
 [['a', 'p'], ['p', 'p'], ['p', 'l'], ['l', 'e']]
 [['o', 'r'], ['r', 'a'], ['a', 'n'], ['n', 'g'], ['g', 'e']]
 [['p', 'e'], ['e', 'a'], ['a', 'r']]
 [['h', 'o'], ['o', 'n'], ['n', 'e'], ['e', 'y'], ['y', 'c'], ['c', 'r'], ['r', 'i'], ['i', 's'], ['s', 'p'], ['p', ' '], [' ', 'a'], ['a', 'p'], ['p', 'p'], ['p', 'l'], ['l', 'e']]
3 Likes