Skip to content

Instantly share code, notes, and snippets.

@mjf
Created November 20, 2025 13:08
Show Gist options
  • Select an option

  • Save mjf/396d71dda5f968022cf4ad6b317bc086 to your computer and use it in GitHub Desktop.

Select an option

Save mjf/396d71dda5f968022cf4ad6b317bc086 to your computer and use it in GitHub Desktop.
Word clustering for SilverBullet implemented in Space Lua

#meta

simfn Class

Description

Implements the most common similarity algorithms:

  • Common Prefix Length,
  • Common Suffix Length,
  • Levenshtein Distance,
  • Hamming Distance,
  • Longest Common Subsequence,
  • Cosine Similarity,
  • Camberra Distance and
  • Jaro-Winkler Distance.

Implementation

simfn         = simfn or {}
simfn.__index = simfn

-- Common Prefix Length
function simfn.cpl(s1, s2)
  local l1, l2 = #s1, #s2
  local lmax = l1 < l2 and l1 or l2

  for i = 1, lmax do
    if s1:byte(i) ~= s2:byte(i) then
      return i - 1
    end
  end

  return lmax
end

-- Common Suffix Length
function simfn.csl(s1, s2)
  local l1, l2 = #s1, #s2
  local lmax = l1 < l2 and l1 or l2

  for i = 1, lmax do
    if s1:byte(l1 - i + 1) ~= s2:byte(l2 - i + 1) then
      return i - 1
    end
  end

  return lmax
end

-- Levenshtein Distance
function simfn.ld(s1, s2)
  local l1, l2 = #s1, #s2

  if l1 == 0 then return l2 end
  if l2 == 0 then return l1 end

  local m = {} -- matrix

  for i = 0, l1 do
    m[i]    = {}
    m[i][0] = i
  end

  for j = 0, l2 do
    m[0][j] = j
  end

  for i = 1, l1 do
    for j = 1, l2 do
      local cost = s1:byte(i) == s2:byte(j) and 0 or 1

      m[i][j] = math.min(
        m[i - 1][j    ] + 1,
        m[i    ][j - 1] + 1,
        m[i - 1][j - 1] + cost)
    end
  end

  return m[l1][l2]
end

-- Hamming Distance
function simfn.hd(s1, s2)
  local l1, l2 = #s1, #s2

  if l1 ~= l2 then
    return 1/0 -- positive infinity (math.huge)
  end

  local dist = 0

  for i = 1, l1 do
    if s1:byte(i) ~= s2:byte(i) then
      dist = dist + 1
    end
  end

  return dist
end

-- Longest Common Subsequence
function simfn.lcs(s1, s2)
  local l1, l2 = #s1, #s2
  local m      = {} -- matrix

  for i = 0, l1 do
    m[i]    = {}
    m[i][0] = 0
  end

  for j = 0, l2 do
    m[0][j] = 0
  end

  for i = 1, l1 do
    for j = 1, l2 do
      if s1:byte(i) == s2:byte(j) then
        m[i][j] = m[i - 1][j - 1] + 1
      else
        m[i][j] = math.max(m[i - 1][j], m[i][j - 1])
      end
    end
  end

  return m[l1][l2]
end

-- Cosine Similarity
function simfn.cs(s1, s2)
  local function cf(s)
    local f = {}

    for i = 1, #s do
      local c = s:byte(i)

      f[c] = (f[c] or 0) + 1
    end

    return f
  end

  local f1 = cf(s1)
  local f2 = cf(s2)
  local cs = {}

  for c in pairs(f1) do
    cs[c] = true
  end

  for c in pairs(f2) do
    cs[c] = true
  end

  local dp, nm1, nm2 = 0, 0, 0

  for c in pairs(cs) do
    local f1 = f1[c] or 0
    local f2 = f2[c] or 0

    dp  = dp  + f1*f2
    nm1 = nm1 + f1*f1
    nm2 = nm2 + f2*f2
  end

  local mag = math.sqrt(nm1)*math.sqrt(nm2)

  if mag == 0 then
    return 0
  end

  return dp/mag
end

-- Camberra Distance
function simfn.cd(s1, s2)
  local function cf(s)
    local f = {}

    for i = 1, #s do
      local c = s:byte(i)

      f[c] = (f[c] or 0) + 1
    end

    return f
  end

  local f1 = cf(s1)
  local f2 = cf(s2)
  local cs = {}

  for c in pairs(f1) do
    cs[c] = true
  end

  for c in pairs(f2) do
    cs[c] = true
  end

  local sum = 0

  for c in pairs(cs) do
    local fr1 = f1[c] or 0
    local fr2 = f2[c] or 0
    local num = math.abs(fr1 - fr2)
    local den = fr1 + fr2

    if den > 0 then
      sum = sum + num/den
    end
  end

  return sum
end

-- Jaro-Winkler Distance
function simfn.jwd(s1, s2)
  local l1, l2 = #s1, #s2

  if l1 == 0 and l2 == 0 then
    return 1.0
  end

  if l1 == 0 or l2 == 0 then
    return 0.0
  end

  local md = math.max(l1, l2)//2 - 1

  if md < 0 then
    md = 0
  end

  local s1m = {}
  local s2m = {}
  local mc  = 0

  for i = 1, l1 do
    local start  = math.max(1, i - md)
    local finish = math.min(i + md, l2)

    for j = start, finish do
      if not s2m[j] and s1:byte(i) == s2:byte(j) then
        s1m[i] = true
        s2m[j] = true
        mc     = mc + 1
        break
      end
    end
  end

  if mc == 0 then
    return 0.0
  end

  local tps = 0
  local k = 1

  for i = 1, l1 do
    if s1m[i] then
      while not s2m[k] do
        k = k + 1
      end

      if s1:byte(i) ~= s2:byte(k) then
        tps = tps + 1
      end

      k = k + 1
    end
  end

  local jaro = (mc/l1 + mc/l2 + (mc - tps/2)/mc)/3

  if jaro < 0.7 then
    return jaro
  end

  local plen = 0

  for i = 1, math.min(l1, l2, 4) do
    if s1:byte(i) == s2:byte(i) then
      plen = plen + 1
    else
      break
    end
  end

  return jaro + 0.1*plen*(1 - jaro)
end

words Class

Description

  • Implements words clustering.
  • Intended to be used together with the simfn class methods.

Implementation

words         = words or {}
words.__index = words

local function cluster_by_sim(words, thresh, sim_fn, cmp_fn)
  local n  = #words
  local u  = {}
  local gs = {}
  local gc = 0

  cmp_fn = cmp_fn or function(sim, thresh)
    return sim >= thresh
  end

  for i = 1, n do
    if not u[i] then
      local g   = {words[i]}
      local gsz = 1

      for j = i + 1, n do
        if not u[j] then
          local sim = sim_fn(words[i], words[j])

          if cmp_fn(sim, thresh) then
            gsz    = gsz + 1
            g[gsz] = words[j]
            u[j]   = true
          end
        end
      end

      if gsz > 1 then
        gc     = gc + 1
        gs[gc] = g
        u[i]   = true
      end
    end
  end

  return gs
end

function words.cluster_by_pfx(words, min_len)
  return cluster_by_sim(words, min_len, simfn.cpl)
end

function words.cluster_by_sfx(words, min_len)
  return cluster_by_sim(words, min_len, simfn.csl)
end

function words.cluster_by_ld(words, max_dist)
  return cluster_by_sim(words, max_dist, simfn.ld,
    function(dist, max)
      return dist <= max
    end)
end

function words.cluster_by_hd(words, max_dist)
  return cluster_by_sim(words, max_dist, simfn.hd,
    function(dist, max)
      return dist <= max
    end)
end

function words.cluster_by_lcs(words, min_len)
  return cluster_by_sim(words, min_len, simfn.lcs)
end

function words.cluster_by_cs(words, min_sim)
  return cluster_by_sim(words, min_sim, simfn.cs)
end

function words.cluster_by_cd(words, max_dist)
  return cluster_by_sim(words, max_dist, simfn.cd,
    function(dist, max)
      return dist <= max
    end)
end

function words.cluster_by_jwd(words, min_sim)
  return cluster_by_sim(words, min_sim, simfn.jwd)
end

-- Custom classification
function words.cluster_by_cls(words, classes)
  classes = classes or {}

  local n  = #words
  local u  = {}
  local cl = {}

  for cn, patterns in pairs(classes) do
    for i = 1, n do
      if not u[i] then
        local word = words[i]

        for j = 1, #patterns do
          if string.match(word, patterns[j]) then
            cl[#cl + 1] = { class = cn, word = word }
            u[i] = true
            break
          end
        end
      end
    end
  end

  local gs = {}
  local max_per_class = {}

  for i = 1, #cl do
    local cn = cl[i].class

    max_per_class[cn] = (max_per_class[cn] or 0) + 1
  end

  for i = 1, #cl do
    local cn = cl[i].class
    local word = cl[i].word

    for j = 1, max_per_class[cn] do
      gs[j] = gs[j] or {}
      if not gs[j][cn] then
        gs[j][cn] = word
        break
      end
    end
  end

  return gs
end

Examples

Cluster Tags by Prefix of Three Letters

${words.cluster_by_pfx(query[[
  from
    index.tag 'tag'
  select
    '#' .. name
  ]],
  3
)}

Result:

${words.cluster_by_pfx(query[[ from index.tag 'tag' select '#' .. name ]], 3 )}

Cluster Tags by Jaro-Winkler Distance of 0.8

${words.cluster_by_jwd(query[[
  from
    index.tag 'tag'
  select
    '#' .. name
  ]],
  0.8
)}

Result:

${words.cluster_by_jwd(query[[ from index.tag 'tag' select '#' .. name ]], 0.8 )}

Cluster Tags by Custom Classes

${words.cluster_by_cls(query[[
  from
    index.tag 'tag'
  select
    '#' .. name
  ]],
  -- array of arrays of regular expressions
  {
    {'meta/.*'},
    {'TODO', 'BUG', 'PR[0-9]*', 'ISSUE[0-9]*'},
    {'home', 'personal', 'family'}
  }
)}

Result:

${words.cluster_by_cls(query[[ from index.tag 'tag' select '#' .. name ]], -- array of arrays of regular expressions { {'meta/.'}, {'TODO', 'BUG', 'PR[0-9]', 'ISSUE[0-9]*'}, {'home', 'personal', 'family'} } )}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment