# collatz grammar compression visualization

this is an attempt to visualize the results of a grammar compression of the first 6K collatz trajectories. not fully sure what it means yet, have to dig into it further. it seems to indicate that there are long “recursively repetitive” strings of up/down sequences that dont deviate very much (ie mean returning). it also seems to exhibit some scale-invariance. probably I need to be working with a large graph visualization software, has anyone heard of any? the decomposition is in the form of a fair-sized DAG. gotta figure out what that guy used in his masters thesis on SAT clause graph analysis lying around here somewhere….

```def substr(l)

c = l.size

l2 = []

c.times { |i| l2 << l[i..-1] }
l2.sort!

p = []

(c - 1).times \
{
|i|

n = 0
while (l2[i][n] == l2[i + 1][n] && n < l2[i].size && n < l2[i + 1].size)
n += 1
end
p[i] = n
}

a = (0...p.size).to_a.map { |i| [p[i], i] }

a = a.sort.reverse

sz = 0
m2 = []
sz2 = 0

(1...l.size).each \
{
|i|

x = a[i-1][1]
next if (l2[x].size < sz)
mx = nil
m = []
sz2 = sz
(x - 1).downto(0).each \
{
|j|
mx = mx.nil? ? p[j] : [p[j], mx].min

break if (mx == 0 || sz > mx)

d = (l2[x].size - l2[j].size).abs
next if (d < sz)

w = [mx, d].min
next if (sz > w)

if (m.empty? || w > sz) then
m = [c - l2[x].size]
f = true
else
next if (m.find { |z| (c - l2[j].size - z).abs < sz })
end
m << c - l2[j].size
#            p(['***', w, m])
sz = w
}
sz2, m2 = [[sz, m], [sz2, m2]].max_by { |x| [x[0], x[1].size] }
}

return [sz2, m2.sort]

end

def compress(l)

sz, m = substr(l)

#    p([sz, m])
#    exit

return l if (sz <= 1)

l2 = compress(l[m.first, sz])

s = \$s.dup

\$x[s] = l2

if (\$x2.member?(l2)) then
s = \$x2[l2]
else
\$x2[l2] = s
\$n += 1
\$s.succ!
end

l2 = l.dup

m.size.times \
{
|i|
l2[m[i], sz] = s
m = m.map { |j| j - sz + 1 }
}

return compress(l2)
end

def expand(l, l2, x)
l1 = l2.dup

i = 0
while (i < l1.size)

if (!/^[a-z]+\$/.match(l1[i]))
i += 1
next
end

l1[i, 1] = x[l1[i]]
end
raise if (l1 != l)
end

def compress2(l)
\$s = 'a'
\$x = {}
\$x2 = {}
\$n = l.max + 1
l = l.map { |x| x.to_s }
l2 = compress(l)

expand(l, l2, \$x)
return l2
end

def f(n)
l = [n]
c = 0
n2 = n
while (n2 >= n)
n2 = n2 * 3 + 1
n2 >>= 1 while (n2.even?)
l << n2
c += 1
end
return l
end

def f2(n)
l = []
c = 0
n2 = n
while (n2 > 1)
n2 = (n2 * 3 + 1) / 2
l << n2 % 2
n2 >>= 1 while (n2.even?)
c += 1
end
return l
end

def out()

l2 = []
a = {}
b = {}
\$x.each \
{
|x, y|
s = !y.select { |z| /^[a-z]+\$/.match(z) }.empty?

l = []
y.each \
{
|z, i|
z2 = [z]
if (/^[a-z]+\$/.match(z)) then
b[z] = 0 if (!b.member?(z))
b[z] += 1
z2 = a[z]
end
l.concat(z2)
}
a[x] = l
l2 << l
#		p([x, s, y, l])
}
#	p(b)

y = 0
l2.sort_by { |x| [x.size, x] }.each \
{
|z|
x = 0
y2 = y
z.each \
{
|b|
y2 += {"0" => -1, "1" => 1}[b]
puts([x, y2].join("\t"))
x += 1
}
puts
y += 20
}

end

n = 3
c = 0
x2 = 0
l2 = []
loop \
{
l = f2(n)
l2 += l
n += 2
break if (l2.size >= 6000)
}

l = compress2(l2)
#p(l)
out()
```

this is some deeper analysis. a big gap in the above code is that it does not track the original corresponding sequences, only the 0/1 sequences. it is a minor code change to add that below. in the following graphs the infrequent larger words are left, the frequent small words are right. the “divergence” of each path can be plotted and stays relatively low for high seed ending points (1st fig). also from this graph the algorithm generally prefers up-biased walks for compression sequences. another basic metric is how many words of each size are in the compression, basically a histogram. the shorter words are more common (2nd fig). its not clear in the figure but most (longer) grammar symbols have only 2 occurrences. another basic question is how much the compression/grammar words cross seed boundaries. this turns out to be quite substantial (3rd fig).

```def substr(l)

c = l.size

l2 = []

c.times { |i| l2 << l[i..-1] }
l2.sort!

p = []

(c - 1).times \
{
|i|

n = 0
while (l2[i][n] == l2[i + 1][n] && n < l2[i].size && n < l2[i + 1].size)
n += 1
end
p[i] = n
}

a = (0...p.size).to_a.map { |i| [p[i], i] }

a = a.sort.reverse

sz = 0
m2 = []
sz2 = 0

(1...l.size).each \
{
|i|

x = a[i-1][1]
next if (l2[x].size < sz)
mx = nil
m = []
sz2 = sz
(x - 1).downto(0).each \
{
|j|
mx = mx.nil? ? p[j] : [p[j], mx].min

break if (mx == 0 || sz > mx)

d = (l2[x].size - l2[j].size).abs
next if (d < sz)

w = [mx, d].min
next if (sz > w)

if (m.empty? || w > sz) then
m = [c - l2[x].size]
f = true
else
next if (m.find { |z| (c - l2[j].size - z).abs < sz })
end
m << c - l2[j].size
#            p(['***', w, m])
sz = w
}
sz2, m2 = [[sz, m], [sz2, m2]].max_by { |x| [x[0], x[1].size] }
}

return [sz2, m2.sort]

end

def compress(l, l3)

sz, m = substr(l)

#    p([sz, m])
#    exit

return l if (sz <= 1)

l2 = compress(l[m.first, sz], l3[m.first, sz])

s = \$s.dup

\$x[s] = l2

if (\$x2.member?(l2)) then
s = \$x2[l2]
else
\$y[s] = []
\$x2[l2] = s
\$n += 1
\$s.succ!
end

l2 = l.dup
l4 = l3.dup

m.size.times \
{
|i|
l2[m[i], sz] = s
\$y[s] << (l4[m[i]]..l4[m[i] + sz - 1])
l4[m[i] + 1, sz - 1] = []
m = m.map { |j| j - sz + 1 }
}

return compress(l2, l4)
end

def expand(l, l2, x)
l1 = l2.dup

i = 0
while (i < l1.size)

if (!/^[a-z]+\$/.match(l1[i]))
i += 1
next
end

l1[i, 1] = x[l1[i]]
end
raise if (l1 != l)
end

def compress2(l)
\$s = 'a'
\$x = {}
\$x2 = {}
\$y = {}
\$n = l.max + 1
l = l.map { |x| x.to_s }
l2 = compress(l, (0...l.size).to_a)

#    expand(l, l2, \$x)
return l2
end

def f2(n)
l = []
l2 = []
c = 0
n2 = n
while (n2 >= n)
n2 = (n2 * 3 + 1) / 2
l << n2 % 2
l2 << n2
n2 >>= 1 while (n2.even?)
c += 1
end
return l, l2
end

def out2(l2, l3, a)

y2 = 0
\$y.sort_by { |x| x[1][0].max - x[1][0].min }.reverse.each \
{
|s, l|
l1 = []
y = 0
c = 0
#		l2[l[0]].each \
l[0].each \
{
|i|
c += 1 if (a.member?(i))
x = l2[i]
y += {0 => -1, 1 => 1}[x]
l1 << y
}
#		l1.each_with_index { |y, x| puts([x, y + y2].join("\t")) }
y2 += 20
#		puts
puts([l1.last, l.size, l[0].max - l[0].min, c].join("\t"))
}

end

n = 3
c = 0
x2 = 0
l2 = []
l3 = []
a = {}
loop \
{
l, l1 = f2(n)
l2 += l
l3 += l1
a[l2.size - 1] = nil
n += 4
break if (l2.size >= 20000)
}

l = compress2(l2)
out2(l2, l3, a)
```