collatz grammar compression visualization

grammar_strings this is an attempt to visualize the results of a grammar compression of the first 6K collatz trajectories. not fully sure what it means yet, have to dig into it further. it seems to indicate that there are long “recursively repetitive” strings of up/down sequences that dont deviate very much (ie mean returning). it also seems to exhibit some scale-invariance. probably I need to be working with a large graph visualization software, has anyone heard of any? the decomposition is in the form of a fair-sized DAG. gotta figure out what that guy used in his masters thesis on SAT clause graph analysis lying around here somewhere….

def substr(l)

    c = l.size

    l2 = []

    c.times { |i| l2 << l[i..-1] }
    l2.sort!

    p = []

    (c - 1).times \
    {
        |i|

        n = 0
        while (l2[i][n] == l2[i + 1][n] && n < l2[i].size && n < l2[i + 1].size)
            n += 1
        end
        p[i] = n
    }

    a = (0...p.size).to_a.map { |i| [p[i], i] }

    a = a.sort.reverse

    sz = 0
    m2 = []
    sz2 = 0

    (1...l.size).each \
    {
        |i|

        x = a[i-1][1]
        next if (l2[x].size < sz)
        mx = nil
        m = []
        sz2 = sz
        (x - 1).downto(0).each \
        {
            |j|
            mx = mx.nil? ? p[j] : [p[j], mx].min

            break if (mx == 0 || sz > mx)

            d = (l2[x].size - l2[j].size).abs
            next if (d < sz)

            w = [mx, d].min
            next if (sz > w)

            if (m.empty? || w > sz) then
                m = [c - l2[x].size]
                f = true
            else
                next if (m.find { |z| (c - l2[j].size - z).abs < sz })
            end
            m << c - l2[j].size
#            p(['***', w, m])
            sz = w
        }
        sz2, m2 = [[sz, m], [sz2, m2]].max_by { |x| [x[0], x[1].size] }
    }

    return [sz2, m2.sort]

end

def compress(l)

    sz, m = substr(l)

#    p([sz, m])
#    exit

    return l if (sz <= 1)

    l2 = compress(l[m.first, sz])

    s = $s.dup

    $x[s] = l2

    if ($x2.member?(l2)) then
	s = $x2[l2]
    else
        $x2[l2] = s
        $n += 1
        $s.succ!
    end

    l2 = l.dup

    m.size.times \
    {
        |i|
        l2[m[i], sz] = s
        m = m.map { |j| j - sz + 1 }
    }

    return compress(l2)
end

def expand(l, l2, x)
    l1 = l2.dup

    i = 0
    while (i < l1.size)

        if (!/^[a-z]+$/.match(l1[i]))
            i += 1
            next
        end

        l1[i, 1] = x[l1[i]]
    end
    raise if (l1 != l)
end

def compress2(l)
    $s = 'a'
    $x = {}
    $x2 = {}
    $n = l.max + 1
    l = l.map { |x| x.to_s }
    l2 = compress(l)

    expand(l, l2, $x)
    return l2
end

def f(n)
	l = [n]
	c = 0
	n2 = n
	while (n2 >= n)
		n2 = n2 * 3 + 1
		n2 >>= 1 while (n2.even?)
		l << n2
		c += 1
	end
	return l
end

def f2(n)
	l = []
	c = 0
	n2 = n
	while (n2 > 1)
		n2 = (n2 * 3 + 1) / 2
		l << n2 % 2
		n2 >>= 1 while (n2.even?)
		c += 1
	end
	return l
end

def out()

	l2 = []
	a = {}
	b = {}
	$x.each \
	{
		|x, y|
		s = !y.select { |z| /^[a-z]+$/.match(z) }.empty?

		l = []
		y.each \
		{
			|z, i|
			z2 = [z]
			if (/^[a-z]+$/.match(z)) then
				b[z] = 0 if (!b.member?(z))
				b[z] += 1
				z2 = a[z]
			end
			l.concat(z2)
		}
		a[x] = l
		l2 << l
#		p([x, s, y, l])
	}
#	p(b)

	y = 0
	l2.sort_by { |x| [x.size, x] }.each \
	{
		|z|
		x = 0
		y2 = y
		z.each \
		{
			|b|
			y2 += {"0" => -1, "1" => 1}[b]
			puts([x, y2].join("\t"))
			x += 1
		}
		puts
		y += 20
	}

end

n = 3
c = 0
x2 = 0
l2 = []
loop \
{
	l = f2(n)
	l2 += l
	n += 2
	break if (l2.size >= 6000)
}

l = compress2(l2)
#p(l)
out()

this is some deeper analysis. a big gap in the above code is that it does not track the original corresponding sequences, only the 0/1 sequences. it is a minor code change to add that below. in the following graphs the infrequent larger words are left, the frequent small words are right. the “divergence” of each path can be plotted and stays relatively low for high seed ending points (1st fig). also from this graph the algorithm generally prefers up-biased walks for compression sequences. another basic metric is how many words of each size are in the compression, basically a histogram. the shorter words are more common (2nd fig). its not clear in the figure but most (longer) grammar symbols have only 2 occurrences. another basic question is how much the compression/grammar words cross seed boundaries. this turns out to be quite substantial (3rd fig).

def substr(l)

    c = l.size
    
    l2 = []

    c.times { |i| l2 << l[i..-1] }
    l2.sort!


    p = []

    (c - 1).times \
    {
        |i|

        n = 0
        while (l2[i][n] == l2[i + 1][n] && n < l2[i].size && n < l2[i + 1].size)
            n += 1
        end
        p[i] = n
    }

    a = (0...p.size).to_a.map { |i| [p[i], i] }

    a = a.sort.reverse

    sz = 0
    m2 = []
    sz2 = 0

    (1...l.size).each \
    {
        |i|

        x = a[i-1][1]
        next if (l2[x].size < sz)
        mx = nil
        m = []
        sz2 = sz
        (x - 1).downto(0).each \
        {
            |j|
            mx = mx.nil? ? p[j] : [p[j], mx].min
            
            break if (mx == 0 || sz > mx)
        
            d = (l2[x].size - l2[j].size).abs
            next if (d < sz)
            
            w = [mx, d].min
            next if (sz > w)
            
            if (m.empty? || w > sz) then
                m = [c - l2[x].size]
                f = true
            else
                next if (m.find { |z| (c - l2[j].size - z).abs < sz })
            end
            m << c - l2[j].size
#            p(['***', w, m])
            sz = w
        }
        sz2, m2 = [[sz, m], [sz2, m2]].max_by { |x| [x[0], x[1].size] }
    }

    return [sz2, m2.sort]
    
end

def compress(l, l3)

    sz, m = substr(l)
    
#    p([sz, m])
#    exit
    
    return l if (sz <= 1)

    l2 = compress(l[m.first, sz], l3[m.first, sz])

    s = $s.dup

    $x[s] = l2
    
    if ($x2.member?(l2)) then
	s = $x2[l2]
    else
	$y[s] = []
        $x2[l2] = s
        $n += 1
        $s.succ!
    end
    
    l2 = l.dup
    l4 = l3.dup
    
    m.size.times \
    {
        |i|
        l2[m[i], sz] = s
	$y[s] << (l4[m[i]]..l4[m[i] + sz - 1])
	l4[m[i] + 1, sz - 1] = []
        m = m.map { |j| j - sz + 1 }
    }
    
    return compress(l2, l4)
end

def expand(l, l2, x)
    l1 = l2.dup
    
    i = 0
    while (i < l1.size)
        
        if (!/^[a-z]+$/.match(l1[i]))
            i += 1
            next
        end
        
        l1[i, 1] = x[l1[i]]
    end
    raise if (l1 != l)
end


def compress2(l)
    $s = 'a'
    $x = {}
    $x2 = {}
    $y = {}
    $n = l.max + 1
    l = l.map { |x| x.to_s }
    l2 = compress(l, (0...l.size).to_a)
    
#    expand(l, l2, $x)
    return l2
end

def f2(n)
	l = []
	l2 = []
	c = 0
	n2 = n
	while (n2 >= n)
		n2 = (n2 * 3 + 1) / 2
		l << n2 % 2
		l2 << n2
		n2 >>= 1 while (n2.even?) 
		c += 1
	end
	return l, l2
end		

def out2(l2, l3, a)

	y2 = 0
	$y.sort_by { |x| x[1][0].max - x[1][0].min }.reverse.each \
	{
		|s, l|
		l1 = []
		y = 0
		c = 0
#		l2[l[0]].each \
		l[0].each \
		{ 
			|i|
			c += 1 if (a.member?(i)) 
			x = l2[i]
			y += {0 => -1, 1 => 1}[x]
			l1 << y 
		}
#		l1.each_with_index { |y, x| puts([x, y + y2].join("\t")) }
		y2 += 20
#		puts
		puts([l1.last, l.size, l[0].max - l[0].min, c].join("\t"))
	}

end

n = 3
c = 0
x2 = 0
l2 = []
l3 = []
a = {}
loop \
{
	l, l1 = f2(n)
	l2 += l
	l3 += l1
	a[l2.size - 1] = nil
	n += 4
	break if (l2.size >= 20000)
}

l = compress2(l2)
out2(l2, l3, a)

diverge

wordcount

seedcrossing

Advertisements

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s