Language Modeling with DEFLATE

Can you generate language using compression, by choosing as the next token whichever candidate compresses best? And can a high-quality starting corpus influence the quality of the generation?

<corpus, input text, [ generation sequence ]>?

Fetch from Project Gutenberg

I used the plaintext version of Darwin’s text from https://www.gutenberg.org/cache/epub/1228/pg1228.txt


How well does it compress?

// Fully connected feed-forward network drawn left to right:
// 3 input nodes -> 4 hidden nodes -> 3 hidden nodes -> 2 output nodes.
digraph NeuralNetwork {
  rankdir=LR
  color=darkgray
  // Defaults for every node; the hidden and output layers override fillcolor.
  node [shape=circle, style=filled, fillcolor=blue, fixedsize=true, width=0.8]
  edge [color=gray50]

  subgraph cluster_input {
    label="Input Layer"
    style=dotted
    color=gray
    // Input nodes keep the default blue fill, so use white text to keep
    // the labels legible against it.
    x1 [fontcolor=white, label="x₁"]
    x2 [fontcolor=white, label="x₂"]
    x3 [fontcolor=white, label="x₃"]
  }

  subgraph cluster_hidden1 {
    label="Hidden Layer 1"
    style=dotted
    color=gray
    h11 [fillcolor=green, label="h₁¹"]
    h12 [fillcolor=green, label="h₂¹"]
    h13 [fillcolor=green, label="h₃¹"]
    h14 [fillcolor=green, label="h₄¹"]
  }

  subgraph cluster_hidden2 {
    label="Hidden Layer 2"
    style=dotted
    color=gray
    h21 [fillcolor=yellow, label="h₁²"]
    h22 [fillcolor=yellow, label="h₂²"]
    h23 [fillcolor=yellow, label="h₃²"]
  }

  subgraph cluster_output {
    label="Output Layer"
    style=dotted
    color=gray
    y1 [fillcolor=coral, label="y₁"]
    y2 [fillcolor=coral, label="y₂"]
  }

  // Each statement connects every node of one layer to every node of the next.
  // Node sets are whitespace-separated: the documented DOT grammar only
  // allows ',' inside attribute lists, not between statements.
  {x1 x2 x3} -> {h11 h12 h13 h14} [arrowsize=0.5]
  {h11 h12 h13 h14} -> {h21 h22 h23} [arrowsize=0.5]
  {h21 h22 h23} -> {y1 y2} [arrowsize=0.5]
}

// Flow of the compression-driven token selection loop, drawn left to right:
// build context -> generate candidates -> test compression -> pick the best
// candidate -> append it, then either loop back to the context step or stop.
digraph TokenSelection {
  rankdir=LR
  node [shape=circle, style=filled]
  edge [fontsize=10]

  start [shape=doublecircle, fillcolor=green, label="START"]

  // White text keeps the label legible on the blue fill.
  context [fillcolor=blue, fontcolor=white, label="Build\nContext"]
  generate [fillcolor=yellow, label="Generate\nCandidates"]
  compress [fillcolor=coral, label="Test\nCompression"]
  select [shape=diamond, fillcolor=plum, label="Best\nRatio?"]
  append [fillcolor=green, label="Append\nToken"]
  done [shape=doublecircle, fillcolor=gray, label="END"]

  start -> context [label="Initialize"]
  context -> generate [label="Ready"]
  generate -> compress [label="For each\ncandidate"]
  compress -> select
  // Dashed edge: the candidate was not the best, so test the next one.
  select -> compress [label="No\n(try next)", style=dashed]
  select -> append [label="Yes"]
  // After appending, either continue generating or terminate.
  append -> context [label="Continue", style=bold]
  append -> done [label="Stop token\nor limit", color=red]

  // Annotations
  // NOTE(review): DEFLATE (RFC 1951) limits back-reference distances to
  // 32 KiB; confirm that the 512-2048 token figure below is what is intended.
  note1 [shape=note, fillcolor=yellow, label="Context window\ntypically 512-2048\ntokens"]
  note2 [shape=note, fillcolor=yellow, label="Compression ratio\ndetermines quality"]

  // Dotted, arrowless connectors attach each note to the step it describes.
  context -> note1 [style=dotted, arrowhead=none]
  compress -> note2 [style=dotted, arrowhead=none]
}