diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index 24b68d7..f0a1945 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -82,6 +82,8 @@ OBJS-$(CONFIG_EIGHTSVX_FIB_DECODER)    += 8svx.o
 OBJS-$(CONFIG_ESCAPE124_DECODER)       += escape124.o
 OBJS-$(CONFIG_FFV1_DECODER)            += ffv1.o rangecoder.o
 OBJS-$(CONFIG_FFV1_ENCODER)            += ffv1.o rangecoder.o
+OBJS-$(CONFIG_FFV2_DECODER)            += ffv2.o rangecoder.o
+OBJS-$(CONFIG_FFV2_ENCODER)            += ffv2.o rangecoder.o
 OBJS-$(CONFIG_FFVHUFF_DECODER)         += huffyuv.o
 OBJS-$(CONFIG_FFVHUFF_ENCODER)         += huffyuv.o
 OBJS-$(CONFIG_FLAC_DECODER)            += flacdec.o
diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c
index 0cb0e6d..3a1cde0 100644
--- a/libavcodec/allcodecs.c
+++ b/libavcodec/allcodecs.c
@@ -77,6 +77,7 @@ void avcodec_register_all(void)
     REGISTER_DECODER (EIGHTSVX_FIB, eightsvx_fib);
     REGISTER_DECODER (ESCAPE124, escape124);
     REGISTER_ENCDEC  (FFV1, ffv1);
+    REGISTER_ENCDEC  (FFV2, ffv2);
     REGISTER_ENCDEC  (FFVHUFF, ffvhuff);
     REGISTER_ENCDEC  (FLASHSV, flashsv);
     REGISTER_DECODER (FLIC, flic);
diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
index 39dabeb..b6df475 100644
--- a/libavcodec/avcodec.h
+++ b/libavcodec/avcodec.h
@@ -97,6 +97,7 @@ enum CodecID {
     CODEC_ID_ASV1,
     CODEC_ID_ASV2,
     CODEC_ID_FFV1,
+    CODEC_ID_FFV2, // FIXME should go at the end, but that would force manual merges
     CODEC_ID_4XM,
     CODEC_ID_VCR1,
     CODEC_ID_CLJR,
diff --git a/libavcodec/ffv2.c b/libavcodec/ffv2.c
new file mode 100644
index 0000000..ab87e3d
--- /dev/null
+++ b/libavcodec/ffv2.c
@@ -0,0 +1,1314 @@
+/*
+ * Copyright (C) 2008 Loren Merritt
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/*
+ * Coder types (in octal):
+ * 01 = order 0 entropy. directly comparable to ffvh.
+ * 02 = sample pairs. this is done as a speed optimization in ffvh, but if it's in the standard then it improves compression too.
+ * 03 = zero run. works almost as well as the trees in some anime, but fails in not-so-compressible content.
+ * 10 = 2x2 block of zero flags. escape code for nonzero coefs.
+ * 11 = 2x2 block of [-1,1] values. escape code for bigger coefs.
+ * 12 = 2x2 block of [-2,2] values. escape code for bigger coefs.
+ * 13 = 2x2 block of [-3,3] values. escape code for bigger coefs.
+ * 14 = 2x2 block of log2 values. uncompressed signs and mantissas.
+ * 2X = 8x8, 4x4 quadtree cbp followed by one of the above block coders.
+ * 3X = 8x4 cbp followed by one of the above block coders.
+ */
+
+#define DEBUG
+#include "avcodec.h"
+#include "dsputil.h"
+#include "mathops.h"
+#include "mpegvideo.h"
+
+#define VLC_BITS 11
+#define NUM_TABLES 5
+#define NUM_VLCS 4096
+
+typedef struct { // One VLC table together with the data used to build and adapt it
+    VLC vlc; ///< decoder lookup table, (re)built in read_table()
+    uint8_t len[NUM_VLCS]; ///< code length in bits of each symbol
+    uint32_t bits[NUM_VLCS]; ///< code word of each symbol
+    uint32_t stats[NUM_VLCS]; ///< symbol counts collected by the write_* functions
+} VLCS;
+
+typedef struct FFV2Context FFV2Context;
+struct FFV2Context { // Codec state; common_init() zeroes all of it
+    AVCodecContext *avctx;
+    DSPContext dsp;
+    GetBitContext gb; ///< bit reader used by the read_* functions
+    PutBitContext pb; ///< bit writer used by the write_* functions
+    void (*read_line)(FFV2Context *s, VLCS *vlcs, uint8_t *residual, int width); ///< line coder, selected in proc_tables()
+    void (*write_line)(FFV2Context *s, VLCS *vlcs, uint8_t *residual, int width);
+    void (*read_block)(FFV2Context *s, VLCS *vlcs, uint8_t *residual); ///< 2x2 block coder, selected in proc_tables()
+    void (*write_block)(FFV2Context *s, VLCS *vlcs, uint8_t *residual);
+    uint8_t *bs, *bs_end; ///< bs: byte cursor advanced by write_table() and proc_tables()
+    VLCS vlcs[6][NUM_TABLES]; ///< [6] = (Y/U/V)*2 + (I/P)
+    AVFrame fenc; ///< its key_frame flag is tested in proc_tables()
+    AVFrame fref;
+    int mb_width, mb_height; ///< picture size in 8x8 units, rounded up
+    int tstride; ///< row stride of the residual buffers; set to fenc.linesize[0]
+    uint8_t *temp[4]; ///< scratch buffers of tstride*9+16 bytes each
+    uint8_t *mb_types; ///< intra: 0, inter: 1
+    int16_t (*mvs)[2]; ///< current row
+    int16_t (*mvs_top)[2]; ///< previous row
+    int16_t (*mvs_base)[2]; ///< buffer that the other mv arrays point into
+    int16_t (*mv_plane[2])[2]; ///< mvs chosen by motion est, not necessarily those that will be coded
+    int16_t (*mv_plane_base)[2]; ///< allocation behind mv_plane[]; encoder only
+    int gop; ///< proc_tables() sends optional tables when (gop&3)==0
+    int initted_vlc[2];
+    int coder_type, coder_tree, coder_block; ///< coder_type as listed at the top of the file; tree = type>>3, block = type&7
+};
+
+static uint16_t map_coder6[1296]; ///< symbol list for the 6^4 entry table of coder block 2; filled in common_init()
+
+static void common_init(AVCodecContext *avctx) { // Setup shared by encoder and decoder: frame buffers, scratch rows, MV rows, map_coder6
+    FFV2Context *s = avctx->priv_data;
+    int i, j, k, l;
+    memset(s, 0, sizeof(FFV2Context)); // clears the whole context, so callers must fill their own fields afterwards
+    assert(!(avctx->flags&CODEC_FLAG_EMU_EDGE));
+    dsputil_init(&s->dsp, avctx);
+    s->avctx = avctx;
+    s->mb_width = (avctx->width+7)/8; // size in 8x8 units, rounded up
+    s->mb_height= (avctx->height+7)/8;
+    avctx->get_buffer(avctx, &s->fenc); // NOTE(review): return value is not checked
+    avctx->get_buffer(avctx, &s->fref); // NOTE(review): return value is not checked
+    s->tstride = s->fenc.linesize[0]; // scratch rows use the same stride as the luma plane
+    assert(s->tstride >= ((avctx->width+15)&~7));
+    s->temp[0] = av_malloc(s->tstride*9+16); // NOTE(review): none of the allocations below are checked for NULL
+    s->temp[1] = av_malloc(s->tstride*9+16);
+    s->temp[2] = av_malloc(s->tstride*9+16);
+    s->temp[3] = av_malloc(s->tstride*9+16);
+    s->mb_types = av_malloc(s->mb_width);
+    s->mvs_base = av_mallocz(4*(s->mb_width+2)*sizeof(int16_t)); // room for two rows of mb_width+2 vectors
+    s->mvs = s->mvs_base+1; // one entry of slack so that index -1 stays inside the buffer
+    s->mvs_top = s->mvs+s->mb_width+2;
+    for(i=0; i<6; i++)
+        for(j=0; j<6; j++)
+            for(k=0; k<6; k++)
+                for(l=0; l<6; l++)
+                    map_coder6[i*6*6*6+j*6*6+k*6+l] = (i<<9) + (j<<6) + (k<<3) + l; // four base-6 digits repacked as four 3-bit fields
+}
+
+static int decode_init(AVCodecContext *avctx) { // Decoder init callback; always returns 0
+    avctx->pix_fmt= PIX_FMT_YUV420P; // set before common_init(), which requests the frame buffers
+    common_init(avctx);
+    return 0;
+}
+
+static void common_end(AVCodecContext *avctx); // defined further down; needed here for the error path
+// Encoder init: checks coder_type, allocates all buffers, seeds every symbol count with 1. Returns 0, or -1 on error.
+static int encode_init(AVCodecContext *avctx) {
+    FFV2Context *s = avctx->priv_data;
+    int i, j, coder_type = avctx->coder_type ? avctx->coder_type : 26; // 0 selects the default, octal 32
+    if(coder_type < 0 || (coder_type > 3 && coder_type < 8) || (coder_type >> 3) > 3 || (coder_type & 7) > 4) {
+        av_log(avctx, AV_LOG_ERROR, "bad coder_type\n");
+        return -1; // checked before common_init(), so nothing is allocated yet and nothing can leak
+    }
+    common_init(avctx); // zeroes *s, hence the coder fields are stored only afterwards
+    s->coder_type = coder_type;
+    s->coder_tree = coder_type >> 3; s->coder_block = coder_type & 7;
+    s->mv_plane_base = av_mallocz(4*(s->mb_height+2)*(s->mb_width+2)*sizeof(int16_t));
+    if(!s->mv_plane_base) { common_end(avctx); return -1; } // never derive plane pointers from NULL
+    s->mv_plane[0] = s->mv_plane_base+s->mb_width+3; // skips one border row plus one border column
+    s->mv_plane[1] = s->mv_plane[0]+(s->mb_width+2)*(s->mb_height+2);
+    for(j=0; j<6*NUM_TABLES; j++) // split j into both indices instead of running past vlcs[0][]
+        for(i=0; i<NUM_VLCS; i++)
+            s->vlcs[j/NUM_TABLES][j%NUM_TABLES].stats[i] = 1;
+    return 0;
+}
+
+// Gives back everything common_init() acquired: both frame buffers, then the scratch and MV allocations.
+static void common_end(AVCodecContext *avctx) {
+    FFV2Context *ctx = avctx->priv_data;
+    int n;
+    avctx->release_buffer(avctx, &ctx->fenc);
+    avctx->release_buffer(avctx, &ctx->fref);
+    for(n=0; n<4; n++)
+        av_free(ctx->temp[n]);
+    av_free(ctx->mb_types);
+    av_free(ctx->mvs_base);
+}
+
+static int decode_end(AVCodecContext *avctx) { // Decoder close callback: frees every VLC table, then the shared state; returns 0
+    FFV2Context *s = avctx->priv_data;
+    int i;
+    for(i=0; i<6*NUM_TABLES; i++) // split i into both indices; vlcs[0][i] ran past the inner array bound
+        free_vlc(&s->vlcs[i/NUM_TABLES][i%NUM_TABLES].vlc);
+    common_end(avctx);
+    return 0;
+}
+
+static int encode_end(AVCodecContext *avctx) { // Encoder close callback; always returns 0
+    FFV2Context *enc = avctx->priv_data;
+    av_free(enc->mv_plane_base); // encoder-only buffer; common_end() does not touch it
+    common_end(avctx);
+    return 0;
+}
+
+// FIXME code duplication from huffyuv.c
+// FIXME port optimizations back
+static int generate_bits_table(uint32_t *dst, uint8_t *len_table, int size){ // Assigns a code word to every symbol from its code length; returns 0, or -1 if bits does not end up all-ones
+    int len, index;
+    uint32_t bits=1;
+    // bits is the next free code of the current length and counts down; a complete code leaves it at all-ones
+    for(len=1; len<32 && ~bits; len++){ // shortest codes first; stop once bits is all-ones
+        for(index=size-1; index>=0; index--){
+            if(len_table[index] == len)
+                dst[index] = bits--;
+        }
+        bits = bits*2+1; // append a 1 bit: the largest free code of the next length
+    }
+    if(~bits){
+        av_log(NULL, AV_LOG_ERROR, "Error generating huffman table\n");
+        return -1;
+    }
+    return 0;
+}
+
+typedef struct { // Min-heap entry used by generate_len_table()
+    uint64_t val; ///< weight the heap is ordered by
+    int name; ///< symbol index, or index of a merged tree node
+} HeapElem;
+
+static void heap_sift(HeapElem *h, int root, int size)
+{
+    int kid;
+    // Move h[root] down the min-heap until neither child has a smaller val.
+    for(kid = 2*root+1; kid < size; kid = 2*root+1) {
+        if(kid+1 < size && h[kid+1].val < h[kid].val)
+            kid++;
+        if(h[kid].val >= h[root].val)
+            return;
+        FFSWAP(HeapElem, h[root], h[kid]);
+        root = kid;
+    }
+}
+
+static void generate_len_table(uint8_t *dst, uint32_t *stats, int size){ // Derives Huffman code lengths, all below 32, for size symbols from their counts in stats
+    HeapElem h[size]; // NOTE(review): variable-length arrays on the stack; size goes up to 4096 in this file
+    int up[2*size]; // parent node of every leaf and internal node
+    int len[2*size]; // depth of every internal node
+    int offset, i, next;
+    // Whenever a length would reach 32, retry with a doubled offset added to every weight.
+    for(offset=1; ; offset<<=1){
+        for(i=0; i<size; i++){
+            h[i].name = i;
+            h[i].val = ((uint64_t)stats[i] << 8) + offset;
+        }
+        for(i=size/2-1; i>=0; i--) // build the min-heap
+            heap_sift(h, i, size);
+
+        for(next=size; next<size*2-1; next++){
+            // merge the two smallest entries, and put it back in the heap
+            uint64_t min1v = h[0].val;
+            up[h[0].name] = next;
+            h[0].val = INT64_MAX; // sink the first minimum so that the second one surfaces
+            heap_sift(h, 0, size);
+            up[h[0].name] = next;
+            h[0].name = next; // the second minimum's slot now holds the merged node
+            h[0].val += min1v;
+            heap_sift(h, 0, size);
+        }
+
+        len[2*size-2] = 0; // root of the tree
+        for(i=2*size-3; i>=size; i--)
+            len[i] = len[up[i]] + 1;
+        for(i=0; i<size; i++) {
+            dst[i] = len[up[i]] + 1;
+            if(dst[i] >= 32) break; // too long: retry with a larger offset
+        }
+        if(i==size) break; // every length fits
+    }
+}
+
+// TODO: simply vlcing the len table reduces table size by about 1.7x
+// reordering and gradient prediction should help further.
+
+static int read_len_table(uint8_t *len, GetBitContext *gb, int size){ // Reads size run-length coded code lengths into len; returns 0, or -1 on a malformed table
+    int i, val, repeat=-1;
+    // Each run is a 3 bit count plus a 5 bit length; a count of 0 escapes to an 8 bit count.
+    for(i=0; i<size;){
+        repeat= get_bits(gb, 3);
+        val   = get_bits(gb, 5);
+        if(repeat==0 && !(repeat = get_bits(gb, 8)))
+            break; // empty run: it would never advance i, so reject it (repeat stays 0)
+        while(repeat-- && i<size)
+            len[i++] = val;
+    }
+    if(repeat != -1) { // -1 only when the last run ended exactly at size
+        av_log(NULL, AV_LOG_ERROR, "Error reading huffman table\n");
+        return -1;
+    }
+    return 0;
+}
+
+static int write_len_table(uint8_t *len, uint8_t *buf, int size){ // Run-length codes size code lengths into buf; returns the number of bytes written
+    int i;
+    int index= 0; // NOTE(review): buf is written without any bound check
+
+    for(i=0; i<size;){
+        int val= len[i];
+        int repeat=0;
+
+        for(; i<size && len[i]==val && repeat<255; i++) // count equal lengths, at most 255 per run
+            repeat++;
+
+        assert(val<32 && val>0 && repeat<256 && repeat>0);
+        if(repeat>7){ // long run: a byte with count field 0, then the count in its own byte
+            buf[index++]= val;
+            buf[index++]= repeat;
+        }else{ // short run: count in the top 3 bits, length in the low 5 bits
+            buf[index++]= val | (repeat<<5);
+        }
+    }
+
+    return index;
+}
+
+// Reads one code length table from s->gb and rebuilds the VLC of vlcs; map is passed on as the symbol list. Returns 0 or -1.
+static int read_table(FFV2Context *s, VLCS* vlcs, int size, uint16_t *map) {
+    if(read_len_table(vlcs->len, &s->gb, size))
+        return -1;
+    if(generate_bits_table(vlcs->bits, vlcs->len, size))
+        return -1;
+    free_vlc(&vlcs->vlc);
+    return init_vlc_sparse(&vlcs->vlc, VLC_BITS, size, vlcs->len, 1, 1, vlcs->bits, 4, 4, map, 2, 2, 0) < 0 ? -1 : 0; // a failed build must not be reported as a usable table
+}
+
+static void write_table(FFV2Context *s, VLCS* vlcs, int size) { // Builds a new code from the collected stats and appends its length table at s->bs
+    int i;
+    generate_len_table(vlcs->len, vlcs->stats, size);
+    generate_bits_table(vlcs->bits, vlcs->len, size); // NOTE(review): return value is ignored
+    s->bs += write_len_table(vlcs->len, s->bs, size);
+    for(i=0; i<size; i++)
+        vlcs->stats[i] = (vlcs->stats[i]+1)>>1; // halve the counts, rounding up, so that they never drop to 0
+}
+
+// ----------
+
+static void read_line_v0(FFV2Context *s, VLCS *vlcs, uint8_t *residual, int width) { // Decodes width samples, one symbol of table 0 each
+    int x;
+    for(x=0; x<width; x++)
+        residual[x] = get_vlc2(&s->gb, vlcs[0].vlc.table, VLC_BITS, 3);
+}
+
+static void write_line_v0(FFV2Context *s, VLCS *vlcs, uint8_t *residual, int width) { // Codes width samples, one symbol of table 0 each
+    int x;
+    for(x=0; x<width; x++) {
+        int v = residual[x];
+        put_bits(&s->pb, vlcs[0].len[v], vlcs[0].bits[v]);
+        vlcs[0].stats[v]++; // count the symbol for the next table update
+    }
+}
+
+// ----------
+
+static void read_line_v1(FFV2Context *s, VLCS *vlcs, uint8_t *residual, int width) { // Decodes a line written by write_line_v1()
+    int x;
+    for(x=0; x<width;) {
+        int a = get_vlc2(&s->gb, vlcs[0].vlc.table, VLC_BITS, 3);
+        if(a < 256) { // a single sample
+            residual[x++] = a;
+            if((unsigned)(a - 8) >= 240 && x < width) // a outside [8,248): its neighbour follows from table 1, biased by 8
+                residual[x++] = 8 + get_vlc2(&s->gb, vlcs[1].vlc.table, VLC_BITS, 3);
+        } else { // pair symbol: two samples as signed 4 bit values
+            residual[x++] = (int8_t)a >> 4;
+            residual[x++] = (int8_t)(a<<4) >> 4; // NOTE(review): x is not checked against width before this store
+        }
+    }
+}
+
+static void write_line_v1(FFV2Context *s, VLCS *vlcs, uint8_t *residual, int width) { // Codes a line, pairing neighbouring samples whose low 4 bits describe them completely
+    int x;
+    for(x=0; x<width;) {
+        int a = residual[x];
+        int b = residual[x+1]; // NOTE(review): for the last sample this reads residual[width]; the value is then unused
+        if((unsigned)(a - 8) < 240 || x == width-1) { // a in [8,248), or the last sample: code it alone
+            put_bits(&s->pb, vlcs[0].len[a], vlcs[0].bits[a]);
+            vlcs[0].stats[a]++;
+            x++;
+        } else if((unsigned)(b - 8) < 240) { // only b is in [8,248): a from table 0, b-8 from table 1
+            b -= 8;
+            put_bits(&s->pb, vlcs[0].len[a], vlcs[0].bits[a]);
+            put_bits(&s->pb, vlcs[1].len[b], vlcs[1].bits[b]);
+            vlcs[0].stats[a]++;
+            vlcs[1].stats[b]++;
+            x+=2;
+        } else { // both outside [8,248): one pair symbol in the upper half of table 0
+            int c = 256 + ((a&15)<<4) + (b&15);
+            put_bits(&s->pb, vlcs[0].len[c], vlcs[0].bits[c]);
+            vlcs[0].stats[c]++;
+            x+=2;
+        }
+    }
+}
+
+// ----------
+
+static void read_line_v2(FFV2Context *s, VLCS *vlcs, uint8_t *residual, int width) { // Decodes a line written by write_line_v2()
+    int x, v, rep;
+    for(x=0; x<width;) {
+        v = get_vlc2(&s->gb, vlcs[0].vlc.table, VLC_BITS, 3);
+        if(v >= 240) { // a lone sample without a zero run
+            residual[x++] = v-232;
+        } else { // low 4 bits: length of the zero run, upper bits: the sample after it, biased by 7
+            rep = v&15;
+            while(--rep >= 0) // NOTE(review): x is not checked against width inside the run
+                residual[x++] = 0;
+            residual[x++] = (v>>4)-7;
+            if(v >= (7<<4) && v < (7<<4)+15 && x<width) // zero sample with a run below 15: a table 1 sample follows
+                residual[x++] = 8 + get_vlc2(&s->gb, vlcs[1].vlc.table, VLC_BITS, 3);
+        }
+    }
+}
+
+static void write_line_v2(FFV2Context *s, VLCS *vlcs, uint8_t *residual, int width) { // Codes a line as zero runs of up to 15 samples, each followed by one sample
+    int x, a, b, rep;
+    residual[width] = 128; // nonzero sentinel that ends the run scan; note that it is stored one past the line
+    for(x=0; x<width;) {
+        for(rep=0; residual[x+rep]==0 && rep<15; rep++);
+        b = (uint8_t)(residual[x+rep] + 7); // sample after the run, biased by 7
+        if(b < 15) // sample in [-7,7]: run and sample share one symbol
+            a = (b<<4) + rep;
+        else if(rep) // larger sample after a run: code rep-1 zeros plus a zero sample, then the sample from table 1
+            a = (7<<4) + rep-1;
+        else // larger sample without a run
+            a = b + 225;
+        put_bits(&s->pb, vlcs[0].len[a], vlcs[0].bits[a]);
+        vlcs[0].stats[a]++;
+        if(a >= (7<<4) && a < (7<<4)+15 && x+rep<width) { // no table 1 sample when the run ended at the sentinel
+            b -= 15;
+            put_bits(&s->pb, vlcs[1].len[b], vlcs[1].bits[b]);
+            vlcs[1].stats[b]++;
+        }
+        x += rep+1;
+    }
+}
+
+// ----------
+
+static void read_block_v0(FFV2Context *s, VLCS *vlcs, uint8_t *residual) { // Decodes one 2x2 block; zero samples are not stored, so the callers clear the buffer first
+    int cbp = ~get_vlc2(&s->gb, vlcs[1].vlc.table, VLC_BITS, 3); // the coded flags mark zeros; inverted, a set bit means nonzero
+    if(cbp&1) residual[0] = 1 + get_vlc2(&s->gb, vlcs[0].vlc.table, VLC_BITS, 3);
+    if(cbp&2) residual[1] = 1 + get_vlc2(&s->gb, vlcs[0].vlc.table, VLC_BITS, 3);
+    if(cbp&4) residual[s->tstride] =   1 + get_vlc2(&s->gb, vlcs[0].vlc.table, VLC_BITS, 3);
+    if(cbp&8) residual[s->tstride+1] = 1 + get_vlc2(&s->gb, vlcs[0].vlc.table, VLC_BITS, 3);
+}
+
+static void write_block_v0(FFV2Context *s, VLCS *vlcs, uint8_t *residual) { // Codes one 2x2 block: four zero flags from table 1, then each nonzero sample minus 1 from table 0
+    uint8_t a = residual[0];
+    uint8_t b = residual[1];
+    uint8_t c = residual[s->tstride];
+    uint8_t d = residual[s->tstride+1];
+    int v = !a + 2*!b + 4*!c + 8*!d; // bit set = sample is zero
+    put_bits(&s->pb, vlcs[1].len[v], vlcs[1].bits[v]);
+    vlcs[1].stats[v]++;
+    if(a) { a-=1; put_bits(&s->pb, vlcs[0].len[a], vlcs[0].bits[a]); vlcs[0].stats[a]++; }
+    if(b) { b-=1; put_bits(&s->pb, vlcs[0].len[b], vlcs[0].bits[b]); vlcs[0].stats[b]++; }
+    if(c) { c-=1; put_bits(&s->pb, vlcs[0].len[c], vlcs[0].bits[c]); vlcs[0].stats[c]++; }
+    if(d) { d-=1; put_bits(&s->pb, vlcs[0].len[d], vlcs[0].bits[d]); vlcs[0].stats[d]++; }
+}
+
+// ----------
+
+static void read_block_v1(FFV2Context *s, VLCS *vlcs, uint8_t *residual) { // Decodes one 2x2 block written by write_block_v1()
+    int v = get_vlc2(&s->gb, vlcs[1].vlc.table, VLC_BITS, 3); // four 2 bit fields, one per sample
+    int a = v & 3;
+    residual[0] = a<3 ? a-1 : 2+get_vlc2(&s->gb, vlcs[0].vlc.table, VLC_BITS, 3); // 0..2 mean -1..1, 3 escapes to table 0
+    a = (v>>2) & 3;
+    residual[1] = a<3 ? a-1 : 2+get_vlc2(&s->gb, vlcs[0].vlc.table, VLC_BITS, 3);
+    a = (v>>4) & 3;
+    residual[s->tstride] = a<3 ? a-1 : 2+get_vlc2(&s->gb, vlcs[0].vlc.table, VLC_BITS, 3);
+    a = v>>6;
+    residual[s->tstride+1] = a<3 ? a-1 : 2+get_vlc2(&s->gb, vlcs[0].vlc.table, VLC_BITS, 3);
+}
+
+static void write_block_v1(FFV2Context *s, VLCS *vlcs, uint8_t *residual) { // Codes one 2x2 block: samples in [-1,1] share one table 1 symbol, the others escape to table 0
+    uint8_t a = 1+residual[0]; // bias by 1 so that -1..1 become 0..2
+    uint8_t b = 1+residual[1];
+    uint8_t c = 1+residual[s->tstride];
+    uint8_t d = 1+residual[s->tstride+1];
+    int v = FFMIN(a,3) + 4*FFMIN(b,3) + 16*FFMIN(c,3) + 64*FFMIN(d,3); // 3 marks an escaped sample
+    put_bits(&s->pb, vlcs[1].len[v], vlcs[1].bits[v]);
+    vlcs[1].stats[v]++;
+    if(a>=3) { a-=3; put_bits(&s->pb, vlcs[0].len[a], vlcs[0].bits[a]); vlcs[0].stats[a]++; }
+    if(b>=3) { b-=3; put_bits(&s->pb, vlcs[0].len[b], vlcs[0].bits[b]); vlcs[0].stats[b]++; }
+    if(c>=3) { c-=3; put_bits(&s->pb, vlcs[0].len[c], vlcs[0].bits[c]); vlcs[0].stats[c]++; }
+    if(d>=3) { d-=3; put_bits(&s->pb, vlcs[0].len[d], vlcs[0].bits[d]); vlcs[0].stats[d]++; }
+}
+
+// ----------
+
+static void read_block_v2(FFV2Context *s, VLCS *vlcs, uint8_t *residual) { // Decodes one 2x2 block written by write_block_v2()
+    int v = get_vlc2(&s->gb, vlcs[1].vlc.table, VLC_BITS, 3); // four 3 bit fields; the table is set up with map_coder6 as its symbol list
+    int a = v & 7;
+    residual[0] = a<5 ? a-2 : 3+get_vlc2(&s->gb, vlcs[0].vlc.table, VLC_BITS, 3); // 0..4 mean -2..2, 5 escapes to table 0
+    a = (v>>3) & 7;
+    residual[1] = a<5 ? a-2 : 3+get_vlc2(&s->gb, vlcs[0].vlc.table, VLC_BITS, 3);
+    a = (v>>6) & 7;
+    residual[s->tstride] = a<5 ? a-2 : 3+get_vlc2(&s->gb, vlcs[0].vlc.table, VLC_BITS, 3);
+    a = v>>9;
+    residual[s->tstride+1] = a<5 ? a-2 : 3+get_vlc2(&s->gb, vlcs[0].vlc.table, VLC_BITS, 3);
+}
+
+static void write_block_v2(FFV2Context *s, VLCS *vlcs, uint8_t *residual) { // Codes one 2x2 block: samples in [-2,2] share one table 1 symbol, the others escape to table 0
+    uint8_t a = 2+residual[0]; // bias by 2 so that -2..2 become 0..4
+    uint8_t b = 2+residual[1];
+    uint8_t c = 2+residual[s->tstride];
+    uint8_t d = 2+residual[s->tstride+1];
+    int v = FFMIN(a,5) + 6*FFMIN(b,5) + 36*FFMIN(c,5) + 216*FFMIN(d,5); // four base-6 digits; 5 marks an escaped sample
+    put_bits(&s->pb, vlcs[1].len[v], vlcs[1].bits[v]);
+    vlcs[1].stats[v]++;
+    if(a>=5) { a-=5; put_bits(&s->pb, vlcs[0].len[a], vlcs[0].bits[a]); vlcs[0].stats[a]++; }
+    if(b>=5) { b-=5; put_bits(&s->pb, vlcs[0].len[b], vlcs[0].bits[b]); vlcs[0].stats[b]++; }
+    if(c>=5) { c-=5; put_bits(&s->pb, vlcs[0].len[c], vlcs[0].bits[c]); vlcs[0].stats[c]++; }
+    if(d>=5) { d-=5; put_bits(&s->pb, vlcs[0].len[d], vlcs[0].bits[d]); vlcs[0].stats[d]++; }
+}
+
+// ----------
+
+static void read_block_v3(FFV2Context *s, VLCS *vlcs, uint8_t *residual) { // Decodes one 2x2 block written by write_block_v3()
+    int v = get_vlc2(&s->gb, vlcs[1].vlc.table, VLC_BITS, 3); // four 3 bit fields, one per sample
+    int a = v & 7;
+    residual[0] = a<7 ? a-3 : 4+get_vlc2(&s->gb, vlcs[0].vlc.table, VLC_BITS, 3); // 0..6 mean -3..3, 7 escapes to table 0
+    a = (v>>3) & 7;
+    residual[1] = a<7 ? a-3 : 4+get_vlc2(&s->gb, vlcs[0].vlc.table, VLC_BITS, 3);
+    a = (v>>6) & 7;
+    residual[s->tstride] = a<7 ? a-3 : 4+get_vlc2(&s->gb, vlcs[0].vlc.table, VLC_BITS, 3);
+    a = v>>9;
+    residual[s->tstride+1] = a<7 ? a-3 : 4+get_vlc2(&s->gb, vlcs[0].vlc.table, VLC_BITS, 3);
+}
+
+static void write_block_v3(FFV2Context *s, VLCS *vlcs, uint8_t *residual) { // Codes one 2x2 block: samples in [-3,3] share one table 1 symbol, the others escape to table 0
+    uint8_t a = 3+residual[0]; // bias by 3 so that -3..3 become 0..6
+    uint8_t b = 3+residual[1];
+    uint8_t c = 3+residual[s->tstride];
+    uint8_t d = 3+residual[s->tstride+1];
+    int v = FFMIN(a,7) + 8*FFMIN(b,7) + 64*FFMIN(c,7) + 512*FFMIN(d,7); // 7 marks an escaped sample
+    put_bits(&s->pb, vlcs[1].len[v], vlcs[1].bits[v]);
+    vlcs[1].stats[v]++;
+    if(a>=7) { a-=7; put_bits(&s->pb, vlcs[0].len[a], vlcs[0].bits[a]); vlcs[0].stats[a]++; }
+    if(b>=7) { b-=7; put_bits(&s->pb, vlcs[0].len[b], vlcs[0].bits[b]); vlcs[0].stats[b]++; }
+    if(c>=7) { c-=7; put_bits(&s->pb, vlcs[0].len[c], vlcs[0].bits[c]); vlcs[0].stats[c]++; }
+    if(d>=7) { d-=7; put_bits(&s->pb, vlcs[0].len[d], vlcs[0].bits[d]); vlcs[0].stats[d]++; }
+}
+
+// ----------
+
+static int read_coef_v4(FFV2Context *s, int ctx) { // Reads one sample whose magnitude class is ctx (0 = zero, else the number of raw bits to read)
+    if(ctx) {
+        int bits = get_bits(&s->gb, ctx); // ctx-1 mantissa bits followed by the sign bit
+        int sign = -(bits&1);
+        int a = ((1<<ctx) + bits) >> 1; // put the implicit leading 1 back and drop the sign bit
+        a = (a^sign)-sign; // negate when sign is -1
+        if(a == -127)
+            a -= get_bits(&s->gb, 1); // one more bit tells -127 from -128
+        return a;
+    } else {
+        return 0;
+    }
+}
+
+static void read_block_v4(FFV2Context *s, VLCS *vlcs, uint8_t *residual) { // Decodes one 2x2 block written by write_block_v4()
+    int v = get_vlc2(&s->gb, vlcs[1].vlc.table, VLC_BITS, 3); // four 3 bit magnitude classes, one per sample
+    residual[0] = read_coef_v4(s, v & 7);
+    residual[1] = read_coef_v4(s, (v>>3) & 7);
+    residual[s->tstride] = read_coef_v4(s, (v>>6) & 7);
+    residual[s->tstride+1] = read_coef_v4(s, v>>9);
+}
+
+#define WRITE_COEF_V4(i, coef) { /* appends sample number i to the v, bits and count variables of the expansion site */\
+    int x, a, b, sign;\
+    a = x = (int8_t)(coef);\
+    if(x) {\
+        if(x == -128) a = -127; /* -128 shares the magnitude of -127; an extra bit below tells them apart */\
+        sign = x>>7;\
+        a = (a^sign)-sign; /* absolute value */\
+        b = 1+av_log2(a); /* magnitude class = number of raw bits */\
+        bits <<= b;\
+        bits |= ((a<<1)-(1<<b)) | (sign&1); /* mantissa without its leading 1, then the sign bit */\
+        count += b;\
+        v |= b<<(3*i);\
+        if(x <= -127) {\
+            bits = (bits<<1) | ((x&1)^1); /* 0 for -127, 1 for -128 */\
+            count++;\
+        }\
+    }\
+}
+
+static void write_block_v4(FFV2Context *s, VLCS *vlcs, uint8_t *residual) { // Codes one 2x2 block: the four magnitude classes as one table 1 symbol, then all raw bits
+    int v = 0, bits = 0, count = 0; // filled in by WRITE_COEF_V4
+    WRITE_COEF_V4(0, residual[0]);
+    WRITE_COEF_V4(1, residual[1]);
+    WRITE_COEF_V4(2, residual[s->tstride]);
+    WRITE_COEF_V4(3, residual[s->tstride+1]);
+    put_bits(&s->pb, vlcs[1].len[v], vlcs[1].bits[v]);
+    vlcs[1].stats[v]++;
+    if(count) // NOTE(review): count reaches 32 when all four samples are <= -127; confirm put_bits() and the shifts of int bits allow that
+        put_bits(&s->pb, count, bits);
+}
+
+// ----------
+
+static void read_tree_v2_4x4(FFV2Context *s, VLCS *vlcs, uint8_t *residual) { // Decodes one 4x4 quadrant: flags from table 2, then the flagged 2x2 blocks
+    int cbp = ~get_vlc2(&s->gb, vlcs[2].vlc.table, VLC_BITS, 3); // the coded flags mark empty blocks; inverted, a set bit means coded
+    if(cbp&1) s->read_block(s, vlcs, residual);
+    if(cbp&2) s->read_block(s, vlcs, residual+2);
+    if(cbp&4) s->read_block(s, vlcs, residual+2*s->tstride);
+    if(cbp&8) s->read_block(s, vlcs, residual+2*s->tstride+2);
+}
+
+static void read_tree_v2(FFV2Context *s, VLCS *vlcs, uint8_t *residual) { // Decodes one 8x8 area: quadrant flags from table 3, then the flagged 4x4 quadrants
+    int cbp, y;
+    for(y=0; y<8; y++)
+        *(uint64_t*)(residual+y*s->tstride) = 0; // clear all 8 rows, since uncoded blocks are never stored
+    cbp = ~get_vlc2(&s->gb, vlcs[3].vlc.table, VLC_BITS, 3); // inverted: a set bit means the quadrant is coded
+    if(cbp&1) read_tree_v2_4x4(s, vlcs, residual);
+    if(cbp&2) read_tree_v2_4x4(s, vlcs, residual+4);
+    if(cbp&4) read_tree_v2_4x4(s, vlcs, residual+4*s->tstride);
+    if(cbp&8) read_tree_v2_4x4(s, vlcs, residual+4*s->tstride+4);
+}
+
+// Builds the 16 bit coded-block pattern of an 8x8 area: one flag per 2x2 block,
+// set when any of the four samples of that block is nonzero.
+// The flags are grouped by 4x4 quadrant (bits 0-3 top left, 4-7 top right,
+// 8-11 bottom left, 12-15 bottom right); inside a quadrant the 2x2 blocks
+// are numbered left to right, top to bottom.
+static int cbp_8x8_zigzag(uint8_t *res, int stride) {
+    // bit number of the 2x2 block at [row][col], both counted in 2x2 blocks
+    static const uint8_t bit_of[4][4] = {
+        { 0,  1,  4,  5},
+        { 2,  3,  6,  7},
+        { 8,  9, 12, 13},
+        {10, 11, 14, 15},
+    };
+    int pattern = 0, row, col;
+    for(row=0; row<4; row++, res+=2*stride)
+        for(col=0; col<4; col++) {
+            uint16_t upper = *(uint16_t*)(res+2*col);
+            uint16_t lower = *(uint16_t*)(res+stride+2*col);
+            if(upper | lower)
+                pattern |= 1 << bit_of[row][col];
+        }
+    return pattern;
+}
+
+static void write_tree_v2(FFV2Context *s, VLCS *vlcs, uint8_t *residual) { // Codes one 8x8 area: quadrant flags (table 3), per-quadrant block flags (table 2), then the nonzero 2x2 blocks
+    int cbp, v, i;
+    cbp = cbp_8x8_zigzag(residual, s->tstride); // 4 bits per quadrant, set = block has a nonzero sample
+    v = !(cbp&0xf) + 2*!(cbp&0xf0) + 4*!(cbp&0xf00) + 8*!(cbp&0xf000); // bit set = quadrant is all zero
+    put_bits(&s->pb, vlcs[3].len[v], vlcs[3].bits[v]);
+    vlcs[3].stats[v]++;
+    for(i=0; cbp; i++, cbp>>=4) { // i = quadrant number; stops once no flags remain
+        if(cbp&15) {
+            uint8_t *r = residual+4*(i&1)+4*(i>>1)*s->tstride; // top left sample of quadrant i
+            v = (cbp&15)^15; // inverted like the quadrant flags: bit set = block is all zero
+            put_bits(&s->pb, vlcs[2].len[v], vlcs[2].bits[v]);
+            vlcs[2].stats[v]++;
+            if(cbp&1) s->write_block(s, vlcs, r);
+            if(cbp&2) s->write_block(s, vlcs, r+2);
+            if(cbp&4) s->write_block(s, vlcs, r+2*s->tstride);
+            if(cbp&8) s->write_block(s, vlcs, r+2*s->tstride+2);
+        }
+    }
+}
+
+// ----------
+
+static void read_tree_v3(FFV2Context *s, VLCS *vlcs, uint8_t *residual) { // Decodes one 8x4 area: 8 block flags from table 2, then the flagged 2x2 blocks
+    int cbp, y;
+    for(y=0; y<4; y++)
+        *(uint64_t*)(residual+y*s->tstride) = 0; // clear all 4 rows, since uncoded blocks are never stored
+    cbp = get_vlc2(&s->gb, vlcs[2].vlc.table, VLC_BITS, 3); // not inverted here: a set bit means the block is coded
+    if(cbp&0x01) s->read_block(s, vlcs, residual);
+    if(cbp&0x02) s->read_block(s, vlcs, residual+2);
+    if(cbp&0x04) s->read_block(s, vlcs, residual+4);
+    if(cbp&0x08) s->read_block(s, vlcs, residual+6);
+    if(cbp&0x10) s->read_block(s, vlcs, residual+2*s->tstride);
+    if(cbp&0x20) s->read_block(s, vlcs, residual+2*s->tstride+2);
+    if(cbp&0x40) s->read_block(s, vlcs, residual+2*s->tstride+4);
+    if(cbp&0x80) s->read_block(s, vlcs, residual+2*s->tstride+6);
+}
+
+// Coded-block pattern of an 8x4 area: bit 4*row+col is set when the 2x2 block at (row, col) holds a nonzero sample.
+static int cbp_8x4_raster(uint8_t *res, int stride) {
+    int pattern = 0, row, col;
+    // two rows of four 2x2 blocks; each block is checked with one 16 bit load per sample row
+    for(row=0; row<2; row++, res+=2*stride)
+        for(col=0; col<4; col++) {
+            uint16_t upper = *(uint16_t*)(res+2*col);
+            uint16_t lower = *(uint16_t*)(res+stride+2*col);
+            if(upper | lower)
+                pattern |= 1 << (4*row+col);
+        }
+    return pattern;
+}
+
+static void write_tree_v3(FFV2Context *s, VLCS *vlcs, uint8_t *residual) { // Codes one 8x4 area: 8 block flags as one table 2 symbol, then the nonzero 2x2 blocks
+    int cbp, i;
+    cbp = cbp_8x4_raster(residual, s->tstride);
+    put_bits(&s->pb, vlcs[2].len[cbp], vlcs[2].bits[cbp]);
+    vlcs[2].stats[cbp]++;
+    for(i=0; cbp; i++, cbp>>=1) // i = block number in raster order, four blocks per row
+        if(cbp&1)
+            s->write_block(s, vlcs, residual+2*(i&3)+2*(i>>2)*s->tstride);
+}
+
+// ----------
+
+static void read_lines(FFV2Context *s, uint8_t *buf, int width, int height, int plane) { // Decodes a width x height band of residuals into buf with the coder chosen by coder_tree
+    int col, row, tree = s->coder_tree;
+    if(tree < 0 || tree > 3)
+        abort();
+    if(tree == 0)
+        for(row=0; row<height; row++)
+            s->read_line(s, s->vlcs[plane], buf+row*s->tstride, width);
+    else if(tree == 2)
+        for(col=0; col<width; col+=8)
+            read_tree_v2(s, s->vlcs[plane], buf+col);
+    else if(tree == 3)
+        for(row=0; row<height; row+=4)
+            for(col=0; col<width; col+=8)
+                read_tree_v3(s, s->vlcs[plane], buf+col+row*s->tstride);
+    else { // tree == 1: plain 2x2 blocks without a tree
+        if(s->coder_block == 0)
+            memset(buf, 0, 8*s->tstride); // block coder 0 stores only the nonzero samples
+        for(row=0; row<height; row+=2)
+            for(col=0; col<width; col+=2)
+                s->read_block(s, s->vlcs[plane], buf+col+row*s->tstride);
+    }
+}
+
+static void write_lines(FFV2Context *s, uint8_t *buf, int width, int height, int plane) { // Codes a width x height band of residuals from buf with the coder chosen by coder_tree
+    int col, row, tree = s->coder_tree;
+    if(tree < 0 || tree > 3)
+        abort();
+    // tree 0: whole lines, 2: 8x8 quadtrees, 3: 8x4 areas, 1: plain 2x2 blocks
+    if(tree == 0)
+        for(row=0; row<height; row++)
+            s->write_line(s, s->vlcs[plane], buf+row*s->tstride, width);
+    else if(tree == 2)
+        for(col=0; col<width; col+=8)
+            write_tree_v2(s, s->vlcs[plane], buf+col);
+    else if(tree == 3)
+        for(row=0; row<height; row+=4)
+            for(col=0; col<width; col+=8)
+                write_tree_v3(s, s->vlcs[plane], buf+col+row*s->tstride);
+    else
+        for(row=0; row<height; row+=2)
+            for(col=0; col<width; col+=2)
+                s->write_block(s, s->vlcs[plane], buf+col+row*s->tstride);
+}
+
+static void read_mvd(FFV2Context *s, int16_t *mv, int plane) { // Reads one motion vector difference and adds it to mv[0] and mv[1]
+    VLCS *vlcs = &s->vlcs[plane*2+1][4]; // table 4 of the P tables of this plane
+    int v = get_vlc2(&s->gb, vlcs->vlc.table, VLC_BITS, 3); // low 4 bits: first component, next 4 bits: second component
+    int d;
+    mv[0] += (v&15)<15 ? (v&15)-7 // 0..14 mean -7..7
+           : (d=(int8_t)get_bits(&s->gb, 8)) != -128 ? d // 15 escapes to 8 bits
+           : (int16_t)get_bits(&s->gb, 16); // and an 8 bit value of -128 escapes to 16 bits
+    v >>= 4;
+    mv[1] += (v&15)<15 ? (v&15)-7
+           : (d=(int8_t)get_bits(&s->gb, 8)) != -128 ? d
+           : (int16_t)get_bits(&s->gb, 16);
+}
+
+static void write_mvd(FFV2Context *s, int mvdx, int mvdy, int plane) { // Codes one motion vector difference: one joint symbol, then escapes for large components
+    VLCS *vlcs = &s->vlcs[plane*2+1][4]; // table 4 of the P tables of this plane
+    int x = FFMIN(15, (unsigned)(mvdx+7)); // -7..7 become 0..14; everything else, negatives included, becomes 15
+    int y = FFMIN(15, (unsigned)(mvdy+7));
+    int v = x+(y<<4);
+    put_bits(&s->pb, vlcs->len[v], vlcs->bits[v]);
+    vlcs->stats[v]++;
+    if(x==15) {
+        if((unsigned)(mvdx+127) < 255) // fits in -127..127: 8 bits
+            put_bits(&s->pb, 8, mvdx&0xff);
+        else // the 8 bit value -128 as escape marker, then 16 bits
+            put_bits(&s->pb, 24, (128<<16)+(mvdx&0xffff));
+    }
+    if(y==15) {
+        if((unsigned)(mvdy+127) < 255)
+            put_bits(&s->pb, 8, mvdy&0xff);
+        else
+            put_bits(&s->pb, 24, (128<<16)+(mvdy&0xffff));
+    }
+}
+
+#define proc_table_sparse(id, size, map) { /* reads or writes table id of the current plane; relies on s, plane, read and err at the expansion site */\
+    if(read) err |= read_table(s, &s->vlcs[plane][id], size, map);\
+    else write_table(s, &s->vlcs[plane][id], size);\
+}
+#define proc_table(id, size) proc_table_sparse(id, size, NULL) // same, without a symbol list
+
+static int proc_tables(FFV2Context *s, int read, int plane) { // Reads (read != 0) or writes the VLC tables of one plane and selects the coder functions; returns 0 on success
+    int err = 0;
+    if(!s->fenc.key_frame && !(s->fref.key_frame && (plane&1))) {
+        // I-frames must contain I-tables, and the first P-frame in a GOP must contain P-tables. the rest are optional.
+        int present;
+        if(read) present = get_bits(&s->gb, 8);
+        else     present = *(s->bs++) = !(s->gop&3); // the encoder stores the flag as one byte at s->bs
+        if(!present)
+            return 0;
+    }
+    if(plane&1) { // odd plane index = P tables: only the motion vector table
+        proc_table(4, 256);
+        return err;
+    }
+    if(s->coder_type == 1) {
+        s->read_line = read_line_v0;
+        s->write_line = write_line_v0;
+        proc_table(0, 256);
+    } else if(s->coder_type == 2) {
+        s->read_line = read_line_v1;
+        s->write_line = write_line_v1;
+        proc_table(0, 512); // 256 single samples plus 256 pair symbols
+        proc_table(1, 240);
+    } else if(s->coder_type == 3) {
+        s->read_line = read_line_v2;
+        s->write_line = write_line_v2;
+        proc_table(0, 495);
+        proc_table(1, 241);
+    } else {
+        switch(s->coder_block) { // tables 0 and 1 belong to the 2x2 block coder
+            case 0: // merge 0s
+                s->read_block = read_block_v0;
+                s->write_block = write_block_v0;
+                proc_table(0, 255);
+                proc_table(1, 15 + (s->coder_tree==1)); // the all-zero symbol exists only without a tree
+                break;
+            case 1: // merge [-1,1]
+                s->read_block = read_block_v1;
+                s->write_block = write_block_v1;
+                proc_table(0, 253);
+                proc_table(1, 256); // should be 255 in the tree case, but the missing value is in the middle
+                break;
+            case 2: // merge [-2,2]
+                s->read_block = read_block_v2;
+                s->write_block = write_block_v2;
+                proc_table(0, 251);
+                proc_table_sparse(1, 1296, map_coder6); // 6^4 symbols, mapped to four 3 bit fields on decode
+                break;
+            case 3: // merge [-3,3]
+                s->read_block = read_block_v3;
+                s->write_block = write_block_v3;
+                proc_table(0, 249);
+                proc_table(1, 4096);
+                break;
+            case 4: // merge log2
+                s->read_block = read_block_v4;
+                s->write_block = write_block_v4;
+                proc_table(1, 4096); // no table 0: mantissas and signs are raw bits
+                break;
+            default:
+                return -2;
+        }
+        switch(s->coder_tree) { // tables 2 and 3 belong to the tree coder
+            case 1: // no tree
+                break;
+            case 2: // 8x8 quadtree
+                proc_table(2, 15);
+                proc_table(3, 16);
+                break;
+            case 3: // 8x4 octree
+                proc_table(2, 256);
+                break;
+            default:
+                return -2;
+        }
+    }
+    return err;
+}
+
+static int sum_abs_int8(int8_t *buf, intptr_t stride) { // Returns the sum of absolute values over an 8x8 block of signed bytes
+    int sum=0;
+#if HAVE_MMX
+    DECLARE_ALIGNED_8(static const uint64_t, pb_128) = 0x8080808080808080ULL;
+    __asm__ volatile( // NOTE(review): no clobbers are listed for the mm registers used here
+        "movq  %4,        %%mm7 \n" // mm7 = 0x80 in every byte
+        "movq  (%0),      %%mm0 \n" // rows 0..3
+        "movq  (%0,%2),   %%mm1 \n"
+        "movq  (%0,%2,2), %%mm2 \n"
+        "movq  (%0,%3),   %%mm3 \n"
+        "lea   (%0,%2,4), %0    \n" // advance buf by four rows
+        "paddb     %%mm7, %%mm0 \n" // add 0x80 to every byte
+        "paddb     %%mm7, %%mm1 \n"
+        "paddb     %%mm7, %%mm2 \n"
+        "paddb     %%mm7, %%mm3 \n"
+        "psadbw    %%mm7, %%mm0 \n" // sum of absolute differences against 0x80
+        "psadbw    %%mm7, %%mm1 \n"
+        "psadbw    %%mm7, %%mm2 \n"
+        "psadbw    %%mm7, %%mm3 \n"
+        "paddw     %%mm2, %%mm0 \n"
+        "paddw     %%mm3, %%mm1 \n"
+        "movq  (%0),      %%mm4 \n" // rows 4..7
+        "movq  (%0,%2),   %%mm5 \n"
+        "movq  (%0,%2,2), %%mm2 \n"
+        "movq  (%0,%3),   %%mm3 \n"
+        "paddb     %%mm7, %%mm4 \n"
+        "paddb     %%mm7, %%mm5 \n"
+        "paddb     %%mm7, %%mm2 \n"
+        "paddb     %%mm7, %%mm3 \n"
+        "psadbw    %%mm7, %%mm4 \n"
+        "psadbw    %%mm7, %%mm5 \n"
+        "psadbw    %%mm7, %%mm2 \n"
+        "psadbw    %%mm7, %%mm3 \n"
+        "paddw     %%mm4, %%mm0 \n" // accumulate the eight row sums
+        "paddw     %%mm5, %%mm1 \n"
+        "paddw     %%mm2, %%mm0 \n"
+        "paddw     %%mm3, %%mm1 \n"
+        "paddw     %%mm1, %%mm0 \n"
+        "movd      %%mm0, %1    \n"
+        :"+&r"(buf), "=r"(sum)
+        :"r"(stride), "r"(stride*3), "m"(pb_128)
+    );
+#else
+    int x, y;
+    for(y=0; y<8; y++, buf+=stride) // plain C version of the same sum
+        for(x=0; x<8; x++)
+            sum += abs(buf[x]);
+#endif
+    return sum;
+}
+
+static void sub_block(uint8_t *dst, uint8_t *src, uint8_t *ref, intptr_t stride) {
+    __asm__ volatile(
+        "movq  (%1),      %%mm0 \n"
+        "movq  (%1,%3),   %%mm1 \n"
+        "movq  (%1,%3,2), %%mm2 \n"
+        "movq  (%1,%4),   %%mm3 \n"
+        "psubb (%2),      %%mm0 \n"
+        "psubb (%2,%3),   %%mm1 \n"
+        "psubb (%2,%3,2), %%mm2 \n"
+        "psubb (%2,%4),   %%mm3 \n"
+        "movq  %%mm0, (%0)      \n"
+        "movq  %%mm1, (%0,%3)   \n"
+        "movq  %%mm2, (%0,%3,2) \n"
+        "movq  %%mm3, (%0,%4)   \n"
+        "lea   (%0,%3,4), %0    \n"
+        "lea   (%1,%3,4), %1    \n"
+        "lea   (%2,%3,4), %2    \n"
+        "movq  (%1),      %%mm0 \n"
+        "movq  (%1,%3),   %%mm1 \n"
+        "movq  (%1,%3,2), %%mm2 \n"
+        "movq  (%1,%4),   %%mm3 \n"
+        "psubb (%2),      %%mm0 \n"
+        "psubb (%2,%3),   %%mm1 \n"
+        "psubb (%2,%3,2), %%mm2 \n"
+        "psubb (%2,%4),   %%mm3 \n"
+        "movq  %%mm0, (%0)      \n"
+        "movq  %%mm1, (%0,%3)   \n"
+        "movq  %%mm2, (%0,%3,2) \n"
+        "movq  %%mm3, (%0,%4)   \n"
+        :"+&r"(dst), "+&r"(src), "+&r"(ref)
+        :"r"(stride), "r"(stride*3)
+        :"memory"
+    );
+}
+
+#define CHECK_MV(x, y) {/* cost of candidate integer vector (x,y); expects s, src, ref, stride, bcost, bmx, bmy in the caller's scope */\
+    int cost = s->dsp.sad[1](NULL, src, ref+(x)+(y)*stride, stride, 8);/* 8-row SAD of src against ref displaced by (x,y) */\
+    COPY3_IF_LT(bcost, cost, bmx, x, bmy, y);/* presumably: if cost < bcost, take cost and (x,y) as the new best -- macro is defined elsewhere */\
+}
+
+static void motion_search(FFV2Context *s, uint8_t *src, uint8_t *ref, int stride, int16_t (*mv_plane)[2], int mv_stride, int16_t *mv_out, int *mv_min, int *mv_max) { // finds a motion vector for the block at src against ref: the integer vector is stored in mv_plane[0], the half-sample vector in mv_out
+    int x, y, bmx=0, bmy=0, omx, omy, inside; // (bmx,bmy) = best vector so far, (omx,omy) = centre of the current refinement step
+    int bcost = s->dsp.sad[1](NULL, src, ref, stride, 8); // cost of the zero vector
+    for(y=-1; y<=1; y++) // try the vectors stored in the 3x3 neighbourhood of mv_plane[0] as candidates
+        for(x=-1; x<=1; x++) {
+            int mx = av_clip(mv_plane[x+y*mv_stride][0], mv_min[0]+1, mv_max[0]-1); // keep candidates strictly inside the allowed range
+            int my = av_clip(mv_plane[x+y*mv_stride][1], mv_min[1]+1, mv_max[1]-1);
+            if((mx|my) && ((mx-bmx)|(my-bmy))) // skip the zero vector and the current best
+                CHECK_MV(mx, my);
+        }
+    while(1) { // greedy refinement: test the four integer neighbours until none of them improves
+        omx=bmx; omy=bmy;
+        CHECK_MV(omx-1, omy);
+        CHECK_MV(omx+1, omy);
+        CHECK_MV(omx, omy-1);
+        CHECK_MV(omx, omy+1);
+        if(bmx==omx && bmy==omy) // no neighbour was better
+            break;
+        if(bmx<=mv_min[0] || bmx>=mv_max[0] || bmy<=mv_min[1] || bmy>=mv_max[1]) // reached the border of the search range
+            break;
+    }
+    mv_plane[0][0] = bmx; // remember the integer vector; it is read back through the neighbourhood scan above
+    mv_plane[0][1] = bmy;
+    inside = bmx>mv_min[0] && bmx<mv_max[0] && bmy>mv_min[1] && bmy<mv_max[1]; // half-sample refinement needs one sample of margin on every side
+    omx = bmx *= 2; // always convert to half-sample units: callers use mv_out as (mv>>1, mv&1) even when refinement is skipped
+    omy = bmy *= 2;
+    if(inside) {
+        uint8_t *r = ref+omx/2+(omy/2)*stride; // omx and omy are even here, so this is the integer-vector position
+        int cost; // pix_abs[1][1], [1][2], [1][3] presumably interpolate in x, in y and in both -- verify against dsputil
+        cost = s->dsp.pix_abs[1][1](NULL, src, r-1, stride, 8);
+        COPY3_IF_LT(bcost, cost, bmx, omx-1, bmy, omy);
+        cost = s->dsp.pix_abs[1][1](NULL, src, r, stride, 8);
+        COPY3_IF_LT(bcost, cost, bmx, omx+1, bmy, omy);
+        cost = s->dsp.pix_abs[1][2](NULL, src, r-stride, stride, 8);
+        COPY3_IF_LT(bcost, cost, bmx, omx, bmy, omy-1);
+        cost = s->dsp.pix_abs[1][2](NULL, src, r, stride, 8);
+        COPY3_IF_LT(bcost, cost, bmx, omx, bmy, omy+1);
+        cost = s->dsp.pix_abs[1][3](NULL, src, r-stride-1, stride, 8);
+        COPY3_IF_LT(bcost, cost, bmx, omx-1, bmy, omy-1);
+        cost = s->dsp.pix_abs[1][3](NULL, src, r-stride, stride, 8);
+        COPY3_IF_LT(bcost, cost, bmx, omx+1, bmy, omy-1);
+        cost = s->dsp.pix_abs[1][3](NULL, src, r-1, stride, 8);
+        COPY3_IF_LT(bcost, cost, bmx, omx-1, bmy, omy+1);
+        cost = s->dsp.pix_abs[1][3](NULL, src, r, stride, 8);
+        COPY3_IF_LT(bcost, cost, bmx, omx+1, bmy, omy+1);
+    }
+    mv_out[0] = bmx; // half-sample units in every case
+    mv_out[1] = bmy;
+}
+
+static void add_ffv2_median_prediction(uint8_t *dst, uint8_t *top, uint8_t *residual, int width, int l, int tl) { // decoder side: dst[i] = prediction from left (l), top and top-left (tl) plus residual[i], for width samples
+    int i, up; // up = sample above the current position, read as a signed byte
+    for(i=0; i<width; i++, tl=up) { // the current top sample becomes the next top-left
+        up = (int8_t)top[i];
+        l += mid_pred(0, (int8_t)(up-l), (int8_t)(up-tl)) + residual[i]; // left + mid_pred(0, top-left, top-topleft) + residual; residual is read before dst is written
+        dst[i] = l; // stored modulo 256
+        l = (int8_t)l; // carry the reconstructed sample on as a signed byte
+    }
+}
+
+static void sub_ffv2_median_prediction(uint8_t *dst, uint8_t *top, uint8_t *src, int width, int l, int tl) { // encoder side: dst[x] = src[x] - left - median(0, top-left, top-topleft); l and tl are the left and top-left neighbours of sample 0
+    intptr_t x=0;
+#if HAVE_MMX // NOTE(review): pminub/pmaxub are MMX2/SSE-integer instructions -- confirm that HAVE_MMX is a strict enough guard
+    DECLARE_ALIGNED_8(static const uint64_t, pb_128) = 0x8080808080808080ULL;
+    __asm__ volatile( // eight samples per iteration; at x=0 it reads src[-1] and top[-1] and ignores l/tl (sample 0 is redone below)
+        "movq         %5, %%mm7 \n" // mm7 = 0x80 in every byte
+        "1: \n"
+        "movq    (%2,%0), %%mm0 \n" // t = top[x..x+7]
+        "paddb     %%mm7, %%mm0 \n"
+        "movq  -1(%1,%0), %%mm4 \n" // l = src[x-1..x+6]
+        "movq      %%mm0, %%mm1 \n"
+        "psubb -1(%2,%0), %%mm0 \n" // t-tl+128
+        "psubb     %%mm4, %%mm1 \n" // t-l+128
+        "movq    (%1,%0), %%mm5 \n" // v = src[x..x+7]
+        "movq      %%mm0, %%mm2 \n"
+        "pminub    %%mm7, %%mm0 \n" // min(t-tl, 0), in the +128 biased domain
+        "pmaxub    %%mm7, %%mm2 \n" // max(t-tl, 0), in the +128 biased domain
+        "pmaxub    %%mm0, %%mm1 \n"
+        "pminub    %%mm2, %%mm1 \n" // median(0, t-l, t-tl)+128
+        "paddb     %%mm7, %%mm5 \n"
+        "psubb     %%mm4, %%mm5 \n" // v-l+128
+        "psubb     %%mm1, %%mm5 \n" // v-l-median(0,t-l,t-tl)
+        "movq      %%mm5, (%3,%0) \n"
+        "add $8, %0 \n"
+        "cmp %4, %0 \n"
+        "jb 1b \n" // the loop body runs at least once and always stores whole groups of 8 bytes
+        : "+r" (x)
+        : "r"(src), "r"(top), "r"(dst), "r"((intptr_t)width), "m"(pb_128) : "memory" // dst is stored through a register operand, so the compiler has to be told
+    );
+    dst[0] = src[0] - l - mid_pred(0, (int8_t)(top[0]-l), (int8_t)(top[0]-tl)); // redo sample 0 with the neighbours the caller asked for
+#else
+    for(x=0; x<width; x++) { // portable path: exactly width samples, neighbours carried along in l and tl
+        int t = (int8_t)top[x];
+        int v = (int8_t)src[x];
+        dst[x] = v - l - mid_pred(0, (int8_t)(t-l), (int8_t)(t-tl));
+        l = v;
+        tl = t;
+    }
+#endif
+}
+
+DECLARE_ALIGNED_16(static uint8_t, zero[4096]); // all-zero row that defilter_lines uses as the "top" neighbour of a plane's first row; only read in the code shown here. FIXME size: rows wider than 4096 samples would read past the end
+
+static void defilter_lines(FFV2Context *s, uint8_t *dst, uint8_t *src, uint8_t *ref, int stride, int width, int height, int y0) { // decoder side: undoes the median filter on a band of height rows of residual (src) and writes reconstructed samples to dst; y0 != 0 means a row above the band exists
+    int x, y, v, i;
+    uint8_t *topi = y0 ? dst-stride : zero; // row above in the reconstructed picture (all zero for the first band)
+    uint8_t *topp = y0 ? src-stride : zero; // row above in the residual buffer (all zero for the first band)
+    if(s->fenc.key_frame) { // key frame: whole rows are predicted from reconstructed samples; ref is not used
+        for(y=0; y<height; y++) {
+            uint8_t *pd = dst + y*stride;
+            uint8_t *ps = src + y*stride;
+            v = topi[0]; // left and top-left of sample 0 are both taken from the sample above it
+            add_ffv2_median_prediction(pd, topi, ps, width, v, v);
+            topi = pd;
+        }
+    } else { // non-key frame: each 8-sample group is handled according to s->mb_types; ref holds the inter prediction
+        for(y=0; y<height; y++, topi=dst, topp=src, dst+=stride, src+=stride, ref+=stride) {
+            for(x=0; x<width; x+=8) { // always works on full groups of 8, so up to 7 samples past width are touched
+                if(!s->mb_types[x>>3]) {
+                    // filter an intra block, with decoded samples as neighbors.
+                    // then compute what would have been the inter residual, and write that back to the residual buffer.
+                    int l = x ? dst[x-1] : 0;
+                    int tl = x ? topi[x-1] : 0;
+                    add_ffv2_median_prediction(dst+x, topi+x, src+x, 8, l, tl);
+                    for(i=0; i<8; i++)
+                        src[x+i] = dst[x+i] - ref[x+i]; // keeps the residual buffer usable as neighbour data for inter blocks
+                } else {
+                    // filter an inter block, with inter residuals as neighbors, and add inter prediction.
+                    int l = x ? src[x-1] : 0;
+                    int tl = x ? topp[x-1] : 0;
+                    add_ffv2_median_prediction(src+x, topp+x, src+x, 8, l, tl); // in place: src is both residual input and output
+                    for(i=0; i<8; i++)
+                        dst[x+i] = src[x+i] + ref[x+i];
+                }
+            }
+        }
+    }
+}
+
+static void enfilter_lines(FFV2Context *s, uint8_t *dst, uint8_t *src, int stride, int width, int height, int y0) { // encoder side: median-filters a band of height rows of src into dst and zero-pads dst to 8 rows and to a multiple of 8 columns; y0 == 0 marks the first band of a plane
+    int y=0, v;
+    if(!y0) { // first row of the plane has no row above: predict from the left neighbour only
+        s->dsp.diff_bytes(dst, src, src-1, width); // presumably dst[i] = src[i] - src[i-1]; note this reads src[-1], one byte before the row
+        dst[0] = src[0]; // sample 0 is stored unpredicted
+        y = 1;
+    }
+    for(; y<height; y++) {
+        uint8_t *pd = dst + y*stride;
+        uint8_t *ps = src + y*stride;
+        v = ps[-stride]; // left and top-left of sample 0 are both taken from the sample above it
+        sub_ffv2_median_prediction(pd, ps-stride, ps, width, v, v);
+    }
+    if(height < 8) // short last band: clear the missing rows
+        memset(dst+height*stride, 0, (8-height)*stride);
+    if(width & 7) // width not a multiple of 8: clear the tail of the last group in every row
+        for(y=0; y<8; y++)
+            memset(dst+y*stride+width, 0, 8-(width&7));
+}
+
+static void enfilter_pblock(FFV2Context *s, uint8_t *dst, uint8_t *src, int stride) { // encoder side: median-filters one 8x8 block of src into dst, taking the left/top neighbours from src itself
+    int row;
+    for(row=0; row<8; row++) {
+        uint8_t *cur = src + row*stride; // current input row
+        int left = cur[-1], topleft = cur[-stride-1]; // neighbours just outside the block
+        sub_ffv2_median_prediction(dst + row*stride, cur-stride, cur, 8, left, topleft);
+    }
+}
+
+static int decode_plane(FFV2Context *s, int plane) { // decodes one plane of the current frame from s->bs into s->fenc and advances s->bs; returns 0, or -1 if the coder tables could not be set up
+    int stride = s->fenc.linesize[plane];
+    int width = s->avctx->width >> !!plane; // planes 1 and 2 are half size in both directions
+    int height = s->avctx->height >> !!plane;
+    int mb_width = (width+7)>>3; // size in 8x8 blocks, rounded up
+    int mb_height = (height+7)>>3;
+    uint8_t *residual = s->temp[0] + stride + 8; // band of residual rows; the row before it serves as its "top" row
+    uint8_t *pmc = s->temp[1] + stride; // motion-compensated prediction for the current band
+    int x, y;
+    s->tstride = stride;
+
+    init_get_bits(&s->gb, s->bs, (s->bs_end - s->bs)*8); // bit reader over the rest of the packet (size given in bits)
+    if(proc_tables(s, 1, plane*2) < 0)
+        return -1;
+    if(!s->fenc.key_frame && proc_tables(s, 1, plane*2+1) < 0) // non-key frames use a second table set
+        return -1;
+
+    if(!s->fenc.key_frame)
+        s->dsp.draw_edges(s->fref.data[plane], stride, width, height, 8); // presumably pads the reference by 8 samples so vectors may point outside the picture
+    memset(residual-stride-8, 0, stride+8); // clear the row above the first residual band
+    memset(s->mvs_top, 0, mb_width*sizeof(*s->mvs_top));
+
+    for(y=0; y<height; y+=8) { // one band of up to 8 rows per iteration
+        uint8_t *dst = s->fenc.data[plane] + y*stride;
+        uint8_t *ref = s->fref.data[plane] + y*stride;
+        if(s->fenc.key_frame) {
+            read_lines(s, residual, width, FFMIN(8,height-y), plane*2);
+            defilter_lines(s, dst, residual, NULL, stride, width, FFMIN(8,height-y), y);
+        } else {
+            for(x=0; x<mb_width; x++) { // per block: type bit, optional vector difference, then motion compensation
+                int mvx, mvy;
+                s->mb_types[x] = get_bits(&s->gb, 1); // 1 = inter block (a vector difference follows), 0 = intra block
+                s->mvs[x][0] = mid_pred(s->mvs[x-1][0], s->mvs_top[x][0], s->mvs_top[x+1][0]); // predictor from left, top and top-right; NOTE(review): indexes -1 and mb_width, assumes padded arrays
+                s->mvs[x][1] = mid_pred(s->mvs[x-1][1], s->mvs_top[x][1], s->mvs_top[x+1][1]);
+                if(s->mb_types[x])
+                    read_mvd(s, s->mvs[x], plane);
+                mvx = av_clip(s->mvs[x][0], -16*x-16, (mb_width-x)*16); // half-sample units, limited to 8 samples outside the block grid
+                mvy = av_clip(s->mvs[x][1], -2*y-16, mb_height*16-2*y);
+                s->dsp.put_pixels_tab[1][(mvx&1)+2*(mvy&1)](pmc+8*x, ref+8*x+(mvx>>1)+(mvy>>1)*stride, stride, 8); // low bits pick the half-sample variant, the rest is the integer offset
+            }
+            read_lines(s, residual, width, FFMIN(8,height-y), plane*2);
+            defilter_lines(s, dst, residual, pmc, stride, width, FFMIN(8,height-y), y);
+            memcpy(residual-stride, residual+7*stride, stride); // last residual row becomes the "top" row of the next band
+            FFSWAP(void*, s->mvs, s->mvs_top); // this band's vectors become the next band's top neighbours
+        }
+    }
+
+    s->bs += (get_bits_count(&s->gb)+7)>>3; // advance past the consumed bits, rounded up to a byte; NOTE(review): nothing here checks against s->bs_end
+    return 0;
+}
+
+#undef rand
+
+static void encode_plane(FFV2Context *s, int plane) { // encodes one plane of s->fenc into the buffer at s->bs and advances s->bs past the bytes written
+    int stride = s->fenc.linesize[plane];
+    int width = s->avctx->width >> !!plane; // planes 1 and 2 are half size in both directions
+    int height = s->avctx->height >> !!plane;
+    int mb_width = (width+7)>>3; // size in 8x8 blocks, rounded up
+    int mb_height = (height+7)>>3;
+    int mv_stride = mb_width+2;
+    uint8_t *imed = s->temp[0] + stride;  ///< median-filtered intra samples
+    uint8_t *pmc = s->temp[1] + stride;   ///< inter prediction
+    uint8_t *pdiff = s->temp[2] + stride + 8; ///< inter residual
+    uint8_t *pmed = s->temp[3] + stride;  ///< median-filtered inter residual
+    int x, y;
+    s->tstride = stride;
+
+    proc_tables(s, 0, plane*2); // NOTE(review): runs before init_put_bits here, but after init_get_bits in decode_plane -- confirm the ordering is intended
+    if(!s->fenc.key_frame)
+        proc_tables(s, 0, plane*2+1);
+    init_put_bits(&s->pb, s->bs, s->bs_end - s->bs); // init_put_bits takes the buffer size in bytes (unlike init_get_bits, which takes bits)
+
+    if(!s->fenc.key_frame)
+        s->dsp.draw_edges(s->fref.data[plane], stride, width, height, 8); // presumably pads the reference by 8 samples so vectors may point outside the picture
+    memset(pdiff-stride-8, 0, 9*stride+8); // clear the inter residual band together with the row above it
+    memset(s->mvs_top, 0, mb_width*sizeof(*s->mvs_top));
+
+    for(y=0; y<height; y+=8) { // one band of up to 8 rows per iteration
+        uint8_t *src = s->fenc.data[plane] + y*stride;
+        uint8_t *ref = s->fref.data[plane] + y*stride;
+        enfilter_lines(s, imed, src, stride, width, FFMIN(8,height-y), y); // intra candidate for the whole band
+        if(!s->fenc.key_frame) {
+            int16_t (*mv_plane)[2] = s->mv_plane[!!plane] + (y>>3)*mv_stride; // row of stored integer vectors; planes 1 and 2 share one array
+            int mvx, mvy, mvpx, mvpy, isad, psad, type;
+            int mv_min[2] = {0, -y-8}; // search range in integer samples: up to 8 samples outside the block grid
+            int mv_max[2] = {0, mb_height*8-y};
+            for(x=0; x<mb_width; x++) {
+                mv_min[0] = -8*x-8;
+                mv_max[0] = (mb_width-x)*8;
+                mvpx = mid_pred(s->mvs[x-1][0], s->mvs_top[x][0], s->mvs_top[x+1][0]); // same predictor as the decoder: left, top, top-right
+                mvpy = mid_pred(s->mvs[x-1][1], s->mvs_top[x][1], s->mvs_top[x+1][1]);
+                motion_search(s, src+8*x, ref+8*x, stride, mv_plane, mv_stride, s->mvs[x], mv_min, mv_max); // NOTE(review): mv_plane is not offset by x, so every block of the row shares one slot -- looks unintended, confirm against the allocation
+                mvx = s->mvs[x][0];
+                mvy = s->mvs[x][1];
+                s->dsp.put_pixels_tab[1][(mvx&1)+2*(mvy&1)](pmc+8*x, ref+8*x+(mvx>>1)+(mvy>>1)*stride, stride, 8);
+                sub_block(pdiff+8*x, src+8*x, pmc+8*x, stride);
+                enfilter_pblock(s, pmed+8*x, pdiff+8*x, stride);
+                isad = sum_abs_int8(imed+8*x, stride); // cost estimate of the intra residual
+                psad = sum_abs_int8(pmed+8*x, stride); // cost estimate of the inter residual
+                type = s->mb_types[x] = psad < isad; // 1 = inter, 0 = intra
+                put_bits(&s->pb, 1, type);
+                if(type) {
+                    write_mvd(s, mvx-mvpx, mvy-mvpy, plane);
+                    s->dsp.put_pixels_tab[1][0](imed+8*x, pmed+8*x, stride, 8); // the inter residual replaces the intra one in the band that gets written
+                } else {
+                    s->mvs[x][0] = mvpx; // intra blocks carry the predicted vector, as in the decoder
+                    s->mvs[x][1] = mvpy;
+                    mvx = av_clip(mvpx, 2*mv_min[0], 2*mv_max[0]); // same clipping as decode_plane
+                    mvy = av_clip(mvpy, 2*mv_min[1], 2*mv_max[1]);
+                    s->dsp.put_pixels_tab[1][(mvx&1)+2*(mvy&1)](pmc+8*x, ref+8*x+(mvx>>1)+(mvy>>1)*stride, stride, 8);
+                    sub_block(pdiff+8*x, src+8*x, pmc+8*x, stride); // residual neighbours for later inter blocks must match what the decoder derives
+                }
+            }
+            memcpy(pdiff-stride, pdiff+7*stride, stride); // last residual row becomes the "top" row of the next band
+            FFSWAP(void*, s->mvs, s->mvs_top); // this band's vectors become the next band's top neighbours
+        }
+        write_lines(s, imed, width, FFMIN(8,height-y), plane*2);
+    }
+
+    s->bs += (put_bits_count(&s->pb)+7)>>3; // advance past the written bits, rounded up to a byte
+    flush_put_bits(&s->pb);
+}
+
+static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, const uint8_t *buf, int buf_size) {
+    FFV2Context *s = avctx->priv_data;
+    int err = 0;
+    s->bs = buf;
+    s->bs_end = buf + buf_size;
+
+    if(s->bs[0]) {
+        s->fenc.key_frame = 1;
+        s->fenc.pict_type = FF_I_TYPE;
+        s->coder_type = s->bs[0];
+        s->coder_tree = s->coder_type >> 3;
+        s->coder_block = s->coder_type & 7;
+        memset(s->mb_types, 0, s->mb_width);
+    } else {
+        FFSWAP(AVFrame, s->fenc, s->fref);
+        s->fenc.key_frame = 0;
+        s->fenc.pict_type = FF_P_TYPE;
+    }
+    s->bs++;
+
+    err |= decode_plane(s, 0);
+    err |= decode_plane(s, 1);
+    err |= decode_plane(s, 2);
+
+    *(AVFrame*)data = s->fenc;
+    *data_size = sizeof(s->fenc);
+    return err ? err : s->bs - buf;
+}
+
+static int encode_frame(AVCodecContext *avctx, uint8_t *buf, int buf_size, void *data) { // encodes the picture in data into buf; returns the number of bytes written, or -1 if fewer than 4 bytes of buf are left afterwards
+    FFV2Context *s = avctx->priv_data;
+    s->bs = buf;
+    s->bs_end = buf + buf_size;
+
+    if(s->gop >= avctx->gop_size) // s->gop counts frames since the last key frame
+        s->gop = 0;
+    if(s->gop == 0) {
+        memset(s->mb_types, 0, s->mb_width); // key frame: every block is intra
+    } else {
+        FFSWAP(AVFrame, s->fenc, s->fref); // the previously coded frame becomes the reference
+    }
+    s->fenc.key_frame = !s->gop;
+    s->fenc.pict_type = s->fenc.key_frame ? FF_I_TYPE : FF_P_TYPE;
+    *(s->bs++) = s->fenc.key_frame ? s->coder_type : 0; // header byte: coder type for key frames, 0 otherwise (decode_frame tests it for non-zero)
+
+    // FIXME skip in intra-only mode
+    av_picture_copy((AVPicture*)&s->fenc, (AVPicture*)data, PIX_FMT_YUV420P, avctx->width, avctx->height);
+
+    encode_plane(s, 0);
+    encode_plane(s, 1);
+    encode_plane(s, 2);
+
+    if(!s->initted_vlc[s->fenc.key_frame]) { // first frame of this kind: run a second pass over the same input
+        // re-encode with updated vlc
+        // FIXME do this whenever stats change enough?
+        s->initted_vlc[s->fenc.key_frame] = 1;
+        s->bs = buf+1; // restart right after the header byte, overwriting the first pass
+        encode_plane(s, 0);
+        encode_plane(s, 1);
+        encode_plane(s, 2);
+    }
+
+    s->gop++;
+    avctx->coded_frame = &s->fenc;
+    if(s->bs_end - s->bs < 4) // NOTE(review): this runs after all writes, so it reports a (near-)overflow rather than preventing one
+        return -1;
+    return s->bs - buf;
+}
+
+AVCodec ffv2_decoder = { // decoder registration; the initializers are positional, see AVCodec in avcodec.h for the slot order
+    "ffv2", // codec name
+    CODEC_TYPE_VIDEO,
+    CODEC_ID_FFV2,
+    sizeof(FFV2Context), // size of the private context
+    decode_init,
+    NULL, // presumably the encode callback slot (ffv2_encoder puts encode_frame here) -- unused for a decoder
+    decode_end,
+    decode_frame,
+    CODEC_CAP_DR1, // NOTE(review): no get_buffer() call is visible in decode_frame/decode_plane -- confirm this capability is justified
+    NULL,
+};
+
+AVCodec ffv2_encoder = { // encoder registration; the leading initializers are positional, see AVCodec in avcodec.h for the slot order
+    "ffv2", // codec name
+    CODEC_TYPE_VIDEO,
+    CODEC_ID_FFV2,
+    sizeof(FFV2Context), // size of the private context
+    encode_init,
+    encode_frame,
+    encode_end,
+    .pix_fmts = (enum PixelFormat[]){PIX_FMT_YUV420P, PIX_FMT_NONE}, // the list has to end with PIX_FMT_NONE: users walk it until they find the terminator
+};
+
diff --git a/libavformat/riff.c b/libavformat/riff.c
index dfdc0fd..5e4772e 100644
--- a/libavformat/riff.c
+++ b/libavformat/riff.c
@@ -148,6 +148,7 @@ const AVCodecTag codec_bmp_tags[] = {
     { CODEC_ID_ASV2,         MKTAG('A', 'S', 'V', '2') },
     { CODEC_ID_VCR1,         MKTAG('V', 'C', 'R', '1') },
     { CODEC_ID_FFV1,         MKTAG('F', 'F', 'V', '1') },
+    { CODEC_ID_FFV2,         MKTAG('F', 'F', 'V', '2') },
     { CODEC_ID_XAN_WC4,      MKTAG('X', 'x', 'a', 'n') },
     { CODEC_ID_MIMIC,        MKTAG('L', 'M', '2', '0') },
     { CODEC_ID_MSRLE,        MKTAG('m', 'r', 'l', 'e') },