diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index 93420db..fa1be61 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -119,6 +119,8 @@ OBJS-$(CONFIG_EIGHTSVX_FIB_DECODER)    += 8svx.o
 OBJS-$(CONFIG_ESCAPE124_DECODER)       += escape124.o
 OBJS-$(CONFIG_FFV1_DECODER)            += ffv1.o rangecoder.o
 OBJS-$(CONFIG_FFV1_ENCODER)            += ffv1.o rangecoder.o
+OBJS-$(CONFIG_FFV2_DECODER)            += ffv2.o rangecoder.o
+OBJS-$(CONFIG_FFV2_ENCODER)            += ffv2.o rangecoder.o
 OBJS-$(CONFIG_FFVHUFF_DECODER)         += huffyuv.o
 OBJS-$(CONFIG_FFVHUFF_ENCODER)         += huffyuv.o
 OBJS-$(CONFIG_FLAC_DECODER)            += flacdec.o flacdata.o flac.o
diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c
index 5dbf1dc..937e808 100644
--- a/libavcodec/allcodecs.c
+++ b/libavcodec/allcodecs.c
@@ -99,6 +99,7 @@ void avcodec_register_all(void)
     REGISTER_DECODER (EIGHTSVX_FIB, eightsvx_fib);
     REGISTER_DECODER (ESCAPE124, escape124);
     REGISTER_ENCDEC  (FFV1, ffv1);
+    REGISTER_ENCDEC  (FFV2, ffv2);
     REGISTER_ENCDEC  (FFVHUFF, ffvhuff);
     REGISTER_ENCDEC  (FLASHSV, flashsv);
     REGISTER_DECODER (FLIC, flic);
diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
index 43a0695..d472c20 100644
--- a/libavcodec/avcodec.h
+++ b/libavcodec/avcodec.h
@@ -97,6 +97,7 @@ enum CodecID {
     CODEC_ID_ASV1,
     CODEC_ID_ASV2,
     CODEC_ID_FFV1,
+    CODEC_ID_FFV2, // FIXME should go at the end, but that would force manual merges
     CODEC_ID_4XM,
     CODEC_ID_VCR1,
     CODEC_ID_CLJR,
diff --git a/libavcodec/ffv2.c b/libavcodec/ffv2.c
new file mode 100644
index 0000000..8b41d15
--- /dev/null
+++ b/libavcodec/ffv2.c
@@ -0,0 +1,728 @@
+/*
+ * Copyright (C) 2010 Loren Merritt
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define DEBUG
+#include "avcodec.h"
+#include "dsputil.h"
+#include "mathops.h"
+#include "mpegvideo.h"
+#include "rectangle.h"
+#include "ffv2dsp.c"
+
+#define VLC_BITS 11
+#define NUM_TABLES 19
+#define NUM_VLCS 1296
+
+#define VLC_COEF_BLOCK 0 // (0..7)*2
+#define VLC_COEF_ESCAPE 1 // (0..7)*2+1
+#define VLC_CBP 16
+#define VLC_MV 17
+#define VLC_MBTYPE 18
+
+typedef struct {
+    VLC vlc;
+    uint8_t len[NUM_VLCS];
+    uint32_t bits[NUM_VLCS];
+    uint32_t stats[NUM_VLCS];
+} VLCS;
+
+typedef struct FFV2Context FFV2Context;
+struct FFV2Context {
+    AVCodecContext *avctx;
+    DSPContext dsp;
+    GetBitContext gb;
+    PutBitContext pb;
+    uint8_t *bs, *bs_end;
+    VLCS vlcs[2][NUM_TABLES];
+    VLCS metavlc;
+    AVFrame fenc;
+    AVFrame fref;
+    int mb_width, mb_height;
+    int tstride;
+    uint8_t *temp[4];
+    uint8_t *mb_types; ///< intra: 0, inter: 1
+    int16_t (*mvs)[2]; ///< current row
+    int16_t (*mvs_top)[2]; ///< previous row
+    int16_t (*mvs_base)[2]; ///< buffer that the other mv arrays point into
+    int16_t (*mvps)[2];
+    int16_t (*mv_plane[2])[2]; ///< mvs chosen by motion est, not necessarily those that will be coded
+    int16_t (*mv_plane_base)[2];
+    int gop;
+    int initted_vlc[2];
+    int coder_type, coder_tree, coder_block;
+    int rd_bits; ///< ok so there's no D in RD, but it's the same concept
+    uint8_t quantize_block_context[37];
+};
+
+static uint8_t map_escape[251];
+static uint16_t map_coder3[81];
+static uint32_t map_coder6[1296];
+static uint8_t lut_block_sum[1296];
+
+static av_always_inline uint32_t pack8to32(int a, int b, int c, int d) {
+#if HAVE_BIGENDIAN
+    return (d&0xFF) + ((c&0xFF)<<8) + ((b&0xFF)<<16) + (a<<24);
+#else
+    return (a&0xFF) + ((b&0xFF)<<8) + ((c&0xFF)<<16) + (d<<24);
+#endif
+}
+
+static void lut_init(void) {
+    int i, j, k, l;
+    for(i=0; i<3; i++)
+    for(j=0; j<3; j++)
+    for(k=0; k<3; k++)
+    for(l=0; l<3; l++)
+        map_coder3[i*3*3*3+j*3*3+k*3+l] = (i<<6) + (j<<4) + (k<<2) + l;
+    for(i=0; i<6; i++)
+    for(j=0; j<6; j++)
+    for(k=0; k<6; k++)
+    for(l=0; l<6; l++) {
+        map_coder6[i*6*6*6+j*6*6+k*6+l] = (l==5) | (k==5)<<1 | (j==5)<<2 | (i==5)<<3 | pack8to32(l-2, k-2, j-2, i-2)<<4;
+        lut_block_sum[i*6*6*6+j*6*6+k*6+l] = abs(l-2) + abs(k-2) + abs(j-2) + abs(i-2);
+    }
+    for(i=0; i<251; i++)
+        map_escape[i] = i+3;
+}
+
+static void common_init(AVCodecContext *avctx) {
+    FFV2Context *s = avctx->priv_data;
+    memset(s, 0, sizeof(FFV2Context));
+    assert(!(avctx->flags&CODEC_FLAG_EMU_EDGE));
+    dsputil_init(&s->dsp, avctx);
+    s->avctx = avctx;
+    s->mb_width = (avctx->width+7)/8;
+    s->mb_height= (avctx->height+7)/8;
+    avctx->get_buffer(avctx, &s->fenc);
+    avctx->get_buffer(avctx, &s->fref);
+    s->tstride = s->fenc.linesize[0];
+    assert(s->tstride >= ((avctx->width+15)&~7));
+    s->temp[0] = av_malloc(s->tstride*10+16);
+    s->temp[1] = av_malloc(s->tstride*10+16);
+    s->temp[2] = av_malloc(s->tstride*10+16);
+    s->temp[3] = av_malloc(s->tstride*10+16);
+    s->mb_types = av_mallocz(s->mb_width+3);
+    s->mvs_base = av_mallocz(6*(s->mb_width+5)*sizeof(int16_t));
+    s->mvs = s->mvs_base+1;
+    s->mvs_top = s->mvs+s->mb_width+5;
+    s->mvps = s->mvs_top+s->mb_width+5;
+    memcpy(s->quantize_block_context, (uint8_t[]){0,0,0,1,1,2,2,3,3,4,4,4,5,5,5,5,5,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,7,7,7,7,7}, 37);
+}
+
+static int decode_init(AVCodecContext *avctx) {
+    avctx->pix_fmt= PIX_FMT_YUV420P;
+    common_init(avctx);
+    lut_init();
+    return 0;
+}
+
+static int encode_init(AVCodecContext *avctx) {
+    FFV2Context *s = avctx->priv_data;
+    int i, j, k;
+    common_init(avctx);
+    s->coder_type = avctx->coder_type;
+    if(s->coder_type == 0)
+        s->coder_type = 26;
+    s->coder_tree = s->coder_type >> 3;
+    s->coder_block = s->coder_type & 7;
+    if((s->coder_type > 3 && s->coder_type < 8) || s->coder_tree < 0 || s->coder_tree > 3 || s->coder_block > 4) {
+        av_log(avctx, AV_LOG_ERROR, "bad coder_type\n");
+        return -1;
+    }
+    s->mv_plane_base = av_mallocz(4*(s->mb_height+2)*(s->mb_width+2)*sizeof(int16_t));
+    s->mv_plane[0] = s->mv_plane_base+s->mb_width+3;
+    s->mv_plane[1] = s->mv_plane[0]+(s->mb_width+2)*(s->mb_height+2);
+    for(k=0; k<2; k++)
+    for(j=0; j<NUM_TABLES; j++)
+    for(i=0; i<NUM_VLCS; i++)
+        s->vlcs[k][j].stats[i] = 1;
+    return 0;
+}
+
+static void common_end(AVCodecContext *avctx) {
+    FFV2Context *s = avctx->priv_data;
+    avctx->release_buffer(avctx, &s->fenc);
+    avctx->release_buffer(avctx, &s->fref);
+    av_free(s->temp[0]);
+    av_free(s->temp[1]);
+    av_free(s->temp[2]);
+    av_free(s->temp[3]);
+    av_free(s->mb_types);
+    av_free(s->mvs_base);
+}
+
+static int decode_end(AVCodecContext *avctx) {
+    FFV2Context *s = avctx->priv_data;
+    int i, j;
+    for(j=0; j<2; j++)
+    for(i=0; i<NUM_TABLES; i++)
+        free_vlc(&s->vlcs[j][i].vlc);
+    free_vlc(&s->metavlc.vlc);
+    common_end(avctx);
+    return 0;
+}
+
+static int encode_end(AVCodecContext *avctx) {
+    FFV2Context *s = avctx->priv_data;
+    common_end(avctx);
+    av_free(s->mv_plane_base);
+    return 0;
+}
+
+static inline void put_vlc(FFV2Context *s, VLCS *vlcs, int v) {
+    put_bits(&s->pb, vlcs->len[v], vlcs->bits[v]);
+    vlcs->stats[v]++;
+}
+
+static inline void size_vlc(FFV2Context *s, VLCS *vlcs, int v) {
+    s->rd_bits += vlcs->len[v];
+}
+
+#define RDO_SKIP_BS 0
+#include "ffv2bitstream.c"
+#undef RDO_SKIP_BS
+#define RDO_SKIP_BS 1
+#include "ffv2bitstream.c"
+#undef RDO_SKIP_BS
+
+// FIXME code duplication from huffyuv.c
+// FIXME port optimizations back
+static int generate_bits_table(uint32_t *dst, uint8_t *len_table, int size){
+    int len, index;
+    uint32_t bits=1;
+
+    for(len=1; len<32 && ~bits; len++){
+        for(index=size-1; index>=0; index--){
+            if(len_table[index] == len)
+                dst[index] = bits--;
+        }
+        bits = bits*2+1;
+    }
+    if(~bits){
+        av_log(NULL, AV_LOG_ERROR, "Error generating huffman table\n");
+        return -1;
+    }
+    return 0;
+}
+
+typedef struct {
+    uint64_t val;
+    int name;
+} HeapElem;
+
+static void heap_sift(HeapElem *h, int root, int size)
+{
+    while(root*2+1 < size) {
+        int child = root*2+1;
+        if(child < size-1 && h[child].val > h[child+1].val)
+            child++;
+        if(h[root].val > h[child].val) {
+            FFSWAP(HeapElem, h[root], h[child]);
+            root = child;
+        } else
+            break;
+    }
+}
+
+static void generate_len_table(uint8_t *dst, uint32_t *stats, int size){
+    HeapElem h[size];
+    int up[2*size];
+    int len[2*size];
+    int offset, i, next;
+
+    for(offset=1; ; offset<<=1){
+        for(i=0; i<size; i++){
+            h[i].name = i;
+            h[i].val = (stats[i] << 8) + offset;
+        }
+        for(i=size/2-1; i>=0; i--)
+            heap_sift(h, i, size);
+
+        for(next=size; next<size*2-1; next++){
+            // merge the two smallest entries, and put it back in the heap
+            uint64_t min1v = h[0].val;
+            up[h[0].name] = next;
+            h[0].val = INT64_MAX;
+            heap_sift(h, 0, size);
+            up[h[0].name] = next;
+            h[0].name = next;
+            h[0].val += min1v;
+            heap_sift(h, 0, size);
+        }
+
+        len[2*size-2] = 0;
+        for(i=2*size-3; i>=size; i--)
+            len[i] = len[up[i]] + 1;
+        for(i=0; i<size; i++) {
+            dst[i] = len[up[i]] + 1;
+            if(dst[i] >= 32) break;
+        }
+        if(i==size) break;
+    }
+}
+
+static int read_table(FFV2Context *s, VLCS* vlcs, int size, void *map, int map_elem, int use_metavlc) {
+    if(!s->fenc.key_frame && !get_bits1(&s->gb))
+        return 0;
+    if(read_len_table(s, vlcs->len, size, use_metavlc))
+        return -1;
+    if(generate_bits_table(vlcs->bits, vlcs->len, size))
+        return -1;
+    free_vlc(&vlcs->vlc);
+    init_vlc_sparse(&vlcs->vlc, VLC_BITS, size, vlcs->len, 1, 1, vlcs->bits, 4, 4, map, map_elem, map_elem, 0);
+    return 0;
+}
+
+static void write_table(FFV2Context *s, VLCS* vlcs, int size, int
use_metavlc) { + int i; + if(s->fenc.key_frame) { + generate_len_table(vlcs->len, vlcs->stats, size); + } else { + uint8_t len[NUM_VLCS]; + int entropy_old=0, entropy_new=0; + int present=0; + if(s->gop<4 || !(s->gop&1)) { // skip on some frames to save encoder time + generate_len_table(len, vlcs->stats, size); + for(i=0; istats[i] * vlcs->len[i]; + entropy_new += vlcs->stats[i] * len[i]; + } + s->rd_bits = 0; + size_len_table(s, len, size, use_metavlc); + entropy_new += s->rd_bits; + entropy_new += size; // arbitrary penalty for cputime and possibility of future stats mismatch + present = entropy_old - entropy_new > 0; + } + put_bits(&s->pb, 1, present); + if(!present) return; + memcpy(vlcs->len, len, size); + } + generate_bits_table(vlcs->bits, vlcs->len, size); + write_len_table(s, vlcs->len, size, use_metavlc); + for(i=0; istats[i] = (vlcs->stats[i]+1)>>1; +} + +#define proc_table_sparse(id, size, map, map_elem) {\ + if(read) err |= read_table(s, &s->vlcs[plane][id], size, map, map_elem, 1);\ + else write_table(s, &s->vlcs[plane][id], size, 1);\ +} +#define proc_table(id, size) proc_table_sparse(id, size, NULL, 0) + +static int proc_tables(FFV2Context *s, int read, int plane) { + int err = 0; + int i; + if(s->fenc.key_frame && plane==0) { + if(read) err |= read_table(s, &s->metavlc, 256, NULL, 0, 0); + else write_table(s, &s->metavlc, 256, 0); + } + for(i=0; i<8; i++) + proc_table(i*2+VLC_COEF_BLOCK, 1296); + for(i=0; i<8; i++) + proc_table_sparse(i*2+VLC_COEF_ESCAPE, 251, map_escape, sizeof(*map_escape)); + proc_table(VLC_CBP, 256); + proc_table(VLC_MV, 256); + proc_table_sparse(VLC_MBTYPE, 81, map_coder3, sizeof(*map_coder3)); + return err; +} + +static void read_lines(FFV2Context *s, uint8_t *buf, uint8_t *sums, int width, int height, int plane) { + int x, y; + int sstride = s->tstride>>1; + for(x=0; xvlcs[plane], buf+x+y*s->tstride, sums+(x>>1)+(y>>1)*sstride); + memcpy(sums-sstride, sums+sstride*3, sstride); +} + +static void write_lines(FFV2Context *s, uint8_t *buf, int width, int height, int plane) { + int x, y; + for(x=0; xvlcs[plane], buf+x+y*s->tstride); +} + +static int size_mb(FFV2Context *s, uint8_t *buf, int type, int mvdx, int mvdy, int plane) { + s->rd_bits = 0; + size_tree(s, s->vlcs[plane], buf); + size_tree(s, s->vlcs[plane], buf+4*s->tstride); + if(type) + size_mvd(s, mvdx, mvdy, plane); + return s->rd_bits; +} + +static void read_mvs(FFV2Context *s, int mb_width, int plane) { + int x, i; + for(i=0; igb, s->vlcs[plane][VLC_MBTYPE].vlc.table, VLC_BITS, 3); + for(x=i; x>=2) { + s->mvs[x][0] = mid_pred(s->mvs[x-1][0], s->mvs_top[x][0], s->mvs_top[x+1][0]); + s->mvs[x][1] = mid_pred(s->mvs[x-1][1], s->mvs_top[x][1], s->mvs_top[x+1][1]); + s->mb_types[x] = (v&3)>0; + if((v&3)==2) + read_mvd(s, s->mvs[x], plane); + } + } +} + +static void write_mvs(FFV2Context *s, int mb_width, int plane) { + int x, i; + for(i=0; i=i; x--) { + v *= 3; + v += !s->mb_types[x] ? 0 : *(uint32_t*)s->mvs[x] == *(uint32_t*)s->mvps[x] ? 
1 : 2; + } + put_vlc(s, &s->vlcs[plane][VLC_MBTYPE], v); + for(x=i; xmvs[x][0]-s->mvps[x][0]; + int dy = s->mvs[x][1]-s->mvps[x][1]; + if(dx|dy) + write_mvd(s, dx, dy, plane); + } + } +} + +#define CHECK_MV(x, y) {\ + int cost = s->dsp.sad[1](NULL, src, ref+(x)+(y)*stride, stride, 8);\ + COPY3_IF_LT(bcost, cost, bmx, x, bmy, y);\ +} + +static void motion_search(FFV2Context *s, uint8_t *src, uint8_t *ref, int stride, int16_t (*mv_plane)[2], int mv_stride, int16_t *mv_out, int *mv_min, int *mv_max) { + int x, y, bmx=0, bmy=0, omx, omy; + int bcost = s->dsp.sad[1](NULL, src, ref, stride, 8); + for(y=-1; y<=1; y++) + for(x=-1; x<=1; x++) { + int mx = av_clip(mv_plane[x+y*mv_stride][0], mv_min[0]+1, mv_max[0]-1); + int my = av_clip(mv_plane[x+y*mv_stride][1], mv_min[1]+1, mv_max[1]-1); + if((mx|my) && ((mx-bmx)|(my-bmy))) + CHECK_MV(mx, my); + } + while(1) { + omx=bmx; omy=bmy; + CHECK_MV(omx-1, omy); + CHECK_MV(omx+1, omy); + CHECK_MV(omx, omy-1); + CHECK_MV(omx, omy+1); + if(bmx==omx && bmy==omy) + break; + if(bmx<=mv_min[0] || bmx>=mv_max[0] || bmy<=mv_min[1] || bmy>=mv_max[1]) + break; + } + mv_plane[0][0] = bmx; + mv_plane[0][1] = bmy; + if(bmx>mv_min[0] && bmxmv_min[1] && bmydsp.pix_abs[1][1](NULL, src, r-1, stride, 8); + COPY3_IF_LT(bcost, cost, bmx, omx-1, bmy, omy); + cost = s->dsp.pix_abs[1][1](NULL, src, r, stride, 8); + COPY3_IF_LT(bcost, cost, bmx, omx+1, bmy, omy); + cost = s->dsp.pix_abs[1][2](NULL, src, r-stride, stride, 8); + COPY3_IF_LT(bcost, cost, bmx, omx, bmy, omy-1); + cost = s->dsp.pix_abs[1][2](NULL, src, r, stride, 8); + COPY3_IF_LT(bcost, cost, bmx, omx, bmy, omy+1); + cost = s->dsp.pix_abs[1][3](NULL, src, r-stride-1, stride, 8); + COPY3_IF_LT(bcost, cost, bmx, omx-1, bmy, omy-1); + cost = s->dsp.pix_abs[1][3](NULL, src, r-stride, stride, 8); + COPY3_IF_LT(bcost, cost, bmx, omx+1, bmy, omy-1); + cost = s->dsp.pix_abs[1][3](NULL, src, r-1, stride, 8); + COPY3_IF_LT(bcost, cost, bmx, omx-1, bmy, omy+1); + cost = s->dsp.pix_abs[1][3](NULL, src, r, stride, 8); + COPY3_IF_LT(bcost, cost, bmx, omx+1, bmy, omy+1); + } + mv_out[0] = bmx; + mv_out[1] = bmy; +} + +static void defilter_lines(FFV2Context *s, uint8_t *dst, uint8_t *src, uint8_t *ref, int stride, int width, int height) { + int x, y, i; + for(y=0; y<8; y++) + *(uint16_t*)&dst[-2+y*stride] = dst[width+y*stride] = 0; + if(s->fenc.key_frame) { + for(y=0; ymb_types[x>>3]) { + // filter an intra block, with decoded samples as neighbors. + // then compute what would have been the inter residual, and write that back to the residual buffer. + add_ffv2_median_prediction(dst+x, src+x, stride, 8); + for(i=0; i<8; i++) + src[x+i] = dst[x+i] - ref[x+i]; + } else { + // filter an inter block, with inter residuals as neighbors, and add inter prediction. 
+ add_ffv2_median_prediction(src+x, src+x, stride, 8); + for(i=0; i<8; i++) + dst[x+i] = src[x+i] + ref[x+i]; + } + } + } + } +} + +static void enfilter_lines(FFV2Context *s, uint8_t *dst, uint8_t *src, int stride, int width, int height) { + int y; + for(y=0; y<8; y++) + *(uint16_t*)&src[-2+y*stride] = src[width+y*stride] = 0; + for(y=0; yfenc.linesize[plane]; + int width = s->avctx->width >> !!plane; + int height = s->avctx->height >> !!plane; + int mb_width = (width+7)>>3; + int mb_height = (height+7)>>3; + uint8_t *residual = s->temp[0] + 2*stride + 8; + uint8_t *pmc = s->temp[1] + 2*stride + 8; + uint8_t *sums = s->temp[2] + 2*stride + 8; + int x, y; + s->tstride = stride; + + init_get_bits(&s->gb, s->bs, (s->bs_end - s->bs)*8); + if(plane < 2) + if(proc_tables(s, 1, plane) < 0) + return -1; + + if(!s->fenc.key_frame) + s->dsp.draw_edges(s->fref.data[plane], stride, width, height, 8); + memset(residual-2*stride-8, 0, 10*stride+8); + memset(sums-stride-8, 0, 3*stride+8); // a bit overkill + memset(s->mvs_base, 0, 6*(s->mb_width+5)*sizeof(int16_t)); + memset(s->mb_types, 0, mb_width+3); + memset(s->fenc.data[plane]-2*stride, 0, 2*stride); + + for(y=0; yfenc.data[plane] + y*stride; + uint8_t *ref = s->fref.data[plane] + y*stride; + if(s->fenc.key_frame) { + read_lines(s, residual, sums, width, FFMIN(8,height-y), !!plane); + defilter_lines(s, dst, residual, NULL, stride, width, FFMIN(8,height-y)); + } else { + read_mvs(s, mb_width, !!plane); + for(x=0; xmvs[x][0], -16*x-16, (mb_width-x)*16); + mvy = av_clip(s->mvs[x][1], -2*y-16, mb_height*16-2*y); + s->dsp.put_pixels_tab[1][(mvx&1)+2*(mvy&1)](pmc+8*x, ref+8*x+(mvx>>1)+(mvy>>1)*stride, stride, 8); + } + read_lines(s, residual, sums, width, FFMIN(8,height-y), !!plane); + defilter_lines(s, dst, residual, pmc, stride, width, FFMIN(8,height-y)); + memcpy(residual-2*stride, residual+6*stride, 2*stride); + FFSWAP(void*, s->mvs, s->mvs_top); + } + } + + s->bs += (get_bits_count(&s->gb)+7)>>3; + return 0; +} + +static void encode_plane(FFV2Context *s, int plane) { + int stride = s->fenc.linesize[plane]; + int width = s->avctx->width >> !!plane; + int height = s->avctx->height >> !!plane; + int mb_width = (width+7)>>3; + int mb_height = (height+7)>>3; + int mv_stride = mb_width+2; + uint8_t *imed = s->temp[0] + 2*stride + 8; ///< median-filtered intra samples + uint8_t *pmc = s->temp[1] + 2*stride + 8; ///< inter prediction + uint8_t *pdiff = s->temp[2] + 2*stride + 8; ///< inter residual + uint8_t *pmed = s->temp[3] + 2*stride + 8; ///< median-filtered inter residual + int x, y; + s->tstride = stride; + + init_put_bits(&s->pb, s->bs, (s->bs_end - s->bs)*8); + if(plane < 2) + proc_tables(s, 0, plane); + + if(!s->fenc.key_frame) + s->dsp.draw_edges(s->fref.data[plane], stride, width, height, 8); + memset(imed-2*stride-8, 0, 10*stride+8); + memset(pdiff-2*stride-8, 0, 10*stride+8); + memset(s->mvs_base, 0, 6*(s->mb_width+5)*sizeof(int16_t)); + memset(s->mb_types, 0, mb_width+3); + memset(s->fenc.data[plane]-2*stride, 0, 2*stride); + + for(y=0; yfenc.data[plane] + y*stride; + uint8_t *ref = s->fref.data[plane] + y*stride; + enfilter_lines(s, imed, src, stride, width, FFMIN(8,height-y)); + if(!s->fenc.key_frame) { + int16_t (*mv_plane)[2] = s->mv_plane[!!plane] + (y>>3)*mv_stride; + int mvx, mvy, mvpx, mvpy, isad, psad, type; + int mv_min[2] = {0, -y-8}; + int mv_max[2] = {0, mb_height*8-y}; + for(x=0; xmvps[x][0] = mid_pred(s->mvs[x-1][0], s->mvs_top[x][0], s->mvs_top[x+1][0]); + mvpy = s->mvps[x][1] = mid_pred(s->mvs[x-1][1], s->mvs_top[x][1], 
s->mvs_top[x+1][1]); + motion_search(s, src+8*x, ref+8*x, stride, mv_plane, mv_stride, s->mvs[x], mv_min, mv_max); + mvx = s->mvs[x][0]; + mvy = s->mvs[x][1]; + s->dsp.put_pixels_tab[1][(mvx&1)+2*(mvy&1)](pmc+8*x, ref+8*x+(mvx>>1)+(mvy>>1)*stride, stride, 8); + sub_block(pdiff+8*x, src+8*x, pmc+8*x, stride); + if(x>1))+(mvy>>1)*stride, stride); + enfilter_pblock(s, pmed+8*x, pdiff+8*x, stride); + if(s->avctx->mb_decision) { + isad = size_mb(s, imed+8*x, 0, 0, 0, !!plane); + psad = size_mb(s, pmed+8*x, 1, mvx-mvpx, mvy-mvpy, !!plane); + } else { + isad = sum_abs_int8(imed+8*x, stride); + psad = sum_abs_int8(pmed+8*x, stride); + } + type = s->mb_types[x] = psad < isad; + if(!type) { + s->mvs[x][0] = mvpx; + s->mvs[x][1] = mvpy; + mvx = av_clip(mvpx, 2*mv_min[0], 2*mv_max[0]); + mvy = av_clip(mvpy, 2*mv_min[1], 2*mv_max[1]); + s->dsp.put_pixels_tab[1][(mvx&1)+2*(mvy&1)](pmc+8*x, ref+8*x+(mvx>>1)+(mvy>>1)*stride, stride, 8); + sub_block(pdiff+8*x, src+8*x, pmc+8*x, stride); + } + } + for(x=0; xmb_types[x]) // FIXME only needs to fixup the rightmost column + enfilter_pblock(s, imed+8*x, pdiff+8*x, stride); + write_mvs(s, mb_width, !!plane); + memcpy(pdiff-2*stride, pdiff+6*stride, 2*stride); + FFSWAP(void*, s->mvs, s->mvs_top); + } + write_lines(s, imed, width, FFMIN(8,height-y), !!plane); + memcpy(imed-2*stride, imed+6*stride, 2*stride); + } + + s->bs += (put_bits_count(&s->pb)+7)>>3; + flush_put_bits(&s->pb); +} + +static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, AVPacket *avpkt) { + uint8_t *buf = avpkt->data; + int buf_size = avpkt->size; + FFV2Context *s = avctx->priv_data; + int err = 0; + s->bs = buf; + s->bs_end = buf + buf_size; + + if(s->bs[0]) { + s->fenc.key_frame = 1; + s->fenc.pict_type = FF_I_TYPE; + s->coder_type = s->bs[0]; + s->coder_tree = s->coder_type >> 3; + s->coder_block = s->coder_type & 7; + } else { + FFSWAP(AVFrame, s->fenc, s->fref); + s->fenc.key_frame = 0; + s->fenc.pict_type = FF_P_TYPE; + } + s->bs++; + + err |= decode_plane(s, 0); + err |= decode_plane(s, 1); + err |= decode_plane(s, 2); + + *(AVFrame*)data = s->fenc; + *data_size = sizeof(s->fenc); + return err ? err : s->bs - buf; +} + +static int encode_frame(AVCodecContext *avctx, uint8_t *buf, int buf_size, void *data) { + FFV2Context *s = avctx->priv_data; + s->bs = buf; + s->bs_end = buf + buf_size; + + if(s->gop >= avctx->gop_size) + s->gop = 0; + if(s->gop) { + FFSWAP(AVFrame, s->fenc, s->fref); + } + s->fenc.key_frame = !s->gop; + s->fenc.pict_type = s->fenc.key_frame ? FF_I_TYPE : FF_P_TYPE; + *(s->bs++) = s->fenc.key_frame ? s->coder_type : 0; + + // FIXME skip in intra-only mode + av_picture_copy((AVPicture*)&s->fenc, (AVPicture*)data, PIX_FMT_YUV420P, avctx->width, avctx->height); + + encode_plane(s, 0); + encode_plane(s, 1); + encode_plane(s, 2); + + if(!s->initted_vlc[s->fenc.key_frame]) { + // re-encode with updated vlc + // FIXME do this whenever stats change enough? 
+ s->bs = buf+1; + encode_plane(s, 0); + encode_plane(s, 1); + encode_plane(s, 2); + s->initted_vlc[s->fenc.key_frame] = 1; + } + + s->gop++; + avctx->coded_frame = &s->fenc; + if(s->bs_end - s->bs < 4) + return -1; + return s->bs - buf; +} + +AVCodec ffv2_decoder = { + "ffv2", + CODEC_TYPE_VIDEO, + CODEC_ID_FFV2, + sizeof(FFV2Context), + decode_init, + NULL, + decode_end, + decode_frame, + CODEC_CAP_DR1, + NULL, +}; + +AVCodec ffv2_encoder = { + "ffv2", + CODEC_TYPE_VIDEO, + CODEC_ID_FFV2, + sizeof(FFV2Context), + encode_init, + encode_frame, + encode_end, + .pix_fmts = (enum PixelFormat[]){PIX_FMT_YUV420P}, +}; + diff --git a/libavcodec/ffv2bitstream.c b/libavcodec/ffv2bitstream.c new file mode 100644 index 0000000..f9a228c --- /dev/null +++ b/libavcodec/ffv2bitstream.c @@ -0,0 +1,205 @@ +/* + * Copyright (C) 2010 Loren Merritt + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +// This file is compiled twice, to provide the bitstream writer and +// corresponding functions that just count how many bits would be written. +// The reader doesn't need to be duplicated, but is here just so that it's +// next to the writer. + +#if RDO_SKIP_BS +#define put_bits(pb,n,x) s->rd_bits += n +#define put_vlc size_vlc +#define write_len_table size_len_table +#define write_mvd size_mvd +#define write_block size_block +#define write_tree size_tree +#endif + +#if !RDO_SKIP_BS +// TODO: gradient prediction? predict from the previous contents of the same table? +static int read_len_table(FFV2Context *s, uint8_t *len, int size, int use_metavlc){ + int val=1, repeat=-1, code, i; + + for(i=0; igb, s->metavlc.vlc.table, VLC_BITS, 3) : get_bits(&s->gb, 8); + repeat = code >> 5; + val = (val+code) & 31; + if(repeat==0) + repeat = get_bits(&s->gb, 8) + 8; + while(repeat-- && i0 && repeat<264 && repeat>0); + + code = (val - prev_val) & 31; + prev_val = val; + if(repeat<8) + code |= repeat<<5; + if(use_metavlc) + put_vlc(s, &s->metavlc, code); + else + put_bits(&s->pb, 8, code); + if(repeat>=8) + put_bits(&s->pb, 8, repeat-8); + } +} + +#if !RDO_SKIP_BS +static void read_mvd(FFV2Context *s, int16_t *mv, int plane) { + VLCS *vlcs = &s->vlcs[plane][VLC_MV]; + int v = get_vlc2(&s->gb, vlcs->vlc.table, VLC_BITS, 3); + int d; + mv[0] += (v&15)<15 ? (v&15)-7 + : (d=(int8_t)get_bits(&s->gb, 8)) != -128 ? d + : (int16_t)get_bits(&s->gb, 16); + v >>= 4; + mv[1] += (v&15)<15 ? (v&15)-7 + : (d=(int8_t)get_bits(&s->gb, 8)) != -128 ? 
d + : (int16_t)get_bits(&s->gb, 16); +} +#endif //!RDO_SKIP_BS + +static void write_mvd(FFV2Context *s, int mvdx, int mvdy, int plane) { + VLCS *vlcs = &s->vlcs[plane][VLC_MV]; + int x = FFMIN(15, (unsigned)(mvdx+7)); + int y = FFMIN(15, (unsigned)(mvdy+7)); + int v = x+(y<<4); + put_vlc(s, vlcs, v); + if(x==15) { + if((unsigned)(mvdx+127) < 255) + put_bits(&s->pb, 8, mvdx&0xff); + else + put_bits(&s->pb, 24, (128<<16)+(mvdx&0xffff)); + } + if(y==15) { + if((unsigned)(mvdy+127) < 255) + put_bits(&s->pb, 8, mvdy&0xff); + else + put_bits(&s->pb, 24, (128<<16)+(mvdy&0xffff)); + } +} + +#if !RDO_SKIP_BS +static int block_context(int8_t *residual, int stride) { + // TODO simd or lut + int sum = 0; + int x,y; + for(y=-2; y<0; y++) + for(x=-2; x<2; x++) + sum += FFMIN(3,abs(residual[x+y*stride])); + for(y=0; y<2; y++) + for(x=-2; x<0; x++) + sum += FFMIN(3,abs(residual[x+y*stride])); + return sum; +} + +static av_always_inline void read_escape(FFV2Context *s, VLCS *vlcs, int8_t *residual, int flags) { + if(flags&1) residual[0] = get_vlc2(&s->gb, vlcs[VLC_COEF_ESCAPE].vlc.table, VLC_BITS, 3); + if(flags&2) residual[1] = get_vlc2(&s->gb, vlcs[VLC_COEF_ESCAPE].vlc.table, VLC_BITS, 3); + if(flags&4) residual[s->tstride] = get_vlc2(&s->gb, vlcs[VLC_COEF_ESCAPE].vlc.table, VLC_BITS, 3); + if(flags&8) residual[s->tstride+1] = get_vlc2(&s->gb, vlcs[VLC_COEF_ESCAPE].vlc.table, VLC_BITS, 3); +} + +#define DECL_READ_ESCAPE(flags) \ +static void read_escape##flags(FFV2Context *s, VLCS *vlcs, int8_t *residual) {\ + read_escape(s, vlcs, residual, flags);\ +} +#define NAME_READ_ESCAPE(flags) read_escape##flags, +#define OP16(op) op(0) op(1) op(2) op(3) op(4) op(5) op(6) op(7) op(8) op(9) op(10) op(11) op(12) op(13) op(14) op(15) +OP16(DECL_READ_ESCAPE) +static void (*read_escape_tab[16])(FFV2Context *s, VLCS *vlcs, int8_t *residual) = { OP16(NAME_READ_ESCAPE) }; + +static void read_block(FFV2Context *s, VLCS *vlcs, int8_t *residual, uint8_t *sums) { + int code, v, ctx; + ctx = sums[-1] + sums[-(s->tstride>>1)] + sums[-1-(s->tstride>>1)]; + ctx = s->quantize_block_context[ctx]; + vlcs += 2*ctx; + code = get_vlc2(&s->gb, vlcs[VLC_COEF_BLOCK].vlc.table, VLC_BITS, 3); + sums[0] = lut_block_sum[code]; + v = map_coder6[code]; + AV_WN16(residual, v>>4); + AV_WN16(residual+s->tstride, v>>20); + read_escape_tab[v&15](s, vlcs, residual); +} + +static void read_tree(FFV2Context *s, VLCS *vlcs, uint8_t *residual, uint8_t *sums) { + int cbp; + int bstride = s->tstride<<1; + int sstride = s->tstride>>1; + fill_rectangle(residual, 4, 4, sstride, 0, 2); + fill_rectangle(sums, 4, 2, sstride, 0, 1); + cbp = get_vlc2(&s->gb, vlcs[VLC_CBP].vlc.table, VLC_BITS, 3); + if(cbp&0x01) read_block(s, vlcs, residual+0, sums+0); + if(cbp&0x02) read_block(s, vlcs, residual+2, sums+1); + if(cbp&0x04) read_block(s, vlcs, residual+4, sums+2); + if(cbp&0x08) read_block(s, vlcs, residual+6, sums+3); + if(cbp&0x10) read_block(s, vlcs, residual+bstride+0, sums+sstride+0); + if(cbp&0x20) read_block(s, vlcs, residual+bstride+2, sums+sstride+1); + if(cbp&0x40) read_block(s, vlcs, residual+bstride+4, sums+sstride+2); + if(cbp&0x80) read_block(s, vlcs, residual+bstride+6, sums+sstride+3); +} +#endif //!RDO_SKIP_BS + +static void write_block(FFV2Context *s, VLCS *vlcs, uint8_t *residual) { + uint8_t a = 2+residual[0]; + uint8_t b = 2+residual[1]; + uint8_t c = 2+residual[s->tstride]; + uint8_t d = 2+residual[s->tstride+1]; + int v = FFMIN(a,5) + 6*FFMIN(b,5) + 36*FFMIN(c,5) + 216*FFMIN(d,5); + vlcs += 2*s->quantize_block_context[block_context(residual, 
s->tstride)]; + put_vlc(s, &vlcs[VLC_COEF_BLOCK], v); + if(a>=5) put_vlc(s, &vlcs[VLC_COEF_ESCAPE], a-5); + if(b>=5) put_vlc(s, &vlcs[VLC_COEF_ESCAPE], b-5); + if(c>=5) put_vlc(s, &vlcs[VLC_COEF_ESCAPE], c-5); + if(d>=5) put_vlc(s, &vlcs[VLC_COEF_ESCAPE], d-5); +} + +static void write_tree(FFV2Context *s, VLCS *vlcs, uint8_t *residual) { + int cbp, i; + cbp = cbp_8x4_raster(residual, s->tstride); + put_vlc(s, &vlcs[VLC_CBP], cbp); + for(i=0; cbp; i++, cbp>>=1) + if(cbp&1) + write_block(s, vlcs, residual+2*(i&3)+2*(i>>2)*s->tstride); +} + +#undef put_bits +#undef put_vlc +#undef write_len_table +#undef write_mvd +#undef write_block +#undef write_tree diff --git a/libavcodec/ffv2dsp.c b/libavcodec/ffv2dsp.c new file mode 100644 index 0000000..858799b --- /dev/null +++ b/libavcodec/ffv2dsp.c @@ -0,0 +1,216 @@ +/* + * Copyright (C) 2010 Loren Merritt + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +static int cbp_8x4_raster(uint8_t *res, int stride) { + int cbp; + cbp = !!(*(uint16_t*)(res+0) | *(uint16_t*)(res+stride+0)) << 0; + cbp |= !!(*(uint16_t*)(res+2) | *(uint16_t*)(res+stride+2)) << 1; + cbp |= !!(*(uint16_t*)(res+4) | *(uint16_t*)(res+stride+4)) << 2; + cbp |= !!(*(uint16_t*)(res+6) | *(uint16_t*)(res+stride+6)) << 3; + res += 2*stride; + cbp |= !!(*(uint16_t*)(res+0) | *(uint16_t*)(res+stride+0)) << 4; + cbp |= !!(*(uint16_t*)(res+2) | *(uint16_t*)(res+stride+2)) << 5; + cbp |= !!(*(uint16_t*)(res+4) | *(uint16_t*)(res+stride+4)) << 6; + cbp |= !!(*(uint16_t*)(res+6) | *(uint16_t*)(res+stride+6)) << 7; + return cbp; +} + +static int sum_abs_int8(int8_t *buf, intptr_t stride) { + int sum=0; +#if HAVE_MMX + DECLARE_ASM_CONST(8, uint64_t, pb_128) = 0x8080808080808080ULL; + __asm__ volatile( + "movq %4, %%mm7 \n" + "movq (%0), %%mm0 \n" + "movq (%0,%2), %%mm1 \n" + "movq (%0,%2,2), %%mm2 \n" + "movq (%0,%3), %%mm3 \n" + "lea (%0,%2,4), %0 \n" + "paddb %%mm7, %%mm0 \n" + "paddb %%mm7, %%mm1 \n" + "paddb %%mm7, %%mm2 \n" + "paddb %%mm7, %%mm3 \n" + "psadbw %%mm7, %%mm0 \n" + "psadbw %%mm7, %%mm1 \n" + "psadbw %%mm7, %%mm2 \n" + "psadbw %%mm7, %%mm3 \n" + "paddw %%mm2, %%mm0 \n" + "paddw %%mm3, %%mm1 \n" + "movq (%0), %%mm4 \n" + "movq (%0,%2), %%mm5 \n" + "movq (%0,%2,2), %%mm2 \n" + "movq (%0,%3), %%mm3 \n" + "paddb %%mm7, %%mm4 \n" + "paddb %%mm7, %%mm5 \n" + "paddb %%mm7, %%mm2 \n" + "paddb %%mm7, %%mm3 \n" + "psadbw %%mm7, %%mm4 \n" + "psadbw %%mm7, %%mm5 \n" + "psadbw %%mm7, %%mm2 \n" + "psadbw %%mm7, %%mm3 \n" + "paddw %%mm4, %%mm0 \n" + "paddw %%mm5, %%mm1 \n" + "paddw %%mm2, %%mm0 \n" + "paddw %%mm3, %%mm1 \n" + "paddw %%mm1, %%mm0 \n" + "movd %%mm0, %1 \n" + :"+&r"(buf), "=r"(sum) + :"r"(stride), "r"(stride*3), "m"(pb_128) + ); +#else + int x, y; + for(y=0; y<8; y++, buf+=stride) + for(x=0; x<8; x++) + sum += abs(buf[x]); +#endif + return sum; +} + +static void 
sub_block(uint8_t *dst, uint8_t *src, uint8_t *ref, intptr_t stride) { + __asm__ volatile( + "movq (%1), %%mm0 \n" + "movq (%1,%3), %%mm1 \n" + "movq (%1,%3,2), %%mm2 \n" + "movq (%1,%4), %%mm3 \n" + "psubb (%2), %%mm0 \n" + "psubb (%2,%3), %%mm1 \n" + "psubb (%2,%3,2), %%mm2 \n" + "psubb (%2,%4), %%mm3 \n" + "movq %%mm0, (%0) \n" + "movq %%mm1, (%0,%3) \n" + "movq %%mm2, (%0,%3,2) \n" + "movq %%mm3, (%0,%4) \n" + "lea (%0,%3,4), %0 \n" + "lea (%1,%3,4), %1 \n" + "lea (%2,%3,4), %2 \n" + "movq (%1), %%mm0 \n" + "movq (%1,%3), %%mm1 \n" + "movq (%1,%3,2), %%mm2 \n" + "movq (%1,%4), %%mm3 \n" + "psubb (%2), %%mm0 \n" + "psubb (%2,%3), %%mm1 \n" + "psubb (%2,%3,2), %%mm2 \n" + "psubb (%2,%4), %%mm3 \n" + "movq %%mm0, (%0) \n" + "movq %%mm1, (%0,%3) \n" + "movq %%mm2, (%0,%3,2) \n" + "movq %%mm3, (%0,%4) \n" + :"+&r"(dst), "+&r"(src), "+&r"(ref) + :"r"(stride), "r"(stride*3) + :"memory" + ); +} + +static inline int16_t median5(int a, int b, int c, int d, int e) { +#if 0 // C + if(b>c) FFSWAP(int, b, c); + if(c>d) FFSWAP(int, c, d); + if(a>b) FFSWAP(int, a, b); + if(b>c) FFSWAP(int, b, c); + if(d>e) d = e; + if(a>b) b = a; + if(c>d) c = d; + if(b>c) c = b; +} +#elif 0 // AMD + int i; +#define CMPXCHG(a,b)\ + "cmp "#b","#a"\n"\ + "mov "#b", %0 \n"\ + "cmovg "#a","#b"\n"\ + "cmovg %0 ,"#a"\n" + __asm__ volatile( + CMPXCHG(%2,%3) + CMPXCHG(%3,%4) + CMPXCHG(%1,%2) + CMPXCHG(%2,%3) + "cmp %5, %4 \n" + "cmovg %5, %4 \n" + "cmp %1, %2 \n" + "cmovl %1, %2 \n" + "cmp %4, %3 \n" + "cmovg %4, %3 \n" + "cmp %2, %3 \n" + "cmovl %2, %3 \n" + :"=&r"(i), "+&r"(a), "+&r"(b), "+&r"(c), "+&r"(d) + :"r"(e) + ); +#else // Intel, int16 +#define CMPXCHG(a,b)\ + "movq "#a", %%mm0 \n"\ + "pminsw "#b", "#a" \n"\ + "pmaxsw %%mm0, "#b" \n" + __asm__ volatile( + "movd %1, %%mm1 \n" + "movd %2, %%mm2 \n" + "movd %3, %%mm3 \n" + "movd %4, %%mm4 \n" + "movd %5, %%mm5 \n" + CMPXCHG(%%mm2, %%mm3) + CMPXCHG(%%mm3, %%mm4) + CMPXCHG(%%mm1, %%mm2) + CMPXCHG(%%mm2, %%mm3) + "pminsw %%mm5, %%mm4 \n" + "pmaxsw %%mm1, %%mm2 \n" + "pminsw %%mm4, %%mm3 \n" + "pmaxsw %%mm2, %%mm3 \n" + "movd %%mm3, %0 \n" + :"=r"(c) + :"r"(a), "r"(b), "r"(c), "r"(d), "r"(e) + ); +#endif +#undef CMPXCHG + return c; +} + + +// median(l, tr, l+t-tl) except with l subtracted off each element to minimize overflows + +static void add_ffv2_median_prediction(int8_t *dst, int8_t *residual, int stride, int width) { + int x; + int l = dst[-1]; + int tl = dst[-1-stride]; + int t = dst[-stride]; + int8_t *top = dst-2*stride; + for(x=0; x