diff --git a/data.go b/data.go
index 4e6c52869f686bea717bfc1eee2874af4eeb9379..c9412413f58d32e4d946c4890b3148e9deb86992 100644
--- a/data.go
+++ b/data.go
@@ -45,7 +45,7 @@ func (td *TrianglesData) SetLen(len int) {
 				Color     RGBA
 				Picture   Vec
 				Intensity float64
-			}{ZV, Alpha(1), ZV, 0})
+			}{Color: RGBA{1, 1, 1, 1}})
 		}
 	}
 	if len < td.Len() {
diff --git a/geometry.go b/geometry.go
index f9348398a5e580db38b180683cea2fb708829623..928c9c5f089573b17da46d2757704845654f0496 100644
--- a/geometry.go
+++ b/geometry.go
@@ -3,8 +3,6 @@ package pixel
 import (
 	"fmt"
 	"math"
-
-	"github.com/go-gl/mathgl/mgl64"
 )
 
 // Vec is a 2D vector type with X and Y coordinates.
@@ -251,7 +249,7 @@ func (r Rect) Union(s Rect) Rect {
 	)
 }
 
-// Matrix is a 3x3 transformation matrix that can be used for all kinds of spacial transforms, such
+// Matrix is a 2x3 affine matrix that can be used for all kinds of spatial transforms, such
 // as movement, scaling and rotations.
 //
 // Matrix has a handful of useful methods, each of which adds a transformation to the matrix. For
@@ -261,38 +259,41 @@ func (r Rect) Union(s Rect) Rect {
 //
 // This code creates a Matrix that first moves everything by 100 units horizontally and 200 units
 // vertically and then rotates everything by 90 degrees around the origin.
-type Matrix [9]float64
+//
+// Layout is:
+// [0] [2] [4]
+// [1] [3] [5]
+//  0   0   1  [implicit row]
+type Matrix [6]float64
 
 // IM stands for identity matrix. Does nothing, no transformation.
-var IM = Matrix(mgl64.Ident3())
+var IM = Matrix{1, 0, 0, 1, 0, 0}
 
 // String returns a string representation of the Matrix.
 //
 //   m := pixel.IM
-//   fmt.Println(m) // Matrix(1 0 0 | 0 1 0 | 0 0 1)
+//   fmt.Println(m) // Matrix(1 0 0 | 0 1 0)
 func (m Matrix) String() string {
 	return fmt.Sprintf(
-		"Matrix(%v %v %v | %v %v %v | %v %v %v)",
-		m[0], m[3], m[6],
-		m[1], m[4], m[7],
-		m[2], m[5], m[8],
+		"Matrix(%v %v %v | %v %v %v)",
+		m[0], m[2], m[4],
+		m[1], m[3], m[5],
 	)
 }
 
 // Moved moves everything by the delta vector.
 func (m Matrix) Moved(delta Vec) Matrix {
-	m3 := mgl64.Mat3(m)
-	m3 = mgl64.Translate2D(delta.XY()).Mul3(m3)
-	return Matrix(m3)
+	m[4], m[5] = m[4]+delta.X, m[5]+delta.Y
+	return m
 }
 
 // ScaledXY scales everything around a given point by the scale factor in each axis respectively.
 func (m Matrix) ScaledXY(around Vec, scale Vec) Matrix {
-	m3 := mgl64.Mat3(m)
-	m3 = mgl64.Translate2D(around.Scaled(-1).XY()).Mul3(m3)
-	m3 = mgl64.Scale2D(scale.XY()).Mul3(m3)
-	m3 = mgl64.Translate2D(around.XY()).Mul3(m3)
-	return Matrix(m3)
+	m[4], m[5] = m[4]-around.X, m[5]-around.Y
+	m[0], m[2], m[4] = m[0]*scale.X, m[2]*scale.X, m[4]*scale.X
+	m[1], m[3], m[5] = m[1]*scale.Y, m[3]*scale.Y, m[5]*scale.Y
+	m[4], m[5] = m[4]+around.X, m[5]+around.Y
+	return m
 }
 
 // Scaled scales everything around a given point by the scale factor.
@@ -302,36 +303,44 @@ func (m Matrix) Scaled(around Vec, scale float64) Matrix {
 
 // Rotated rotates everything around a given point by the given angle in radians.
 func (m Matrix) Rotated(around Vec, angle float64) Matrix {
-	m3 := mgl64.Mat3(m)
-	m3 = mgl64.Translate2D(around.Scaled(-1).XY()).Mul3(m3)
-	m3 = mgl64.Rotate3DZ(angle).Mul3(m3)
-	m3 = mgl64.Translate2D(around.XY()).Mul3(m3)
-	return Matrix(m3)
+	sint, cost := math.Sincos(angle)
+	m[4], m[5] = m[4]-around.X, m[5]-around.Y
+	m = m.Chained(Matrix{cost, sint, -sint, cost, 0, 0})
+	m[4], m[5] = m[4]+around.X, m[5]+around.Y
+	return m
 }
 
 // Chained adds another Matrix to this one. All tranformations by the next Matrix will be applied
 // after the transformations of this Matrix.
 func (m Matrix) Chained(next Matrix) Matrix {
-	m3 := mgl64.Mat3(m)
-	m3 = mgl64.Mat3(next).Mul3(m3)
-	return Matrix(m3)
+	return Matrix{ // the product next·m: m is applied first, next after it
+		next[0]*m[0] + next[2]*m[1],
+		next[1]*m[0] + next[3]*m[1],
+		next[0]*m[2] + next[2]*m[3],
+		next[1]*m[2] + next[3]*m[3],
+		next[0]*m[4] + next[2]*m[5] + next[4],
+		next[1]*m[4] + next[3]*m[5] + next[5],
+	}
 }
 
 // Project applies all transformations added to the Matrix to a vector u and returns the result.
 //
 // Time complexity is O(1).
 func (m Matrix) Project(u Vec) Vec {
-	m3 := mgl64.Mat3(m)
-	proj := m3.Mul3x1(mgl64.Vec3{u.X, u.Y, 1})
-	return V(proj.X(), proj.Y())
+	return Vec{X: m[0]*u.X + m[2]*u.Y + m[4], Y: m[1]*u.X + m[3]*u.Y + m[5]}
 }
 
 // Unproject does the inverse operation to Project.
 //
+// The inverse of m is applied directly: the translation part of m is
+// subtracted from u, and the result is multiplied by the inverse of the
+// top-left 2x2 block, which is its adjugate [m[3] -m[2]; -m[1] m[0]]
+// divided by its determinant. If m is singular the determinant is zero
+// and the returned coordinates are Inf or NaN.
+//
 // Time complexity is O(1).
 func (m Matrix) Unproject(u Vec) Vec {
-	m3 := mgl64.Mat3(m)
-	inv := m3.Inv()
-	unproj := inv.Mul3x1(mgl64.Vec3{u.X, u.Y, 1})
-	return V(unproj.X(), unproj.Y())
+	det := m[0]*m[3] - m[1]*m[2]
+	x, y := (u.X-m[4])/det, (u.Y-m[5])/det
+	return Vec{X: x*m[3] - y*m[2], Y: y*m[0] - x*m[1]}
 }
diff --git a/imdraw/imdraw.go b/imdraw/imdraw.go
index 626cb2f640b305a95cb40d25bb82cba202a4a86c..e5a8c95dbe3b0e3e75ff5a9c265c2ae3c3ed252c 100644
--- a/imdraw/imdraw.go
+++ b/imdraw/imdraw.go
@@ -52,6 +52,7 @@ type IMDraw struct {
 	EndShape  EndShape
 
 	points []point
+	pool   [][]point
 	matrix pixel.Matrix
 	mask   pixel.RGBA
 
@@ -109,7 +110,7 @@ func (imd *IMDraw) Clear() {
 //
 // This does not affect matrix and color mask set by SetMatrix and SetColorMask.
 func (imd *IMDraw) Reset() {
-	imd.points = nil
+	imd.points = imd.points[:0]
 	imd.Color = pixel.Alpha(1)
 	imd.Picture = pixel.ZV
 	imd.Intensity = 0
@@ -256,10 +257,22 @@ func (imd *IMDraw) EllipseArc(radius pixel.Vec, low, high, thickness float64) {
 
 func (imd *IMDraw) getAndClearPoints() []point {
 	points := imd.points
-	imd.points = nil
+	// use one of the existing pools so we don't reallocate as often
+	if len(imd.pool) > 0 {
+		pos := len(imd.pool) - 1
+		imd.points = imd.pool[pos]
+		imd.pool = imd.pool[0:pos]
+	} else {
+		imd.points = nil
+	}
 	return points
 }
 
+func (imd *IMDraw) restorePoints(points []point) {
+	imd.pool = append(imd.pool, imd.points[:0]) // pool it empty: a popped buffer becomes imd.points
+	imd.points = points[:0]
+}
+
 func (imd *IMDraw) applyMatrixAndMask(off int) {
 	for i := range (*imd.tri)[off:] {
 		(*imd.tri)[off+i].Position = imd.matrix.Project((*imd.tri)[off+i].Position)
@@ -271,6 +284,7 @@ func (imd *IMDraw) fillRectangle() {
 	points := imd.getAndClearPoints()
 
 	if len(points) < 2 {
+		imd.restorePoints(points)
 		return
 	}
 
@@ -302,12 +316,14 @@ func (imd *IMDraw) fillRectangle() {
 
 	imd.applyMatrixAndMask(off)
 	imd.batch.Dirty()
+	imd.restorePoints(points)
 }
 
 func (imd *IMDraw) outlineRectangle(thickness float64) {
 	points := imd.getAndClearPoints()
 
 	if len(points) < 2 {
+		imd.restorePoints(points)
 		return
 	}
 
@@ -323,12 +339,14 @@ func (imd *IMDraw) outlineRectangle(thickness float64) {
 		imd.pushPt(pixel.V(b.pos.X, a.pos.Y), mid)
 		imd.polyline(thickness, true)
 	}
+	imd.restorePoints(points)
 }
 
 func (imd *IMDraw) fillPolygon() {
 	points := imd.getAndClearPoints()
 
 	if len(points) < 3 {
+		imd.restorePoints(points)
 		return
 	}
 
@@ -336,16 +354,18 @@ func (imd *IMDraw) fillPolygon() {
 	imd.tri.SetLen(imd.tri.Len() + 3*(len(points)-2))
 
 	for i, j := 1, off; i+1 < len(points); i, j = i+1, j+3 {
-		for k, p := range []point{points[0], points[i], points[i+1]} {
-			(*imd.tri)[j+k].Position = p.pos
-			(*imd.tri)[j+k].Color = p.col
-			(*imd.tri)[j+k].Picture = p.pic
-			(*imd.tri)[j+k].Intensity = p.in
+		for k, p := range []int{0, i, i + 1} {
+			tri := &(*imd.tri)[j+k]
+			tri.Position = points[p].pos
+			tri.Color = points[p].col
+			tri.Picture = points[p].pic
+			tri.Intensity = points[p].in
 		}
 	}
 
 	imd.applyMatrixAndMask(off)
 	imd.batch.Dirty()
+	imd.restorePoints(points)
 }
 
 func (imd *IMDraw) fillEllipseArc(radius pixel.Vec, low, high float64) {
@@ -387,6 +407,7 @@ func (imd *IMDraw) fillEllipseArc(radius pixel.Vec, low, high float64) {
 		imd.applyMatrixAndMask(off)
 		imd.batch.Dirty()
 	}
+	imd.restorePoints(points)
 }
 
 func (imd *IMDraw) outlineEllipseArc(radius pixel.Vec, low, high, thickness float64, doEndShape bool) {
@@ -485,12 +506,14 @@ func (imd *IMDraw) outlineEllipseArc(radius pixel.Vec, low, high, thickness floa
 			}
 		}
 	}
+	imd.restorePoints(points)
 }
 
 func (imd *IMDraw) polyline(thickness float64, closed bool) {
 	points := imd.getAndClearPoints()
 
 	if len(points) == 0 {
+		imd.restorePoints(points)
 		return
 	}
 	if len(points) == 1 {
@@ -521,6 +544,8 @@ func (imd *IMDraw) polyline(thickness float64, closed bool) {
 	imd.pushPt(points[j].pos.Sub(normal), points[j])
 
 	// middle points
+	// compute "previous" normal:
+	ijNormal := points[1].pos.Sub(points[0].pos).Rotated(math.Pi / 2).Unit().Scaled(thickness / 2)
 	for i := 0; i < len(points); i++ {
 		j, k := i+1, i+2
 
@@ -536,7 +561,6 @@ func (imd *IMDraw) polyline(thickness float64, closed bool) {
 			k %= len(points)
 		}
 
-		ijNormal := points[j].pos.Sub(points[i].pos).Rotated(math.Pi / 2).Unit().Scaled(thickness / 2)
 		jkNormal := points[k].pos.Sub(points[j].pos).Rotated(math.Pi / 2).Unit().Scaled(thickness / 2)
 
 		orientation := 1.0
@@ -567,6 +591,8 @@ func (imd *IMDraw) polyline(thickness float64, closed bool) {
 			imd.pushPt(points[j].pos.Add(jkNormal), points[j])
 			imd.pushPt(points[j].pos.Sub(jkNormal), points[j])
 		}
+		// "next" normal becomes previous normal
+		ijNormal = jkNormal
 	}
 
 	// last point
@@ -591,4 +617,5 @@ func (imd *IMDraw) polyline(thickness float64, closed bool) {
 			imd.fillEllipseArc(pixel.V(thickness/2, thickness/2), normal.Angle(), normal.Angle()-math.Pi)
 		}
 	}
+	imd.restorePoints(points)
 }
diff --git a/pixelgl/canvas.go b/pixelgl/canvas.go
index a08835033d97d51f941cc2480b3c5727ed52bf02..b7b5cfc09e14c44b49ffab454a82b243b92452fc 100644
--- a/pixelgl/canvas.go
+++ b/pixelgl/canvas.go
@@ -90,9 +90,16 @@ func (c *Canvas) MakePicture(p pixel.Picture) pixel.TargetPicture {
 }
 
 // SetMatrix sets a Matrix that every point will be projected by.
+// pixel.Matrix has an implicit 0, 0, 1 last row; its six elements are copied
+// into c.mat as shown below. Elements 2, 5 and 8 of c.mat are left untouched,
+// since affine transformations never change that last row.
+//
+//	[0] [2] [4]    [0] [3] [6]
+//	[1] [3] [5] => [1] [4] [7]
+//	 0   0   1      0   0   1
 func (c *Canvas) SetMatrix(m pixel.Matrix) {
-	for i := range m {
-		c.mat[i] = float32(m[i])
+	for i, j := range [6]int{0, 1, 3, 4, 6, 7} {
+		c.mat[j] = float32(m[i])
 	}
 }
 
diff --git a/pixelgl/gltriangles.go b/pixelgl/gltriangles.go
index 64b7355d060845052c506ae524c53e7e23b6eb7b..bf7a8956c34b9656d795c4e2bd2556c16b3f8920 100644
--- a/pixelgl/gltriangles.go
+++ b/pixelgl/gltriangles.go
@@ -103,15 +103,17 @@ func (gt *GLTriangles) updateData(t pixel.Triangles) {
 				tx, ty = (*t)[i].Picture.XY()
 				in     = (*t)[i].Intensity
 			)
-			gt.data[i*gt.vs.Stride()+0] = float32(px)
-			gt.data[i*gt.vs.Stride()+1] = float32(py)
-			gt.data[i*gt.vs.Stride()+2] = float32(col.R)
-			gt.data[i*gt.vs.Stride()+3] = float32(col.G)
-			gt.data[i*gt.vs.Stride()+4] = float32(col.B)
-			gt.data[i*gt.vs.Stride()+5] = float32(col.A)
-			gt.data[i*gt.vs.Stride()+6] = float32(tx)
-			gt.data[i*gt.vs.Stride()+7] = float32(ty)
-			gt.data[i*gt.vs.Stride()+8] = float32(in)
+			s := gt.vs.Stride()
+			d := gt.data[i*s : i*s+9]
+			d[0] = float32(px)
+			d[1] = float32(py)
+			d[2] = float32(col.R)
+			d[3] = float32(col.G)
+			d[4] = float32(col.B)
+			d[5] = float32(col.A)
+			d[6] = float32(tx)
+			d[7] = float32(ty)
+			d[8] = float32(in)
 		}
 		return
 	}