Aside from those two things, it seems to work, and the example below gives matching results for the precalculated and the final (rendered) size of the bitmap.
// For every character in `sample`, predict the size of the LCD-rendered
// bitmap from the glyph outline's control box, then render the glyph and
// print both sizes so they can be compared.
for (char c : sample)
{
    // Convert through unsigned char so that a negative `char` value is not
    // sign-extended into a huge character code.
    const FT_ULong char_code =
        static_cast<FT_ULong>(static_cast<unsigned char>(c));

    // First pass: load the glyph without rendering, to inspect its outline.
    if (FT_Load_Char(face, char_code, FT_LOAD_TARGET_LCD) != 0)
    {
        std::cerr << "FT_Load_Char failed for character code "
                  << char_code << '\n';
        continue;
    }

    FT_BBox cbox;
    FT_Outline_Get_CBox(&face->glyph->outline, &cbox);

    // Widen the box horizontally by 21 units on each side before snapping.
    // NOTE(review): presumably 21/64 of a pixel (about one LCD subpixel),
    // mirroring the padding FreeType applies for LCD rendering. Confirm
    // against the FreeType version in use.
    cbox.xMin -= 21;
    cbox.xMax += 21;

    // Snap the box outwards to whole-pixel boundaries.
    cbox.xMin = FT_PIX_FLOOR(cbox.xMin);
    cbox.yMin = FT_PIX_FLOOR(cbox.yMin);
    cbox.xMax = FT_PIX_CEIL(cbox.xMax);
    cbox.yMax = FT_PIX_CEIL(cbox.yMax);

    // Drop the 6 fractional bits to get whole pixels. The rendered LCD
    // bitmap stores three samples per pixel, which is why the rendered
    // width is divided by 3 below.
    const FT_ULong width  = static_cast<FT_ULong>(cbox.xMax - cbox.xMin) >> 6;
    const FT_ULong height = static_cast<FT_ULong>(cbox.yMax - cbox.yMin) >> 6;

    cout << "precalc: "
         << width << " "
         << height << endl;

    // Second pass: actually render, so the real bitmap size can be compared.
    if (FT_Load_Char(face, char_code, FT_LOAD_TARGET_LCD | FT_LOAD_RENDER) != 0)
    {
        std::cerr << "FT_Load_Char (render) failed for character code "
                  << char_code << '\n';
        continue;
    }

    cout << "rendered: "
         << (face->glyph->bitmap.width / 3) << " "
         << (face->glyph->bitmap.rows) << endl << endl;
}