CodeReading: EmacsLisp "Garbage Collection"
Kazuki Ohta, 2005/02/04
前回までで評価器のコア部分に一通り目を通した。しかし、まだ謎なままとなっているのはObjectの生成と消滅を司るGarbage Collectionの部分だ。ELispは数有るGarbage Collection方式の中から、Mark & Sweep方式を選択している。ここではこの方式についての知識が有る事を前提とする。GCの網羅的な情報は、Jones and Lins本に詳しいが、最新のアルゴリズムを知りたい場合は論文を当たった方が早いと思われる。参考URLリスト。 Emacs LispのGarbage Collectionの実態は、src/alloc.cで定義されているFgarbage_collect関数だ。
DEFUN ("garbage-collect", Fgarbage_collect, Sgarbage_collect, 0, 0, "",
doc: /* Reclaim storage for Lisp objects no longer needed.
Garbage collection happens automatically if you cons more than
`gc-cons-threshold' bytes of Lisp data since previous garbage collection.
`garbage-collect' normally returns a list with info on amount of space in use:
((USED-CONSES . FREE-CONSES) (USED-SYMS . FREE-SYMS)
(USED-MARKERS . FREE-MARKERS) USED-STRING-CHARS USED-VECTOR-SLOTS
(USED-FLOATS . FREE-FLOATS) (USED-INTERVALS . FREE-INTERVALS)
(USED-STRINGS . FREE-STRINGS))
However, if there was overflow in pure space, `garbage-collect'
returns nil, because real GC can't be done. */)
()
{
ほげほげ;
}
{
/* mark phase */
for (i = 0; i < staticidx; i++)
mark_object (*staticvec[i]);
mark_stack ();
/* sweep phase */
gc_sweep();
}
という訳でstaticvec, mark_stack, gc_sweepの順番で見て行く事にしよう。まずはstaticvecだが、これはsrc/alloc.cで以下のように定義されている。
#define NSTATICS 1280
Lisp_Object *staticvec[NSTATICS] = {&Vpurify_flag};
/***********************************************************************
Protection from GC
***********************************************************************/
/* Put an entry in staticvec, pointing at the variable with address
VARADDRESS. */
void
staticpro (varaddress)
Lisp_Object *varaddress;
{
staticvec[staticidx++] = varaddress;
if (staticidx >= NSTATICS)
abort ();
}
void
syms_of_data
{
...
staticpro (&Qlambda);
...
}
/* Mark live Lisp objects on the C stack.
There are several system-dependent problems to consider when
porting this to new architectures:
Processor Registers
We have to mark Lisp objects in CPU registers that can hold local
variables or are used to pass parameters.
If GC_SAVE_REGISTERS_ON_STACK is defined, it should expand to
something that either saves relevant registers on the stack, or
calls mark_maybe_object passing it each register's contents.
If GC_SAVE_REGISTERS_ON_STACK is not defined, the current
implementation assumes that calling setjmp saves registers we need
to see in a jmp_buf which itself lies on the stack. This doesn't
have to be true! It must be verified for each system, possibly
by taking a look at the source code of setjmp.
Stack Layout
Architectures differ in the way their processor stack is organized.
For example, the stack might look like this
+----------------+
| Lisp_Object | size = 4
+----------------+
| something else | size = 2
+----------------+
| Lisp_Object | size = 4
+----------------+
| ... |
In such a case, not every Lisp_Object will be aligned equally. To
find all Lisp_Object on the stack it won't be sufficient to walk
the stack in steps of 4 bytes. Instead, two passes will be
necessary, one starting at the start of the stack, and a second
pass starting at the start of the stack + 2. Likewise, if the
minimal alignment of Lisp_Objects on the stack is 1, four passes
would be necessary, each one starting with one byte more offset
from the stack start.
The current code assumes by default that Lisp_Objects are aligned
equally on the stack. */
Lisp_Object
func ()
{
Lisp_Object obj1 = XCONS(Qnil, Qnil);
Lisp_Object obj2 = XCONS(Qnil, Qnil);
return obj1;
}
C stackのアドレスはどうやって得るのだろうか?それはstack_base変数を見れば分かる。stack_baseはsrc/alloc.cで定義されており、src/emacs.cのmain関数で以下のようにして使われている。
/* ARGSUSED */
int
main (argc, argv
#ifdef VMS
, envp
#endif
)
int argc;
char **argv;
#ifdef VMS
char **envp;
#endif
{
#if GC_MARK_STACK
Lisp_Object dummy;
#endif
...
#if GC_MARK_STACK
extern Lisp_Object *stack_base;
stack_base = &dummy;
#endif
...
}
static void
mark_stack ()
{
int i;
jmp_buf j;
volatile int stack_grows_down_p = (char *) &j > (char *) stack_base;
void *end;
/* This trick flushes the register windows so that all the state of
the process is contained in the stack. */
/* Fixme: Code in the Boehm GC suggests flushing (with `flushrs') is
needed on ia64 too. See mach_dep.c, where it also says inline
assembler doesn't work with relevant proprietary compilers. */
#ifdef sparc
asm ("ta 3");
#endif
/* Save registers that we need to see on the stack. We need to see
registers used to hold register variables and registers used to
pass parameters. */
#ifdef GC_SAVE_REGISTERS_ON_STACK
GC_SAVE_REGISTERS_ON_STACK (end);
#else /* not GC_SAVE_REGISTERS_ON_STACK */
#ifndef GC_SETJMP_WORKS /* If it hasn't been checked yet that
setjmp will definitely work, test it
and print a message with the result
of the test. */
if (!setjmp_tested_p)
{
setjmp_tested_p = 1;
test_setjmp ();
}
#endif /* GC_SETJMP_WORKS */
setjmp (j);
end = stack_grows_down_p ? (char *) &j + sizeof j : (char *) &j;
#endif /* not GC_SAVE_REGISTERS_ON_STACK */
/* This assumes that the stack is a contiguous region in memory. If
that's not the case, something has to be done here to iterate
over the stack segments. */
#ifndef GC_LISP_OBJECT_ALIGNMENT
#ifdef __GNUC__
#define GC_LISP_OBJECT_ALIGNMENT __alignof__ (Lisp_Object)
#else
#define GC_LISP_OBJECT_ALIGNMENT sizeof (Lisp_Object)
#endif
#endif
for (i = 0; i < sizeof (Lisp_Object); i += GC_LISP_OBJECT_ALIGNMENT)
mark_memory ((char *) stack_base + i, end);
/* Allow for marking a secondary stack, like the register stack on the
ia64. */
#ifdef GC_MARK_SECONDARY_STACK
GC_MARK_SECONDARY_STACK ();
#endif
#if GC_MARK_STACK == GC_MARK_STACK_CHECK_GCPROS
check_gcpros ();
#endif
}
/* Mark Lisp objects referenced from the address range START..END. */
static void
mark_memory (start, end)
void *start, *end;
{
Lisp_Object *p;
void **pp;
#if GC_MARK_STACK == GC_USE_GCPROS_CHECK_ZOMBIES
nzombies = 0;
#endif
/* Make START the pointer to the start of the memory region,
if it isn't already. */
if (end < start)
{
void *tem = start;
start = end;
end = tem;
}
/* Mark Lisp_Objects. */
for (p = (Lisp_Object *) start; (void *) p < end; ++p)
mark_maybe_object (*p);
/* Mark Lisp data pointed to. This is necessary because, in some
situations, the C compiler optimizes Lisp objects away, so that
only a pointer to them remains. Example:
DEFUN ("testme", Ftestme, Stestme, 0, 0, 0, "")
()
{
Lisp_Object obj = build_string ("test");
struct Lisp_String *s = XSTRING (obj);
Fgarbage_collect ();
fprintf (stderr, "test `%s'\n", s->data);
return Qnil;
}
Here, `obj' isn't really used, and the compiler optimizes it
away. The only reference to the life string is through the
pointer `s'. */
for (pp = (void **) start; (void *) pp < end; ++pp)
mark_maybe_pointer (*pp);
}
/* If P points to Lisp data, mark that as live if it isn't already
marked. */
static INLINE void
mark_maybe_pointer (p)
void *p;
{
struct mem_node *m;
/* Quickly rule out some values which can't point to Lisp data. We
assume that Lisp data is aligned on even addresses. */
if ((EMACS_INT) p & 1)
return;
m = mem_find (p);
if (m != MEM_NIL)
{
Lisp_Object obj = Qnil;
switch (m->type)
{
case MEM_TYPE_NON_LISP:
/* Nothing to do; not a pointer to Lisp memory. */
break;
case MEM_TYPE_BUFFER:
if (live_buffer_p (m, p) && !VECTOR_MARKED_P((struct buffer *)p))
XSETVECTOR (obj, p);
break;
case MEM_TYPE_CONS:
if (live_cons_p (m, p) && !CONS_MARKED_P ((struct Lisp_Cons *) p))
XSETCONS (obj, p);
break;
case MEM_TYPE_STRING:
if (live_string_p (m, p)
&& !STRING_MARKED_P ((struct Lisp_String *) p))
XSETSTRING (obj, p);
break;
case MEM_TYPE_MISC:
if (live_misc_p (m, p) && !((struct Lisp_Free *) p)->gcmarkbit)
XSETMISC (obj, p);
break;
case MEM_TYPE_SYMBOL:
if (live_symbol_p (m, p) && !((struct Lisp_Symbol *) p)->gcmarkbit)
XSETSYMBOL (obj, p);
break;
case MEM_TYPE_FLOAT:
if (live_float_p (m, p) && !FLOAT_MARKED_P (p))
XSETFLOAT (obj, p);
break;
case MEM_TYPE_VECTOR:
case MEM_TYPE_PROCESS:
case MEM_TYPE_HASH_TABLE:
case MEM_TYPE_FRAME:
case MEM_TYPE_WINDOW:
if (live_vector_p (m, p))
{
Lisp_Object tem;
XSETVECTOR (tem, p);
if (!GC_SUBRP (tem) && !VECTOR_MARKED_P (XVECTOR (tem)))
obj = tem;
}
break;
default:
abort ();
}
if (!GC_NILP (obj))
mark_object (obj);
}
}
src/alloc.cのgc_sweep関数では、マークされていないオブジェクトを型毎にfree_listに繋げるという処理を行っている。例えばCons Cellの場合はこんな感じ。
/* Sweep: find all structures not marked, and free them. */
static void
gc_sweep ()
{
...
/* Put all unmarked conses on free list */
{
register struct cons_block *cblk;
struct cons_block **cprev = &cons_block;
register int lim = cons_block_index;
register int num_free = 0, num_used = 0;
cons_free_list = 0;
for (cblk = cons_block; cblk; cblk = *cprev)
{
register int i;
int this_free = 0;
for (i = 0; i < lim; i++)
if (!CONS_MARKED_P (&cblk->conses[i]))
{
this_free++;
cblk->conses[i].u.chain = cons_free_list;
cons_free_list = &cblk->conses[i];
#if GC_MARK_STACK
cons_free_list->car = Vdead;
#endif
}
else
{
num_used++;
CONS_UNMARK (&cblk->conses[i]);
}
lim = CONS_BLOCK_SIZE;
/* If this block contains only free conses and we have already
seen more than two blocks worth of free conses then deallocate
this block. */
if (this_free == CONS_BLOCK_SIZE && num_free > CONS_BLOCK_SIZE)
{
*cprev = cblk->next;
/* Unhook from the free list. */
cons_free_list = cblk->conses[0].u.chain;
lisp_align_free (cblk);
n_cons_blocks--;
}
else
{
num_free += this_free;
cprev = &cblk->next;
}
}
total_conses = num_used;
total_free_conses = num_free;
}
...
}
さて、これで一応Garbage Collectionの部分も理解出来た事にしよう。GCのコードは一回読んでおけば同じアルゴリズムを使用しているコードは大体一緒なので、大変為になる。実際SIODというScheme処理系のGC部分のコードを読んだことがある(mark & sweepを使用)のだが、EmacsのGCのコードと構造が酷似している。さらにRubyも。色々な処理系のGCのコードを見てみるのも面白いかも。
[ return ]