package trainers
import (
"../genome"
"fmt"
"io"
"io/ioutil"
"os"
"malloc"
"strings"
"sort"
)
// init prints "Test" on package load.
// NOTE(review): looks like a leftover debug print — consider removing.
func init() { fmt.Print("Test") }
// GenePart is one slot of a GeneNode's 256-entry child table.
type GenePart struct {
DataValue byte // the byte value this slot represents (0..255)
HitNumber uint16 // times this byte has been seen at this node
Next *GeneNode // child node; populated once HitNumber reaches NextNodeBound
}
// GeneData is a collapsed leaf of the tree: the byte sequence leading
// from the root to the leaf, plus the leaf's hit count.
type GeneData struct {
DataValue []byte // byte chain from root to leaf
HitNumber uint16 // hit count of the leaf (truncated from the node's uint32)
}
// GeneNode is one level of the brute-force gene tree. Each node keeps
// a counter per possible next byte and grows a child for bytes that
// are hit often enough.
type GeneNode struct {
DataValue byte // the byte that led to this node
Depth uint8 // distance from the root (root is 0)
HitNumber uint32 // total hits recorded at this node
GeneParts [256]GenePart // one slot per possible next byte value
}
// BruteTreeTrainer builds a byte-frequency tree over training data and
// extracts the most frequent byte sequences as candidate genes.
type BruteTreeTrainer struct {
root *GeneNode // current gene tree; torn down and rebuilt when it grows too large
curHighest []GeneData // highest-hit sequences; appears unused in this file — TODO confirm
}
// NewBruteTreeTrainer returns a trainer with an empty result set and a
// freshly initialised gene tree.
func NewBruteTreeTrainer() *BruteTreeTrainer {
	trainer := &BruteTreeTrainer{}
	trainer.curHighest = make([]GeneData, 0)
	trainer.root = newTree()
	return trainer
}
// newTree allocates a fresh root GeneNode (Depth 0) and sets every
// GenePart's DataValue to its own index (0..255).
func newTree() *GeneNode {
	root := new(GeneNode)
	// Depth, DataValue and HitNumber are already zero from new().
	// Iterate by index: the original ranged by value, so it wrote to a
	// copy of each GenePart and the initialisation was silently lost.
	for index := range root.GeneParts {
		root.GeneParts[index].DataValue = uint8(index)
	}
	return root
}
// NextNodeBound is the hit count at which a GenePart grows a child node.
const NextNodeBound int = 2
// MaxDepth caps how deep the tree (and thus a gene sequence) may grow.
const MaxDepth int = 15
// contains reports whether slice holds the value i.
// The original returned the inverted result (false on a match, true
// when absent); no caller is visible in this file, so confirm none
// relied on the inversion.
func contains(i int, slice []int) bool {
	for _, cur := range slice {
		if cur == i {
			return true
		}
	}
	return false
}
// numLeafs reports how many leaf nodes live in the subtree rooted at
// gn. A node with no children counts as one leaf itself.
func (gn *GeneNode) numLeafs() (num int) {
	children := 0
	for i := range gn.GeneParts {
		next := gn.GeneParts[i].Next
		if next == nil {
			continue
		}
		children++
		num += next.numLeafs()
	}
	if children == 0 {
		num = 1
	}
	return
}
// numChildern reports the number of direct children of gn, i.e. how
// many GeneParts have a populated Next pointer.
// (Name kept as-is — callers in this file use the misspelling.)
func (gn *GeneNode) numChildern() (num int) {
	for i := range gn.GeneParts {
		if gn.GeneParts[i].Next != nil {
			num++
		}
	}
	return
}
// GeneDataSlice adapts []GeneData to sort.Interface (see Less/Swap/Len).
type GeneDataSlice []GeneData
// join concatenates right and left into a newly allocated slice and
// returns it; neither input is modified. The original copied element
// by element and pointlessly reassigned the right parameter before
// returning it.
func join(right, left []GeneData) []GeneData {
	newGeneData := make([]GeneData, len(right)+len(left))
	copy(newGeneData, right)
	copy(newGeneData[len(right):], left)
	return newGeneData
}
func (gn *GeneNode) collapseLeafs()([]GeneData){
//for each child
geneData := make([]GeneData,0)
//!fmt.Println("depth:",gn.Depth, "numChildern:", gn.numChildern
(),"numLeafs:", gn.numLeafs())
//if this is a leaf
//make a slice of the depth that this leaf is at
//put your data value at the end
//the the nodes data value at the next from the end
//return
if gn.numChildern() == 0 {
geneData = make([]GeneData,1)
geneData[0].HitNumber = uint16(gn.HitNumber)
geneData[0].DataValue = make([]byte,gn.Depth)
geneData[0].DataValue[gn.gettingDepth-1] = gn.DataValue
//!fmt.Println(" geneData:",geneData,"\n")
return geneData
}
//if this is a brach
//join all the leafs together
//add this data value to depths place in all of the slice
for _, child := range(gn.GeneParts){
//recurese this function down to its leaf
if(child.Next!=nil){
geneData = join(geneData,child.Next.collapseLeafs())
//! fmt.Println("cumlitive geneData:", geneData)
}
}
for _, curGeneData := range(geneData){
curGeneData.DataValue[gn.Depth] = gn.DataValue
}
///!fmt.Println("Final geneData:", geneData)
return geneData
}
func (gn *GeneNode) deleteLeafs(root *GeneNode){
for index, child := range(gn.GeneParts){
//recurese this function down to its leaf
if(child.Next!=nil){
child.Next.deleteLeafs(root)
//! fmt.Println("cumlitive geneData:", geneData)
gn.GeneParts[index].Next = root
}
malloc.Free(
}
return
}
// addNode creates a new child GeneNode under gn for byte value b, one
// level deeper than gn, and initialises the child's GeneParts so each
// slot records its own index as DataValue. (The stray "find the number
// of leafs" comment that sat here belonged to numLeafs.)
func (gn *GeneNode) addNode(b byte) {
	next := new(GeneNode)
	next.Depth = gn.Depth + 1
	next.DataValue = uint8(b)
	// Initialise by index: the original ranged by value, so the writes
	// landed on a copy and were silently discarded.
	for index := range next.GeneParts {
		next.GeneParts[index].DataValue = uint8(index)
	}
	gn.GeneParts[b].Next = next
}
//will take a byte to add to the tree then if it wants anouther byte
for the next node it returns true. if it returns false then it
//hit a leaf
func (gn *GeneNode) AddByte(b []byte) {
//find the next GenePart and add one to its hit number
//!fmt.Println("at:", b[0], " old hit number:", gn.GeneParts[b
[0]].HitNumber, "Depth:",gn.Depth)
gn.GeneParts[b[0]].HitNumber++
gn.HitNumber++
switch {
//if the hit number of a genePart is enugh to create a new node
case gn.GeneParts[b[0]].HitNumber == uint16(NextNodeBound):
//!fmt.Println(" adding node")
//add a node
gn.addNode(b[0])
//if it is greater tehn then Next node
case gn.GeneParts[b[0]].HitNumber > uint16(NextNodeBound):
//!fmt.Println(" calling Down")
//then call AddByte with the ngettingext byte on the next node if
there are bytes left
if cap(b) > 1 && gn.Depth < uint8(MaxDepth) {
gn.GeneParts[b[0]].Next.AddByte(b[1:2])
}
}
return
}
// Less orders greatest HitNumber first so the sort is descending.
// The original used >= (which is not a strict ordering, as sort.Sort
// requires of Less) and had an unreachable trailing return.
func (gd GeneDataSlice) Less(i, j int) bool {
	return gd[i].HitNumber > gd[j].HitNumber
}
// Swap exchanges the elements at indices i and j.
func (gd GeneDataSlice) Swap(i, j int) {
	gd[i], gd[j] = gd[j], gd[i]
}
// Len reports the number of elements, satisfying sort.Interface.
func (gd GeneDataSlice)Len()int{
return len(gd)
}
// Sort orders gd in place, greatest HitNumber first (see Less).
// The explicit sort.Interface conversion and the "s" progress print of
// the original were debugging leftovers; gd already satisfies the
// interface.
func (gd GeneDataSlice) Sort() {
	sort.Sort(gd)
}
//should consider changeing this to a io.Reader instead of a []byte to
deal with huge data sets (a movie for example)
func (bt *BruteTreeTrainer) Train(trainingData []byte) {
fmt.Println(len(trainingData))
//from the minimum match size to the max ma
for index := 0; index < len(trainingData); index++ {
if (index % 1000) == 0 {
fmt.Print(",")
}
bt.root.AddByte(trainingData[index : index+1])
}
if bt.root.HitNumber >900000{
//bt.nodeSet()
//bt.curHighest = join(bt.curHighest,bt.nodeSet())
fmt.Println(malloc.GetStats())
bt.root.deleteLeafs(bt.root)
//bt.root = nil
bt.root = newTree()
//malloc.GC()
fmt.Println(malloc.GetStats())
fmt.Print("i should have collocted garbage")
//time.Sleep(1000)
}
}
//Returns the nthHighestNodes this is decied from the top down
//so if there are three nodes that are valid at the top level of the
tree
//and 2 lower down and the user asks for 5 nodes then it will return
//first the top one then the 2 lower ones
func (bt *BruteTreeTrainer) nthHighest(n int) []GeneData {
slice := bt.root.collapseLeafs()
//fmt.Println(slice)
gds := GeneDataSlice(slice)
gds.Sort()
if n > len(gds) {
n = len(gds)-1
}
return gds[0:n]
}
// nodeSet returns every collapsed leaf of the current tree, unsorted.
func (bt *BruteTreeTrainer) nodeSet() []GeneData {
	return bt.root.collapseLeafs()
}
func (trainer *BruteTreeTrainer) TrainOnDir(dir string) (err os.Error)
{
//open the directory
files, err := ioutil.ReadDir(dir)
if err != nil {
return err
}
for _, fileDesc := range (files) {
if fileDesc.IsRegular() {
file, err := os.Open(strings.Join([]string{dir, fileDesc.Name},
"/"), os.O_RDONLY, 0666)
if err != nil {
fmt.Println("oops!!", err)
}
fReader := io.Reader(file)
data, err := ioutil.ReadAll(fReader)
trainer.Train(data)
}
}
return
}
//func compareArray(bytes []byte, index, i int) (isThereGene bool) { }
func (trainer *BruteTreeTrainer) TrainOnDirPart(dir string, howMany
int) (err os.Error) {
//open the directory
files, err := ioutil.ReadDir(dir)
fmt.Println("trainging on dir ! = finished file . = finished pass , =
finished pass part")
if err != nil {
return err
}
for x := 0; x <= howMany; x++ {
fileDesc := files[x]
if fileDesc.IsRegular() {
file, err := os.Open(strings.Join([]string{dir, fileDesc.Name},
"/"), os.O_RDONLY, 0666)
if err != nil {
fmt.Println("oops!!", err)
}
fReader := io.Reader(file)
data, err := ioutil.ReadAll(fReader)
trainer.Train(data)
fmt.Print("!")
}
}
return
}
// needs to be implented
func (trainer *BruteTreeTrainer) GetGenes() (genes []genome.Gene, err
os.Error) {
genes = nil
err = nil
return
}
/*
func OpenTrainer(data io.Reader) (err os.Error) {
return
}
func Save(data io.Writer) (err os.Error) {}
*/
// Output prints the five highest-occurrence sequences found so far.
func (bt *BruteTreeTrainer) Output() {
	top := bt.nthHighest(5)
	fmt.Println("current Highest occurance", top)
}
> I am having trouble getting this program to garbage collect when i
> release the root node of the "geneTree" this happens in deleteLeafs
> and is started in train any ideas?
What are the symptoms of the problem?
> func (gn *GeneNode) deleteLeafs(root *GeneNode){
>
> for index, child := range(gn.GeneParts){
> //recurese this function down to its leaf
> if(child.Next!=nil){
> child.Next.deleteLeafs(root)
> //! fmt.Println("cumlitive geneData:", geneData)
> gn.GeneParts[index].Next = root
> }
> malloc.Free(
> }
> return
> }
Your function seems to be missing an argument to malloc.Free.
Ian
On Jan 19, 9:40 am, Ian Lance Taylor <i...@google.com> wrote:
> Eric Fode <ericf...@gmail.com> writes:
> > I am having trouble getting this program to garbage collect when i
> > release the root node of the "geneTree" this happens in deleteLeafs
> > and is started in train any ideas?
>
> What are the symptoms of the problem?
>
> > func (gn *GeneNode) deleteLeafs(root *GeneNode){
>
> > for index, child := range(gn.GeneParts){
> > //recurese this function down to its leaf
> > if(child.Next!=nil){
> > child.Next.deleteLeafs(root)
> > //! fmt.Println("cumlitive geneData:", geneData)
> > gn.GeneParts[index].Next = root
> > }
> >
> > }
> The symptoms are that when I set
> everything to nil from the bottom of the tree up
> it does not return the memory to the operating system after running the
> garbage collector
The way Go works at present, the garbage collector never returns
memory to the operating system. It frees up memory in its internal
data structures and reuses that space for future allocations.
Assuming there are no leaks in your program, what you should expect to
see is that the program grows to some size and then reaches a steady
state.
Ian
//bt.nodeSet()
//bt.curHighest = join(bt.curHighest,bt.nodeSet())
fmt.Println(malloc.GetStats())
bt.root.deleteLeafs(bt.root)
//bt.root = nil
bt.root = newTree()
//malloc.GC()
fmt.Println(malloc.GetStats())
fmt.Print("i should have collocted garbage")
//time.Sleep(1000)
the call to malloc.GC should be uncommented.
Otherwise the GC will happen the next time malloc decides
enough allocations have happened since the last collection.
I cut out the dead code from your program and told it to train on
$GOROOT/doc (full code below), and I get reasonable output from
alloc := malloc.GetStats().Alloc
bt.root.deleteLeafs(nil)
bt.root = nil
malloc.GC()
fmt.Println("Collected", alloc-malloc.GetStats().Alloc, "out of", alloc)
bt.root = new(GeneNode)
which prints
; 6g x.go
; 6l x.6
; 6.out
Collected 1237281024 out of 1248889616
Collected 1266371840 out of 1277115504
;
Russ
--- x.go
package main
import (
"fmt"
"io/ioutil"
"os"
"malloc"
)
const (
NextNodeBound = 2
MaxDepth = 15
)
type GenePart struct {
DataValue byte
HitNumber uint16
Next *GeneNode
}
type GeneData struct {
DataValue []byte
HitNumber uint16
}
type GeneNode struct {
DataValue byte
Depth uint8
HitNumber uint32
GeneParts [256]GenePart
}
type BruteTreeTrainer struct {
root *GeneNode
curHighest []GeneData
}
func NewBruteTreeTrainer() *BruteTreeTrainer {
return &BruteTreeTrainer{
new(GeneNode),
make([]GeneData, 0),
}
}
func (gn *GeneNode) deleteLeafs(root *GeneNode) {
for i := range gn.GeneParts {
p := &gn.GeneParts[i]
if p.Next != nil {
p.Next.deleteLeafs(root)
p.Next = root
}
}
}
func (gn *GeneNode) addNode(b byte) {
p := &gn.GeneParts[b]
p.Next = new(GeneNode)
p.Next.Depth = gn.Depth + 1
p.Next.DataValue = uint8(b)
for i, part := range p.Next.GeneParts {
part.DataValue = uint8(i)
part.HitNumber = 0
}
}
func (gn *GeneNode) AddByte(b []byte) {
p := &gn.GeneParts[b[0]]
p.HitNumber++
gn.HitNumber++
switch {
case p.HitNumber == NextNodeBound:
gn.addNode(b[0])
case p.HitNumber > NextNodeBound:
if cap(b) > 1 && gn.Depth < MaxDepth {
p.Next.AddByte(b[1:2])
}
}
}
func (bt *BruteTreeTrainer) Train(data []byte) {
for i := 0; i < len(data); i++ {
bt.root.AddByte(data[i : i+1])
}
if bt.root.HitNumber > 900000 {
alloc := malloc.GetStats().Alloc
bt.root.deleteLeafs(nil)
bt.root = nil
malloc.GC()
fmt.Println("Collected", alloc-malloc.GetStats().Alloc, "out of", alloc)
bt.root = new(GeneNode)
}
}
func (trainer *BruteTreeTrainer) TrainOnDir(dir string) (err os.Error) {
dirs, err := ioutil.ReadDir(dir)
if err != nil {
return err
}
for _, d := range dirs {
if d.IsRegular() {
file, err := os.Open(dir+"/"+d.Name, os.O_RDONLY, 0666)
if err != nil {
fmt.Println("oops!!", err)
}
data, err := ioutil.ReadAll(file)
trainer.Train(data)
}
}
return
}
func main() {
bt := NewBruteTreeTrainer()
bt.TrainOnDir(os.Getenv("GOROOT") + "/doc")
}
The garbage collector reclaims cyclic data too.
In the program I posted before, if I remove the call
to deleteLeafs, I get:
; 6.out
Collected 1087808512 out of 1248889648
Collected 1243567216 out of 1411300608
;
The collector probably sees some garbage on the
stack that looks like pointers and keeps it from
reclaiming as much as it did with deleteLeafs, but
it will get the rest on a future round.
> Is there any way to use memory pages?
I don't understand the question.
Normal allocation uses memory, in pages.
Russ
Swap doesn't work that way.
It sounds to me like you're looking to make a file.
- jessta
--
=====================
http://jessta.id.au
Are you talking about memory-mapped files?
http://en.wikipedia.org/wiki/Memory-mapped_file
> I've run in to this in other garbage-collected language
> implementations and still fail to understand why anyone thinks this
> is advisable behavior versus returning the memory to the OS.
The heap works this way in most languages by default and in most
applications which do their own memory management. It's because most
programs tend to reach a steady state in memory usage. Returning
memory to the OS is often soon followed by allocating memory from the
OS. Both OS operations are significantly more expensive than managing
a free list in allocated memory.
Ian
Memory is used for a lot more things than your precious VM, and the OS is there and highly tuned to optimize its usage and behavior- so make use of it :)
-SS
--
NUNQUAM NON PARATUS
V: 408.718.6290
It of course makes sense to maintain a bit of app-internal memory overhead to accommodate the thrashing needs of the app, but even just a simple last_access->timeout->free is all that's needed to let the OS do its job while still preserving that functionality-
--
NUNQUAM NON PARATUS
V: 408.718.6290
What problem do you see in "never returned to the OS"? If a process
doesn't use some of it's allocated virtual memory space (and possibly
marks it as not needing secondary storage backing) then this costs
virtually zero real memory as soon as other processes demands
allocations of more than available real memory. And if they don't,
then (also) nothing is lost.
-bflm
But is it still scanned by the conservative GC?
--
Dr Jon Harrop, Flying Frog Consultancy Ltd.
http://www.ffconsultancy.com/?e
My point is that the OS cannot swap out the unused space if it is being
scanned by the GC. The GC will force it to remain resident.
> On Wednesday 20 January 2010 15:45:48 Fode wrote:
>> I am not sure what you are asking, but from the looks of the bug report
>> that i posted a link to i would say that while it may be scanned it is not
>> really collected
>
> My point is that the OS cannot swap out the unused space if it is being
> scanned by the GC. The GC will force it to remain resident.
The GC is being rewritten, so the current GC isn't too meaningful. In
any case, the current GC only scans allocated blocks. Of course,
there may be fragmentation.
Ian
> is that a temporary fix we can work on to make it so that it scans the stack
> when it starts to get too big? or am I misunderstanding how it works?
Sorry, I think I lost some context. I'm not sure quite what the
problem is.
The garbage collection code is mainly in src/pkg/runtime/mgc0.c. The
data structures are documented in src/pkg/runtime/malloc.h. Please
feel free to take a look, with the understanding that much of it is
going to change.
Ian
No, the Go runtime's garbage collector does not scan
freed memory. In fact I am not aware of any garbage
collection algorithm that does. (What would it hope to find?)
Russ
You seem to be talking about the bug that keeps blocks
from being freed, because something on the current stack
looks like a pointer into the block, either because of old
temporary variables or accidents (integers that look like pointers).
When we do replace the garbage collector, we plan to take a
closer look at which it is. I suspect the former, and it may be
cheap and worthwhile to eagerly zero out temporary locations
used for pointers once they are dead. Or it may not, but we're
certainly going to look at that.
Everyone else seems to be talking about the fact that freed
memory is held in reserve for future allocation rather than
returned to the OS. As Ian said, it's common behavior and
generally not considered a bug. That said, the Go memory
allocator is based on Google's open source tcmalloc allocator,
which does have mechanisms for releasing memory back to the OS,
using timing and other considerations to make good decisions.
I expect that once we're happy with the garbage collector it may
make sense to adopt some of those mechanisms. Or it may not,
since they are tuned to manual allocation and freeing, but again
we're going to look at it.
Russ