@ -26,7 +26,6 @@ import (
"os"
"os"
"path"
"path"
"path/filepath"
"path/filepath"
"strings"
"sync"
"sync"
"time"
"time"
)
)
@ -37,7 +36,14 @@ func init() {
type commandVolumeFsck struct {
type commandVolumeFsck struct {
env * CommandEnv
env * CommandEnv
writer io . Writer
bucketsPath string
collection * string
volumeId * uint
tempFolder string
verbose * bool
forcePurging * bool
forcePurging * bool
findMissingChunksInFiler * bool
}
}
func ( c * commandVolumeFsck ) Name ( ) string {
func ( c * commandVolumeFsck ) Name ( ) string {
@ -67,10 +73,10 @@ func (c *commandVolumeFsck) Help() string {
func ( c * commandVolumeFsck ) Do ( args [ ] string , commandEnv * CommandEnv , writer io . Writer ) ( err error ) {
func ( c * commandVolumeFsck ) Do ( args [ ] string , commandEnv * CommandEnv , writer io . Writer ) ( err error ) {
fsckCommand := flag . NewFlagSet ( c . Name ( ) , flag . ContinueOnError )
fsckCommand := flag . NewFlagSet ( c . Name ( ) , flag . ContinueOnError )
verbose : = fsckCommand . Bool ( "v" , false , "verbose mode" )
findMissingChunksInFiler : = fsckCommand . Bool ( "findMissingChunksInFiler" , false , "see \"help volume.fsck\"" )
findMissingChunksInFilerPath : = fsckCommand . String ( "findMissingChunksInFilerPath " , "/ " , "used together with findMissingChunksInFiler " )
findMissingChunksInVolumeId := fsckCommand . Int ( "findMissingChunksInV olumeId", 0 , "used together with findMissingChunksInFiler " )
c . verbose = fsckCommand . Bool ( "v" , false , "verbose mode" )
c . findMissingChunksInFiler = fsckCommand . Bool ( "findMissingChunksInFiler" , false , "see \"help volume.fsck\"" )
c . collection = fsckCommand . String ( "collection " , "" , "the collection name " )
c . volumeId = fsckCommand . Uint ( "v olumeId", 0 , "the volume id " )
applyPurging := fsckCommand . Bool ( "reallyDeleteFromVolume" , false , "<expert only!> after detection, delete missing data from volumes / delete missing file entries from filer. Currently this only works with default filerGroup." )
applyPurging := fsckCommand . Bool ( "reallyDeleteFromVolume" , false , "<expert only!> after detection, delete missing data from volumes / delete missing file entries from filer. Currently this only works with default filerGroup." )
c . forcePurging = fsckCommand . Bool ( "forcePurging" , false , "delete missing data from volumes in one replica used together with applyPurging" )
c . forcePurging = fsckCommand . Bool ( "forcePurging" , false , "delete missing data from volumes in one replica used together with applyPurging" )
purgeAbsent := fsckCommand . Bool ( "reallyDeleteFilerEntries" , false , "<expert only!> delete missing file entries from filer if the corresponding volume is missing for any reason, please ensure all still existing/expected volumes are connected! used together with findMissingChunksInFiler" )
purgeAbsent := fsckCommand . Bool ( "reallyDeleteFilerEntries" , false , "<expert only!> delete missing file entries from filer if the corresponding volume is missing for any reason, please ensure all still existing/expected volumes are connected! used together with findMissingChunksInFiler" )
@ -86,76 +92,78 @@ func (c *commandVolumeFsck) Do(args []string, commandEnv *CommandEnv, writer io.
}
}
c . env = commandEnv
c . env = commandEnv
c . writer = writer
c . bucketsPath , err = readFilerBucketsPath ( commandEnv )
if err != nil {
return fmt . Errorf ( "read filer buckets path: %v" , err )
}
// create a temp folder
// create a temp folder
tempFolder , err := os . MkdirTemp ( * tempPath , "sw_fsck" )
c . tempFolder , err = os . MkdirTemp ( * tempPath , "sw_fsck" )
if err != nil {
if err != nil {
return fmt . Errorf ( "failed to create temp folder: %v" , err )
return fmt . Errorf ( "failed to create temp folder: %v" , err )
}
}
if * verbose {
fmt . Fprintf ( writer , "working directory: %s\n" , tempFolder )
if * c . verbose {
fmt . Fprintf ( c . writer , "working directory: %s\n" , c . tempFolder )
}
}
defer os . RemoveAll ( tempFolder )
defer os . RemoveAll ( c . tempFolder )
// collect all volume id locations
// collect all volume id locations
dataNodeVolumeIdToVInfo , err := c . collectVolumeIds ( commandEnv , * verbose , writer )
dataNodeVolumeIdToVInfo , err := c . collectVolumeIds ( )
if err != nil {
if err != nil {
return fmt . Errorf ( "failed to collect all volume locations: %v" , err )
return fmt . Errorf ( "failed to collect all volume locations: %v" , err )
}
}
isBucketsPath := false
var fillerBucketsPath string
if * findMissingChunksInFiler && * findMissingChunksInFilerPath != "/" {
fillerBucketsPath , err = readFilerBucketsPath ( commandEnv )
if err != nil {
return fmt . Errorf ( "read filer buckets path: %v" , err )
}
if strings . HasPrefix ( * findMissingChunksInFilerPath , fillerBucketsPath ) {
isBucketsPath = true
}
}
if err != nil {
if err != nil {
return fmt . Errorf ( "read filer buckets path: %v" , err )
return fmt . Errorf ( "read filer buckets path: %v" , err )
}
}
collectMtime := time . Now ( ) . Unix ( )
collectMtime := time . Now ( ) . UnixNano ( )
// collect each volume file ids
// collect each volume file ids
for dataNodeId , volumeIdToVInfo := range dataNodeVolumeIdToVInfo {
for dataNodeId , volumeIdToVInfo := range dataNodeVolumeIdToVInfo {
for volumeId , vinfo := range volumeIdToVInfo {
for volumeId , vinfo := range volumeIdToVInfo {
if * findMissingChunksInV olumeId > 0 && uint32 ( * findMissingChunksInV olumeId) != volumeId {
if * c . volumeId > 0 && uint32 ( * c . volumeId ) != volumeId {
delete ( volumeIdToVInfo , volumeId )
delete ( volumeIdToVInfo , volumeId )
continue
continue
}
}
if isBucketsPath && ! strings . HasPrefix ( * findMissingChunksInFilerPath , fillerBucketsPath + "/" + vinfo . collection ) {
// or skip /topics/.system/log without collection name
if ( * c . collection != "" && vinfo . collection != * c . collection ) || vinfo . collection == "" {
delete ( volumeIdToVInfo , volumeId )
delete ( volumeIdToVInfo , volumeId )
continue
continue
}
}
if * c . volumeId > 0 && * c . collection == "" {
* c . collection = vinfo . collection
}
cutoffFrom := time . Now ( ) . Add ( - * cutoffTimeAgo ) . UnixNano ( )
cutoffFrom := time . Now ( ) . Add ( - * cutoffTimeAgo ) . UnixNano ( )
err = c . collectOneVolumeFileIds ( tempFolder , dataNodeId , volumeId , vinfo , * verbose , writer , uint64 ( cutoffFrom ) )
err = c . collectOneVolumeFileIds ( dataNodeId , volumeId , vinfo , uint64 ( cutoffFrom ) )
if err != nil {
if err != nil {
return fmt . Errorf ( "failed to collect file ids from volume %d on %s: %v" , volumeId , vinfo . server , err )
return fmt . Errorf ( "failed to collect file ids from volume %d on %s: %v" , volumeId , vinfo . server , err )
}
}
}
}
if * c . verbose {
fmt . Fprintf ( c . writer , "dn %+v filtred %d volumes and locations.\n" , dataNodeId , len ( dataNodeVolumeIdToVInfo [ dataNodeId ] ) )
}
}
}
if * findMissingChunksInFiler {
if * c . findMissingChunksInFiler {
// collect all filer file ids and paths
// collect all filer file ids and paths
if err = c . collectFilerFileIdAndPaths ( dataNodeVolumeIdToVInfo , tempFolder , writer , * findMissingChunksInFilerPath , * verbose , * purgeAbsent , collectMtime ) ; err != nil {
if err = c . collectFilerFileIdAndPaths ( dataNodeVolumeIdToVInfo , * purgeAbsent , collectMtime ) ; err != nil {
return fmt . Errorf ( "collectFilerFileIdAndPaths: %v" , err )
return fmt . Errorf ( "collectFilerFileIdAndPaths: %v" , err )
}
}
for dataNodeId , volumeIdToVInfo := range dataNodeVolumeIdToVInfo {
for dataNodeId , volumeIdToVInfo := range dataNodeVolumeIdToVInfo {
// for each volume, check filer file ids
// for each volume, check filer file ids
if err = c . findFilerChunksMissingInVolumeServers ( volumeIdToVInfo , tempFolder , dataNodeId , writer , * verbose , * applyPurging ) ; err != nil {
if err = c . findFilerChunksMissingInVolumeServers ( volumeIdToVInfo , dataNodeId , * applyPurging ) ; err != nil {
return fmt . Errorf ( "findFilerChunksMissingInVolumeServers: %v" , err )
return fmt . Errorf ( "findFilerChunksMissingInVolumeServers: %v" , err )
}
}
}
}
} else {
} else {
// collect all filer file ids
// collect all filer file ids
if err = c . collectFilerFileIds ( dataNodeVolumeIdToVInfo , tempFolder , writer , * verbos e) ; err != nil {
if err = c . collectFilerFileIdAndPath s ( dataNodeVolumeIdToVInfo , false , collectMtim e) ; err != nil {
return fmt . Errorf ( "failed to collect file ids from filer: %v" , err )
return fmt . Errorf ( "failed to collect file ids from filer: %v" , err )
}
}
// volume file ids subtract filer file ids
// volume file ids subtract filer file ids
if err = c . findExtraChunksInVolumeServers ( dataNodeVolumeIdToVInfo , tempFolder , writer , * verbose , * applyPurging ) ; err != nil {
if err = c . findExtraChunksInVolumeServers ( dataNodeVolumeIdToVInfo , * applyPurging ) ; err != nil {
return fmt . Errorf ( "findExtraChunksInVolumeServers: %v" , err )
return fmt . Errorf ( "findExtraChunksInVolumeServers: %v" , err )
}
}
}
}
@ -163,10 +171,9 @@ func (c *commandVolumeFsck) Do(args []string, commandEnv *CommandEnv, writer io.
return nil
return nil
}
}
func ( c * commandVolumeFsck ) collectFilerFileIdAndPaths ( dataNodeVolumeIdToVInfo map [ string ] map [ uint32 ] VInfo , tempFolder string , writer io . Writer , filerPath string , verbose bool , purgeAbsent bool , collectMtime int64 ) error {
if verbose {
fmt . Fprintf ( writer , "checking each file from filer ...\n" )
func ( c * commandVolumeFsck ) collectFilerFileIdAndPaths ( dataNodeVolumeIdToVInfo map [ string ] map [ uint32 ] VInfo , purgeAbsent bool , collectMtime int64 ) error {
if * c . verbose {
fmt . Fprintf ( c . writer , "checking each file from filer path %s...\n" , c . getCollectFilerFilePath ( ) )
}
}
files := make ( map [ uint32 ] * os . File )
files := make ( map [ uint32 ] * os . File )
@ -175,9 +182,9 @@ func (c *commandVolumeFsck) collectFilerFileIdAndPaths(dataNodeVolumeIdToVInfo m
if _ , ok := files [ vid ] ; ok {
if _ , ok := files [ vid ] ; ok {
continue
continue
}
}
dst , openErr := os . OpenFile ( getFilerFileIdFile ( tempFolder , vid ) , os . O_WRONLY | os . O_CREATE | os . O_TRUNC , 0644 )
dst , openErr := os . OpenFile ( getFilerFileIdFile ( c . tempFolder , vid ) , os . O_WRONLY | os . O_CREATE | os . O_TRUNC , 0644 )
if openErr != nil {
if openErr != nil {
return fmt . Errorf ( "failed to create file %s: %v" , getFilerFileIdFile ( tempFolder , vid ) , openErr )
return fmt . Errorf ( "failed to create file %s: %v" , getFilerFileIdFile ( c . tempFolder , vid ) , openErr )
}
}
files [ vid ] = dst
files [ vid ] = dst
}
}
@ -188,19 +195,14 @@ func (c *commandVolumeFsck) collectFilerFileIdAndPaths(dataNodeVolumeIdToVInfo m
}
}
} ( )
} ( )
type Item struct {
vid uint32
fileKey uint64
cookie uint32
path util . FullPath
}
return doTraverseBfsAndSaving ( c . env , nil , filerPath , false , func ( entry * filer_pb . FullEntry , outputChan chan interface { } ) ( err error ) {
if verbose && entry . Entry . IsDirectory {
fmt . Fprintf ( writer , "checking directory %s\n" , util . NewFullPath ( entry . Dir , entry . Entry . Name ) )
return doTraverseBfsAndSaving ( c . env , nil , c . getCollectFilerFilePath ( ) , false ,
func ( entry * filer_pb . FullEntry , outputChan chan interface { } ) ( err error ) {
if * c . verbose && entry . Entry . IsDirectory {
fmt . Fprintf ( c . writer , "checking directory %s\n" , util . NewFullPath ( entry . Dir , entry . Entry . Name ) )
}
}
dataChunks , manifestChunks , resolveErr := filer . ResolveChunkManifest ( filer . LookupFn ( c . env ) , entry . Entry . Chunks , 0 , math . MaxInt64 )
dataChunks , manifestChunks , resolveErr := filer . ResolveChunkManifest ( filer . LookupFn ( c . env ) , entry . Entry . Chunks , 0 , math . MaxInt64 )
if resolveErr != nil {
if resolveErr != nil {
return nil
return fmt . Errorf ( "failed to ResolveChunkManifest: %+v" , resolveErr )
}
}
dataChunks = append ( dataChunks , manifestChunks ... )
dataChunks = append ( dataChunks , manifestChunks ... )
for _ , chunk := range dataChunks {
for _ , chunk := range dataChunks {
@ -215,7 +217,8 @@ func (c *commandVolumeFsck) collectFilerFileIdAndPaths(dataNodeVolumeIdToVInfo m
}
}
}
}
return nil
return nil
} , func ( outputChan chan interface { } ) {
} ,
func ( outputChan chan interface { } ) {
buffer := make ( [ ] byte , 16 )
buffer := make ( [ ] byte , 16 )
for item := range outputChan {
for item := range outputChan {
i := item . ( * Item )
i := item . ( * Item )
@ -225,23 +228,21 @@ func (c *commandVolumeFsck) collectFilerFileIdAndPaths(dataNodeVolumeIdToVInfo m
util . Uint32toBytes ( buffer [ 12 : ] , uint32 ( len ( i . path ) ) )
util . Uint32toBytes ( buffer [ 12 : ] , uint32 ( len ( i . path ) ) )
f . Write ( buffer )
f . Write ( buffer )
f . Write ( [ ] byte ( i . path ) )
f . Write ( [ ] byte ( i . path ) )
// fmt.Fprintf(writer, "%d,%x%08x %d %s\n", i.vid, i.fileKey, i.cookie, len(i.path), i.path)
} else {
fmt . Fprintf ( writer , "%d,%x%08x %s volume not found\n" , i . vid , i . fileKey , i . cookie , i . path )
} else if * c . findMissingChunksInFiler && * c . volumeId == 0 {
fmt . Fprintf ( c . writer , "%d,%x%08x %s volume not found\n" , i . vid , i . fileKey , i . cookie , i . path )
if purgeAbsent {
if purgeAbsent {
fmt . Printf ( "deleting path %s after volume not found" , i . path )
fmt . Printf ( "deleting path %s after volume not found" , i . path )
c . httpDelete ( i . path , verbose )
c . httpDelete ( i . path )
}
}
}
}
}
}
} )
} )
}
}
func ( c * commandVolumeFsck ) findFilerChunksMissingInVolumeServers ( volumeIdToVInfo map [ uint32 ] VInfo , tempFolder string , dataNodeId string , writer io . Writer , verbose bool , applyPurging bool ) error {
func ( c * commandVolumeFsck ) findFilerChunksMissingInVolumeServers ( volumeIdToVInfo map [ uint32 ] VInfo , dataNodeId string , applyPurging bool ) error {
for volumeId , vinfo := range volumeIdToVInfo {
for volumeId , vinfo := range volumeIdToVInfo {
checkErr := c . oneVolumeFileIdsCheckOneVolume ( tempFolder , dataNodeId , volumeId , writer , verbose , applyPurging )
checkErr := c . oneVolumeFileIdsCheckOneVolume ( dataNodeId , volumeId , applyPurging )
if checkErr != nil {
if checkErr != nil {
return fmt . Errorf ( "failed to collect file ids from volume %d on %s: %v" , volumeId , vinfo . server , checkErr )
return fmt . Errorf ( "failed to collect file ids from volume %d on %s: %v" , volumeId , vinfo . server , checkErr )
}
}
@ -249,7 +250,7 @@ func (c *commandVolumeFsck) findFilerChunksMissingInVolumeServers(volumeIdToVInf
return nil
return nil
}
}
func ( c * commandVolumeFsck ) findExtraChunksInVolumeServers ( dataNodeVolumeIdToVInfo map [ string ] map [ uint32 ] VInfo , tempFolder string , writer io . Writer , verbose bool , applyPurging bool ) error {
func ( c * commandVolumeFsck ) findExtraChunksInVolumeServers ( dataNodeVolumeIdToVInfo map [ string ] map [ uint32 ] VInfo , applyPurging bool ) error {
var totalInUseCount , totalOrphanChunkCount , totalOrphanDataSize uint64
var totalInUseCount , totalOrphanChunkCount , totalOrphanDataSize uint64
volumeIdOrphanFileIds := make ( map [ uint32 ] map [ string ] bool )
volumeIdOrphanFileIds := make ( map [ uint32 ] map [ string ] bool )
@ -259,7 +260,7 @@ func (c *commandVolumeFsck) findExtraChunksInVolumeServers(dataNodeVolumeIdToVIn
serverReplicas := make ( map [ uint32 ] [ ] pb . ServerAddress )
serverReplicas := make ( map [ uint32 ] [ ] pb . ServerAddress )
for dataNodeId , volumeIdToVInfo := range dataNodeVolumeIdToVInfo {
for dataNodeId , volumeIdToVInfo := range dataNodeVolumeIdToVInfo {
for volumeId , vinfo := range volumeIdToVInfo {
for volumeId , vinfo := range volumeIdToVInfo {
inUseCount , orphanFileIds , orphanDataSize , checkErr := c . oneVolumeFileIdsSubtractFilerFileIds ( tempFolder , dataNodeId , volumeId , writer , verbose )
inUseCount , orphanFileIds , orphanDataSize , checkErr := c . oneVolumeFileIdsSubtractFilerFileIds ( dataNodeId , volumeId , vinfo . collection )
if checkErr != nil {
if checkErr != nil {
return fmt . Errorf ( "failed to collect file ids from volume %d on %s: %v" , volumeId , vinfo . server , checkErr )
return fmt . Errorf ( "failed to collect file ids from volume %d on %s: %v" , volumeId , vinfo . server , checkErr )
}
}
@ -282,9 +283,9 @@ func (c *commandVolumeFsck) findExtraChunksInVolumeServers(dataNodeVolumeIdToVIn
totalOrphanChunkCount += uint64 ( len ( orphanFileIds ) )
totalOrphanChunkCount += uint64 ( len ( orphanFileIds ) )
totalOrphanDataSize += orphanDataSize
totalOrphanDataSize += orphanDataSize
if verbose {
if * c . verbose {
for _ , fid := range orphanFileIds {
for _ , fid := range orphanFileIds {
fmt . Fprintf ( writer , "%s\n" , fid )
fmt . Fprintf ( c . writer , "%s\n" , fid )
}
}
}
}
isEcVolumeReplicas [ volumeId ] = vinfo . isEcVolume
isEcVolumeReplicas [ volumeId ] = vinfo . isEcVolume
@ -307,12 +308,12 @@ func (c *commandVolumeFsck) findExtraChunksInVolumeServers(dataNodeVolumeIdToVIn
if ! ( len ( orphanFileIds ) > 0 ) {
if ! ( len ( orphanFileIds ) > 0 ) {
continue
continue
}
}
if verbose {
fmt . Fprintf ( writer , "purging process for volume %d.\n" , volumeId )
if * c . verbose {
fmt . Fprintf ( c . writer , "purging process for volume %d.\n" , volumeId )
}
}
if isEcVolumeReplicas [ volumeId ] {
if isEcVolumeReplicas [ volumeId ] {
fmt . Fprintf ( writer , "skip purging for Erasure Coded volume %d.\n" , volumeId )
fmt . Fprintf ( c . writer , "skip purging for Erasure Coded volume %d.\n" , volumeId )
continue
continue
}
}
for _ , server := range serverReplicas [ volumeId ] {
for _ , server := range serverReplicas [ volumeId ] {
@ -323,17 +324,17 @@ func (c *commandVolumeFsck) findExtraChunksInVolumeServers(dataNodeVolumeIdToVIn
if err != nil {
if err != nil {
return fmt . Errorf ( "mark volume %d read/write: %v" , volumeId , err )
return fmt . Errorf ( "mark volume %d read/write: %v" , volumeId , err )
}
}
fmt . Fprintf ( writer , "temporarily marked %d on server %v writable for forced purge\n" , volumeId , server )
fmt . Fprintf ( c . writer , "temporarily marked %d on server %v writable for forced purge\n" , volumeId , server )
defer markVolumeWritable ( c . env . option . GrpcDialOption , needleVID , server , false )
defer markVolumeWritable ( c . env . option . GrpcDialOption , needleVID , server , false )
fmt . Fprintf ( writer , "marked %d on server %v writable for forced purge\n" , volumeId , server )
fmt . Fprintf ( c . writer , "marked %d on server %v writable for forced purge\n" , volumeId , server )
}
}
if verbose {
fmt . Fprintf ( writer , "purging files from volume %d\n" , volumeId )
if * c . verbose {
fmt . Fprintf ( c . writer , "purging files from volume %d\n" , volumeId )
}
}
if err := c . purgeFileIdsForOneVolume ( volumeId , orphanFileIds , writer ) ; err != nil {
if err := c . purgeFileIdsForOneVolume ( volumeId , orphanFileIds ) ; err != nil {
return fmt . Errorf ( "purging volume %d: %v" , volumeId , err )
return fmt . Errorf ( "purging volume %d: %v" , volumeId , err )
}
}
}
}
@ -342,28 +343,27 @@ func (c *commandVolumeFsck) findExtraChunksInVolumeServers(dataNodeVolumeIdToVIn
if ! applyPurging {
if ! applyPurging {
pct := float64 ( totalOrphanChunkCount * 100 ) / ( float64 ( totalOrphanChunkCount + totalInUseCount ) )
pct := float64 ( totalOrphanChunkCount * 100 ) / ( float64 ( totalOrphanChunkCount + totalInUseCount ) )
fmt . Fprintf ( writer , "\nTotal\t\tentries:%d\torphan:%d\t%.2f%%\t%dB\n" ,
fmt . Fprintf ( c . writer , "\nTotal\t\tentries:%d\torphan:%d\t%.2f%%\t%dB\n" ,
totalOrphanChunkCount + totalInUseCount , totalOrphanChunkCount , pct , totalOrphanDataSize )
totalOrphanChunkCount + totalInUseCount , totalOrphanChunkCount , pct , totalOrphanDataSize )
fmt . Fprintf ( writer , "This could be normal if multiple filers or no filers are used.\n" )
fmt . Fprintf ( c . writer , "This could be normal if multiple filers or no filers are used.\n" )
}
}
if totalOrphanChunkCount == 0 {
if totalOrphanChunkCount == 0 {
fmt . Fprintf ( writer , "no orphan data\n" )
//return nil
fmt . Fprintf ( c . writer , "no orphan data\n" )
}
}
return nil
return nil
}
}
func ( c * commandVolumeFsck ) collectOneVolumeFileIds ( tempFolder string , dataNodeId string , volumeId uint32 , vinfo VInfo , verbose bool , writer io . Writer , cutoffFrom uint64 ) error {
func ( c * commandVolumeFsck ) collectOneVolumeFileIds ( dataNodeId string , volumeId uint32 , vinfo VInfo , cutoffFrom uint64 ) error {
if verbose {
fmt . Fprintf ( writer , "collecting volume %d file ids from %s ...\n" , volumeId , vinfo . server )
if * c . verbose {
fmt . Fprintf ( c . writer , "collecting volume %d file ids from %s ...\n" , volumeId , vinfo . server )
}
}
return operation . WithVolumeServerClient ( false , vinfo . server , c . env . option . GrpcDialOption , func ( volumeServerClient volume_server_pb . VolumeServerClient ) error {
return operation . WithVolumeServerClient ( false , vinfo . server , c . env . option . GrpcDialOption ,
func ( volumeServerClient volume_server_pb . VolumeServerClient ) error {
ext := ".idx"
ext := ".idx"
if vinfo . isEcVolume {
if vinfo . isEcVolume {
ext = ".ecx"
ext = ".ecx"
@ -395,7 +395,8 @@ func (c *commandVolumeFsck) collectOneVolumeFileIds(tempFolder string, dataNodeI
}
}
fileredBuf := filterDeletedNeedleFromIdx ( buf . Bytes ( ) )
fileredBuf := filterDeletedNeedleFromIdx ( buf . Bytes ( ) )
if vinfo . isReadOnly == false {
if vinfo . isReadOnly == false {
index , err := idx . FirstInvalidIndex ( fileredBuf . Bytes ( ) , func ( key types . NeedleId , offset types . Offset , size types . Size ) ( bool , error ) {
index , err := idx . FirstInvalidIndex ( fileredBuf . Bytes ( ) ,
func ( key types . NeedleId , offset types . Offset , size types . Size ) ( bool , error ) {
resp , err := volumeServerClient . ReadNeedleMeta ( context . Background ( ) , & volume_server_pb . ReadNeedleMetaRequest {
resp , err := volumeServerClient . ReadNeedleMeta ( context . Background ( ) , & volume_server_pb . ReadNeedleMetaRequest {
VolumeId : volumeId ,
VolumeId : volumeId ,
NeedleId : uint64 ( key ) ,
NeedleId : uint64 ( key ) ,
@ -408,12 +409,12 @@ func (c *commandVolumeFsck) collectOneVolumeFileIds(tempFolder string, dataNodeI
return resp . LastModified <= cutoffFrom , nil
return resp . LastModified <= cutoffFrom , nil
} )
} )
if err != nil {
if err != nil {
fmt . Fprintf ( writer , "Failed to search for last valid index on volume %d with error %v" , volumeId , err )
fmt . Fprintf ( c . writer , "Failed to search for last valid index on volume %d with error %v" , volumeId , err )
} else {
} else {
fileredBuf . Truncate ( index * types . NeedleMapEntrySize )
fileredBuf . Truncate ( index * types . NeedleMapEntrySize )
}
}
}
}
idxFilename := getVolumeFileIdFile ( tempFolder , dataNodeId , volumeId )
idxFilename := getVolumeFileIdFile ( c . tempFolder , dataNodeId , volumeId )
err = writeToFile ( fileredBuf . Bytes ( ) , idxFilename )
err = writeToFile ( fileredBuf . Bytes ( ) , idxFilename )
if err != nil {
if err != nil {
return fmt . Errorf ( "failed to copy %d%s from %s: %v" , volumeId , ext , vinfo . server , err )
return fmt . Errorf ( "failed to copy %d%s from %s: %v" , volumeId , ext , vinfo . server , err )
@ -424,121 +425,79 @@ func (c *commandVolumeFsck) collectOneVolumeFileIds(tempFolder string, dataNodeI
}
}
func ( c * commandVolumeFsck ) collectFilerFileIds ( dataNodeVolumeIdToVInfo map [ string ] map [ uint32 ] VInfo , tempFolder string , writer io . Writer , verbose bool ) error {
if verbose {
fmt . Fprintf ( writer , "collecting file ids from filer ...\n" )
}
files := make ( map [ uint32 ] * os . File )
for _ , volumeIdToServer := range dataNodeVolumeIdToVInfo {
for vid := range volumeIdToServer {
dst , openErr := os . OpenFile ( getFilerFileIdFile ( tempFolder , vid ) , os . O_WRONLY | os . O_CREATE | os . O_TRUNC , 0644 )
if openErr != nil {
return fmt . Errorf ( "failed to create file %s: %v" , getFilerFileIdFile ( tempFolder , vid ) , openErr )
}
files [ vid ] = dst
}
}
defer func ( ) {
for _ , f := range files {
f . Close ( )
}
} ( )
type Item struct {
type Item struct {
vid uint32
vid uint32
fileKey uint64
fileKey uint64
}
return doTraverseBfsAndSaving ( c . env , nil , "/" , false , func ( entry * filer_pb . FullEntry , outputChan chan interface { } ) ( err error ) {
dataChunks , manifestChunks , resolveErr := filer . ResolveChunkManifest ( filer . LookupFn ( c . env ) , entry . Entry . Chunks , 0 , math . MaxInt64 )
if resolveErr != nil {
if verbose {
fmt . Fprintf ( writer , "resolving manifest chunks in %s: %v\n" , util . NewFullPath ( entry . Dir , entry . Entry . Name ) , resolveErr )
}
return nil
}
dataChunks = append ( dataChunks , manifestChunks ... )
for _ , chunk := range dataChunks {
outputChan <- & Item {
vid : chunk . Fid . VolumeId ,
fileKey : chunk . Fid . FileKey ,
}
}
return nil
} , func ( outputChan chan interface { } ) {
buffer := make ( [ ] byte , 8 )
for item := range outputChan {
i := item . ( * Item )
util . Uint64toBytes ( buffer , i . fileKey )
files [ i . vid ] . Write ( buffer )
}
} )
}
func ( c * commandVolumeFsck ) oneVolumeFileIdsCheckOneVolume ( tempFolder string , dataNodeId string , volumeId uint32 , writer io . Writer , verbose bool , applyPurging bool ) ( err error ) {
if verbose {
fmt . Fprintf ( writer , "find missing file chunks in dataNodeId %s volume %d ...\n" , dataNodeId , volumeId )
}
db := needle_map . NewMemDb ( )
defer db . Close ( )
if err = db . LoadFromIdx ( getVolumeFileIdFile ( tempFolder , dataNodeId , volumeId ) ) ; err != nil {
return
cookie uint32
path util . FullPath
}
}
file := getFilerFileIdFile ( tempFolder , volumeId )
fp , err := os . Open ( file )
func ( c * commandVolumeFsck ) readFilerFileIdFile ( volumeId uint32 , fn func ( needleId types . NeedleId , itemPath util . FullPath ) ) error {
fp , err := os . Open ( getFilerFileIdFile ( c . tempFolder , volumeId ) )
if err != nil {
if err != nil {
return
return err
}
}
defer fp . Close ( )
defer fp . Close ( )
type Item struct {
fileKey uint64
cookie uint32
path util . FullPath
}
br := bufio . NewReader ( fp )
br := bufio . NewReader ( fp )
buffer := make ( [ ] byte , 16 )
buffer := make ( [ ] byte , 16 )
item := & Item { }
var readSize int
var readSize int
var readErr error
item := & Item { vid : volumeId }
for {
for {
readSize , err = io . ReadFull ( br , buffer )
if err != nil || readSize != 16 {
readSize , readErr = io . ReadFull ( br , buffer )
if errors . Is ( readErr , io . EOF ) {
break
break
}
}
if readErr != nil {
return readErr
}
if readSize != 16 {
return fmt . Errorf ( "readSize mismatch" )
}
item . fileKey = util . BytesToUint64 ( buffer [ : 8 ] )
item . fileKey = util . BytesToUint64 ( buffer [ : 8 ] )
item . cookie = util . BytesToUint32 ( buffer [ 8 : 12 ] )
item . cookie = util . BytesToUint32 ( buffer [ 8 : 12 ] )
pathSize := util . BytesToUint32 ( buffer [ 12 : 16 ] )
pathSize := util . BytesToUint32 ( buffer [ 12 : 16 ] )
pathBytes := make ( [ ] byte , int ( pathSize ) )
pathBytes := make ( [ ] byte , int ( pathSize ) )
n , err := io . ReadFull ( br , pathBytes )
n , err := io . ReadFull ( br , pathBytes )
if err != nil {
if err != nil {
fmt . Fprintf ( writer , "%d,%x%08x in unexpected error: %v\n" , volumeId , item . fileKey , item . cookie , err )
fmt . Fprintf ( c . writer , "%d,%x%08x in unexpected error: %v\n" , volumeId , item . fileKey , item . cookie , err )
}
}
if n != int ( pathSize ) {
if n != int ( pathSize ) {
fmt . Fprintf ( writer , "%d,%x%08x %d unexpected file name size %d\n" , volumeId , item . fileKey , item . cookie , pathSize , n )
fmt . Fprintf ( c . writer , "%d,%x%08x %d unexpected file name size %d\n" , volumeId , item . fileKey , item . cookie , pathSize , n )
}
}
item . path = util . FullPath ( string ( pathBytes ) )
item . path = util . FullPath ( pathBytes )
needleId := types . NeedleId ( item . fileKey )
needleId := types . NeedleId ( item . fileKey )
if _ , found := db . Get ( needleId ) ; ! found {
fmt . Fprintf ( writer , "%s\n" , item . path )
fn ( needleId , item . path )
}
return nil
}
func ( c * commandVolumeFsck ) oneVolumeFileIdsCheckOneVolume ( dataNodeId string , volumeId uint32 , applyPurging bool ) ( err error ) {
if * c . verbose {
fmt . Fprintf ( c . writer , "find missing file chunks in dataNodeId %s volume %d ...\n" , dataNodeId , volumeId )
}
db := needle_map . NewMemDb ( )
defer db . Close ( )
if err = db . LoadFromIdx ( getVolumeFileIdFile ( c . tempFolder , dataNodeId , volumeId ) ) ; err != nil {
return
}
if err = c . readFilerFileIdFile ( volumeId , func ( needleId types . NeedleId , itemPath util . FullPath ) {
if _ , found := db . Get ( needleId ) ; ! found {
fmt . Fprintf ( c . writer , "%s\n" , itemPath )
if applyPurging {
if applyPurging {
// defining the URL this way automatically escapes complex path names
c . httpDelete ( item . path , verbose )
c . httpDelete ( itemPath )
}
}
}
}
} ) ; err != nil {
return
}
}
return nil
return nil
}
}
func ( c * commandVolumeFsck ) httpDelete ( path util . FullPath , verbose bool ) {
func ( c * commandVolumeFsck ) httpDelete ( path util . FullPath ) {
req , err := http . NewRequest ( http . MethodDelete , "" , nil )
req , err := http . NewRequest ( http . MethodDelete , "" , nil )
req . URL = & url . URL {
req . URL = & url . URL {
@ -546,69 +505,66 @@ func (c *commandVolumeFsck) httpDelete(path util.FullPath, verbose bool) {
Host : c . env . option . FilerAddress . ToHttpAddress ( ) ,
Host : c . env . option . FilerAddress . ToHttpAddress ( ) ,
Path : string ( path ) ,
Path : string ( path ) ,
}
}
if verbose {
fmt . Printf ( "full HTTP delete request to be sent: %v\n" , req )
if * c . verbose {
fmt . Fprintf ( c . writer , "full HTTP delete request to be sent: %v\n" , req )
}
}
if err != nil {
if err != nil {
fmt . Errorf ( "HTTP delete request error: %v\n" , err )
fmt . Fprintf ( c . writer , "HTTP delete request error: %v\n" , err )
}
}
client := & http . Client { }
client := & http . Client { }
resp , err := client . Do ( req )
resp , err := client . Do ( req )
if err != nil {
if err != nil {
fmt . Errorf ( "DELETE fetch error: %v\n" , err )
fmt . Fprintf ( c . writer , "DELETE fetch error: %v\n" , err )
}
}
defer resp . Body . Close ( )
defer resp . Body . Close ( )
_ , err = ioutil . ReadAll ( resp . Body )
_ , err = ioutil . ReadAll ( resp . Body )
if err != nil {
if err != nil {
fmt . Errorf ( "DELETE response error: %v\n" , err )
fmt . Fprintf ( c . writer , "DELETE response error: %v\n" , err )
}
}
if verbose {
fmt . Println ( "delete response Status : " , resp . Status )
fmt . Println ( "delete response Headers : " , resp . Header )
if * c . verbose {
fmt . Fprintln ( c . writer , "delete response Status : " , resp . Status )
fmt . Fprintln ( c . writer , "delete response Headers : " , resp . Header )
}
}
}
}
func ( c * commandVolumeFsck ) oneVolumeFileIdsSubtractFilerFileIds ( tempFolder string , dataNodeId string , volumeId uint32 , writer io . Writer , verbose bool ) ( inUseCount uint64 , orphanFileIds [ ] string , orphanDataSize uint64 , err error ) {
func ( c * commandVolumeFsck ) oneVolumeFileIdsSubtractFilerFileIds ( dataNodeId string , volumeId uint32 , collection string ) ( inUseCount uint64 , orphanFileIds [ ] string , orphanDataSize uint64 , err error ) {
db := needle_map . NewMemDb ( )
db := needle_map . NewMemDb ( )
defer db . Close ( )
defer db . Close ( )
if err = db . LoadFromIdx ( getVolumeFileIdFile ( tempFolder , dataNodeId , volumeId ) ) ; err != nil {
if err = db . LoadFromIdx ( getVolumeFileIdFile ( c . tempFolder , dataNodeId , volumeId ) ) ; err != nil {
err = fmt . Errorf ( "failed to LoadFromIdx %+v" , err )
return
return
}
}
filerFileIdsData , err := os . ReadFile ( getFilerFileIdFile ( tempFolder , volumeId ) )
if err != nil {
return
if err = c . readFilerFileIdFile ( volumeId , func ( nId types . NeedleId , itemPath util . FullPath ) {
if err = db . Delete ( nId ) ; err != nil && * c . verbose {
fmt . Fprintf ( c . writer , "failed to nm.delete %s(%+v): %+v" , itemPath , nId , err )
}
}
dataLen := len ( filerFileIdsData )
if dataLen % 8 != 0 {
return 0 , nil , 0 , fmt . Errorf ( "filer data is corrupted" )
}
for i := 0 ; i < len ( filerFileIdsData ) ; i += 8 {
fileKey := util . BytesToUint64 ( filerFileIdsData [ i : i + 8 ] )
db . Delete ( types . NeedleId ( fileKey ) )
inUseCount ++
inUseCount ++
} ) ; err != nil {
err = fmt . Errorf ( "failed to readFilerFileIdFile %+v" , err )
return
}
}
var orphanFileCount uint64
var orphanFileCount uint64
db . AscendingVisit ( func ( n needle_map . NeedleValue ) error {
// fmt.Printf("%d,%x\n", volumeId, n.Key)
orphanFileIds = append ( orphanFileIds , fmt . Sprintf ( "%d,%s00000000" , volumeId , n . Key . String ( ) ) )
if err = db . AscendingVisit ( func ( n needle_map . NeedleValue ) error {
orphanFileIds = append ( orphanFileIds , fmt . Sprintf ( "%s:%d,%s00000000" , collection , volumeId , n . Key . String ( ) ) )
orphanFileCount ++
orphanFileCount ++
orphanDataSize += uint64 ( n . Size )
orphanDataSize += uint64 ( n . Size )
return nil
return nil
} )
} ) ; err != nil {
err = fmt . Errorf ( "failed to AscendingVisit %+v" , err )
return
}
if orphanFileCount > 0 {
if orphanFileCount > 0 {
pct := float64 ( orphanFileCount * 100 ) / ( float64 ( orphanFileCount + inUseCount ) )
pct := float64 ( orphanFileCount * 100 ) / ( float64 ( orphanFileCount + inUseCount ) )
fmt . Fprintf ( writer , "dataNode:%s\tvolume:%d\tentries:%d\torphan:%d\t%.2f%%\t%dB\n" ,
fmt . Fprintf ( c . writer , "dataNode:%s\tvolume:%d\tentries:%d\torphan:%d\t%.2f%%\t%dB\n" ,
dataNodeId , volumeId , orphanFileCount + inUseCount , orphanFileCount , pct , orphanDataSize )
dataNodeId , volumeId , orphanFileCount + inUseCount , orphanFileCount , pct , orphanDataSize )
}
}
@ -623,15 +579,15 @@ type VInfo struct {
isReadOnly bool
isReadOnly bool
}
}
func ( c * commandVolumeFsck ) collectVolumeIds ( commandEnv * CommandEnv , verbose bool , writer io . Writer ) ( volumeIdToServer map [ string ] map [ uint32 ] VInfo , err error ) {
func ( c * commandVolumeFsck ) collectVolumeIds ( ) ( volumeIdToServer map [ string ] map [ uint32 ] VInfo , err error ) {
if verbose {
fmt . Fprintf ( writer , "collecting volume id and locations from master ...\n" )
if * c . verbose {
fmt . Fprintf ( c . writer , "collecting volume id and locations from master ...\n" )
}
}
volumeIdToServer = make ( map [ string ] map [ uint32 ] VInfo )
volumeIdToServer = make ( map [ string ] map [ uint32 ] VInfo )
// collect topology information
// collect topology information
topologyInfo , _ , err := collectTopologyInfo ( commandE nv , 0 )
topologyInfo , _ , err := collectTopologyInfo ( c . e nv, 0 )
if err != nil {
if err != nil {
return
return
}
}
@ -656,17 +612,16 @@ func (c *commandVolumeFsck) collectVolumeIds(commandEnv *CommandEnv, verbose boo
isReadOnly : true ,
isReadOnly : true ,
}
}
}
}
if * c . verbose {
fmt . Fprintf ( c . writer , "dn %+v collected %d volumes and locations.\n" , dataNodeId , len ( volumeIdToServer [ dataNodeId ] ) )
}
}
} )
if verbose {
fmt . Fprintf ( writer , "collected %d volumes and locations.\n" , len ( volumeIdToServer ) )
}
}
} )
return
return
}
}
func ( c * commandVolumeFsck ) purgeFileIdsForOneVolume ( volumeId uint32 , fileIds [ ] string , writer io . Writer ) ( err error ) {
fmt . Fprintf ( writer , "purging orphan data for volume %d...\n" , volumeId )
func ( c * commandVolumeFsck ) purgeFileIdsForOneVolume ( volumeId uint32 , fileIds [ ] string ) ( err error ) {
fmt . Fprintf ( c . writer , "purging orphan data for volume %d...\n" , volumeId )
locations , found := c . env . MasterClient . GetLocations ( volumeId )
locations , found := c . env . MasterClient . GetLocations ( volumeId )
if ! found {
if ! found {
return fmt . Errorf ( "failed to find volume %d locations" , volumeId )
return fmt . Errorf ( "failed to find volume %d locations" , volumeId )
@ -693,7 +648,7 @@ func (c *commandVolumeFsck) purgeFileIdsForOneVolume(volumeId uint32, fileIds []
for results := range resultChan {
for results := range resultChan {
for _ , result := range results {
for _ , result := range results {
if result . Error != "" {
if result . Error != "" {
fmt . Fprintf ( writer , "purge error: %s\n" , result . Error )
fmt . Fprintf ( c . writer , "purge error: %s\n" , result . Error )
}
}
}
}
}
}
@ -701,6 +656,13 @@ func (c *commandVolumeFsck) purgeFileIdsForOneVolume(volumeId uint32, fileIds []
return
return
}
}
func ( c * commandVolumeFsck ) getCollectFilerFilePath ( ) string {
if * c . collection != "" {
return fmt . Sprintf ( "%s/%s" , c . bucketsPath , * c . collection )
}
return "/"
}
func getVolumeFileIdFile ( tempFolder string , dataNodeid string , vid uint32 ) string {
func getVolumeFileIdFile ( tempFolder string , dataNodeid string , vid uint32 ) string {
return filepath . Join ( tempFolder , fmt . Sprintf ( "%s_%d.idx" , dataNodeid , vid ) )
return filepath . Join ( tempFolder , fmt . Sprintf ( "%s_%d.idx" , dataNodeid , vid ) )
}
}